diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 3e78e3fae..3fbae8149 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + python-version: [ "3.9", "3.10", "3.11", "3.12" ] os: [ ubuntu-latest, macos-latest, windows-latest ] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 3a93e3074..23267219a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -3,6 +3,8 @@ on: create: tags: - 'v*' + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: jobs: build: if: ${{ startsWith(github.ref, 'refs/tags/v') }} @@ -98,7 +100,6 @@ jobs: matrix: python-version: - cp39 - - pp38 - pp39 - pp310 build: @@ -138,7 +139,7 @@ jobs: - name: Install Python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.12' - run: rustup target add aarch64-apple-darwin x86_64-apple-darwin if: matrix.build == 'macos' @@ -149,12 +150,13 @@ jobs: - name: Build sdist if: matrix.build == 'linux' && matrix.python-version == 'cp39' run: | - pip install maturin==1.7.1 + pip install maturin==1.7.4 maturin sdist --manifest-path py/Cargo.toml -o wheelhouse - name: Build ${{ matrix.platform || matrix.os }} binaries run: cibuildwheel --output-dir wheelhouse py env: + CIBW_ENABLE: 'pypy' CIBW_BUILD: '${{ matrix.python-version }}-*' # wasmtime doesn't support i686 CIBW_SKIP: '*_i686 *-musllinux* *-win32' @@ -173,6 +175,8 @@ jobs: CIBW_TEST_SKIP: '*-macosx_arm64 *-macosx_universal2:arm64' CIBW_BUILD_VERBOSITY: 1 + MACOSX_DEPLOYMENT_TARGET: '10.12' + - name: Upload artifacts uses: actions/upload-artifact@v4 with: @@ -244,7 +248,7 @@ jobs: path: dist - name: Publish package distributions to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@release/v1.12 
with: #repository-url: https://test.pypi.org/legacy/ skip-existing: true diff --git a/Cargo.lock b/Cargo.lock index b1db82a61..bba491ac7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2981,9 +2981,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.210" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] @@ -3001,9 +3001,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -4358,7 +4358,7 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yara-x" -version = "0.10.0" +version = "0.11.1" dependencies = [ "aho-corasick", "annotate-snippets", @@ -4429,7 +4429,7 @@ dependencies = [ [[package]] name = "yara-x-capi" -version = "0.10.0" +version = "0.11.1" dependencies = [ "cbindgen", "serde_json", @@ -4438,7 +4438,7 @@ dependencies = [ [[package]] name = "yara-x-cli" -version = "0.10.0" +version = "0.11.1" dependencies = [ "anyhow", "ascii_tree", @@ -4472,7 +4472,7 @@ dependencies = [ [[package]] name = "yara-x-fmt" -version = "0.10.0" +version = "0.11.1" dependencies = [ "bitmask", "bstr", @@ -4487,7 +4487,7 @@ dependencies = [ [[package]] name = "yara-x-macros" -version = "0.10.0" +version = "0.11.1" dependencies = [ "darling", "proc-macro2", @@ -4497,7 +4497,7 @@ dependencies = [ [[package]] name = "yara-x-parser" -version = "0.10.0" +version = "0.11.1" dependencies = [ "anyhow", "ascii_tree", @@ -4521,7 +4521,7 @@ dependencies = [ [[package]] name = "yara-x-proto" -version = 
"0.10.0" +version = "0.11.1" dependencies = [ "protobuf", "protobuf-codegen", @@ -4530,7 +4530,7 @@ dependencies = [ [[package]] name = "yara-x-proto-yaml" -version = "0.10.0" +version = "0.11.1" dependencies = [ "chrono", "globwalk", @@ -4545,7 +4545,7 @@ dependencies = [ [[package]] name = "yara-x-py" -version = "0.10.0" +version = "0.11.1" dependencies = [ "protobuf-json-mapping", "pyo3", diff --git a/Cargo.toml b/Cargo.toml index 2446e2ca0..c15b9e385 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace.package] -version = "0.10.0" +version = "0.11.1" authors = ["Victor M. Alvarez "] edition = "2021" homepage = "https://virustotal.github.io/yara-x" @@ -89,8 +89,8 @@ regex-automata = "0.4.7" roxmltree = "0.20.0" rsa = "0.9.6" rustc-hash = "2.0.0" -serde = "=1.0.210" -serde_json = "1.0.132" +serde = "1.0.215" +serde_json = "1.0.133" sha1 = "0.10.6" sha2 = "0.10.8" smallvec = "1.13.2" @@ -105,11 +105,11 @@ x509-parser = "0.16.0" yaml-rust = "0.4.5" yansi = "1.0.1" yara-x = { path = "lib" } -yara-x-fmt = { path = "fmt", version = "0.10.0" } -yara-x-macros = { path = "macros", version = "0.10.0" } -yara-x-parser = { path = "parser", version = "0.10.0" } -yara-x-proto = { path = "proto", version = "0.10.0" } -yara-x-proto-yaml = { path = "proto-yaml", version = "0.10.0" } +yara-x-fmt = { path = "fmt", version = "0.11.1" } +yara-x-macros = { path = "macros", version = "0.11.1" } +yara-x-parser = { path = "parser", version = "0.11.1" } +yara-x-proto = { path = "proto", version = "0.11.1" } +yara-x-proto-yaml = { path = "proto-yaml", version = "0.11.1" } zip = "2.1.1" # Special profile that builds a release binary with link-time optimization. diff --git a/capi/Cargo.toml b/capi/Cargo.toml index b130b6a4e..afec7f4f0 100644 --- a/capi/Cargo.toml +++ b/capi/Cargo.toml @@ -12,7 +12,7 @@ homepage.workspace = true [features] # The `capi` feature is required by `cargo-c`. 
-default = ["capi", "rules-profiling"] +default = ["capi"] capi = [] # When enabled, the serialization of compiled rules include native code for @@ -29,7 +29,7 @@ native-code-serialization = ["yara-x/native-code-serialization"] # Enables rules profiling. # -# This feature is enabled by default. +# This feature is disabled by default. rules-profiling = ["yara-x/rules-profiling"] diff --git a/capi/include/yara_x.h b/capi/include/yara_x.h index 652899701..919137888 100644 --- a/capi/include/yara_x.h +++ b/capi/include/yara_x.h @@ -80,6 +80,9 @@ typedef enum YRX_RESULT { SERIALIZATION_ERROR, // An error returned when a rule doesn't have any metadata. NO_METADATA, + // An error returned in cases where some API is not supported because the + // library was not built with the required features. + NOT_SUPPORTED, } YRX_RESULT; // A compiler that takes YARA source code and produces compiled rules. @@ -708,7 +711,7 @@ enum YRX_RESULT yrx_scanner_set_global_float(struct YRX_SCANNER *scanner, // Iterates over the top N most expensive rules, calling the callback for // each rule. // -// Requires the `rules-profiling` feature. +// Requires the `rules-profiling` feature, otherwise the // // See [`YRX_MOST_EXPENSIVE_RULES_CALLBACK`] for more details. enum YRX_RESULT yrx_scanner_iter_most_expensive_rules(struct YRX_SCANNER *scanner, diff --git a/capi/src/lib.rs b/capi/src/lib.rs index 1442820d5..c52d28fe2 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -152,6 +152,9 @@ pub enum YRX_RESULT { SERIALIZATION_ERROR, /// An error returned when a rule doesn't have any metadata. NO_METADATA, + /// An error returned in cases where some API is not supported because the + /// library was not built with the required features. 
+ NOT_SUPPORTED, } /// Returns the error message for the most recent function in this API diff --git a/capi/src/scanner.rs b/capi/src/scanner.rs index f6aacedd0..704c196af 100644 --- a/capi/src/scanner.rs +++ b/capi/src/scanner.rs @@ -61,11 +61,11 @@ pub unsafe extern "C" fn yrx_scanner_set_timeout( scanner: *mut YRX_SCANNER, timeout: u64, ) -> YRX_RESULT { - if scanner.is_null() { - return YRX_RESULT::INVALID_ARGUMENT; - } + let scanner = match scanner.as_mut() { + Some(s) => s, + None => return YRX_RESULT::INVALID_ARGUMENT, + }; - let scanner = scanner.as_mut().unwrap(); scanner.inner.set_timeout(Duration::from_secs(timeout)); YRX_RESULT::SUCCESS @@ -84,16 +84,16 @@ pub unsafe extern "C" fn yrx_scanner_scan( ) -> YRX_RESULT { _yrx_set_last_error::(None); - if scanner.is_null() { - return YRX_RESULT::INVALID_ARGUMENT; - } + let scanner = match scanner.as_mut() { + Some(s) => s, + None => return YRX_RESULT::INVALID_ARGUMENT, + }; let data = match slice_from_ptr_and_len(data, len) { Some(data) => data, None => return YRX_RESULT::INVALID_ARGUMENT, }; - let scanner = scanner.as_mut().unwrap(); let scan_results = scanner.inner.scan(data); if let Err(err) = scan_results { @@ -178,9 +178,10 @@ pub unsafe extern "C" fn yrx_scanner_set_module_output( data: *const u8, len: usize, ) -> YRX_RESULT { - if scanner.is_null() { - return YRX_RESULT::INVALID_ARGUMENT; - } + let scanner = match scanner.as_mut() { + Some(s) => s, + None => return YRX_RESULT::INVALID_ARGUMENT, + }; let module_name = match CStr::from_ptr(name).to_str() { Ok(name) => name, @@ -195,8 +196,6 @@ pub unsafe extern "C" fn yrx_scanner_set_module_output( None => return YRX_RESULT::INVALID_ARGUMENT, }; - let scanner = scanner.as_mut().unwrap(); - match scanner.inner.set_module_output_raw(module_name, data) { Ok(_) => { _yrx_set_last_error::(None); @@ -216,9 +215,10 @@ unsafe extern "C" fn yrx_scanner_set_global< ident: *const c_char, value: T, ) -> YRX_RESULT { - if scanner.is_null() { - return 
YRX_RESULT::INVALID_ARGUMENT; - } + let scanner = match scanner.as_mut() { + Some(s) => s, + None => return YRX_RESULT::INVALID_ARGUMENT, + }; let ident = match CStr::from_ptr(ident).to_str() { Ok(ident) => ident, @@ -228,8 +228,6 @@ unsafe extern "C" fn yrx_scanner_set_global< } }; - let scanner = scanner.as_mut().unwrap(); - match scanner.inner.set_global(ident, value) { Ok(_) => { _yrx_set_last_error::(None); @@ -327,35 +325,40 @@ pub type YRX_MOST_EXPENSIVE_RULES_CALLBACK = extern "C" fn( /// Iterates over the top N most expensive rules, calling the callback for /// each rule. /// -/// Requires the `rules-profiling` feature. +/// Requires the `rules-profiling` feature, otherwise the /// /// See [`YRX_MOST_EXPENSIVE_RULES_CALLBACK`] for more details. -#[cfg(feature = "rules-profiling")] #[no_mangle] +#[allow(unused_variables)] pub unsafe extern "C" fn yrx_scanner_iter_most_expensive_rules( scanner: *mut YRX_SCANNER, n: usize, callback: YRX_MOST_EXPENSIVE_RULES_CALLBACK, user_data: *mut c_void, ) -> YRX_RESULT { - if scanner.is_null() { - return YRX_RESULT::INVALID_ARGUMENT; - } - - let scanner = scanner.as_ref().unwrap(); + #[cfg(not(feature = "rules-profiling"))] + return YRX_RESULT::NOT_SUPPORTED; + + #[cfg(feature = "rules-profiling")] + { + let scanner = match scanner.as_ref() { + Some(s) => s, + None => return YRX_RESULT::INVALID_ARGUMENT, + }; - for profiling_info in scanner.inner.most_expensive_rules(n) { - let namespace = CString::new(profiling_info.namespace).unwrap(); - let rule = CString::new(profiling_info.rule).unwrap(); + for profiling_info in scanner.inner.most_expensive_rules(n) { + let namespace = CString::new(profiling_info.namespace).unwrap(); + let rule = CString::new(profiling_info.rule).unwrap(); + + callback( + namespace.as_ptr(), + rule.as_ptr(), + profiling_info.pattern_matching_time.as_secs_f64(), + profiling_info.condition_exec_time.as_secs_f64(), + user_data, + ); + } - callback( - namespace.as_ptr(), - rule.as_ptr(), - 
profiling_info.pattern_matching_time.as_secs_f64(), - profiling_info.condition_exec_time.as_secs_f64(), - user_data, - ); + YRX_RESULT::SUCCESS } - - YRX_RESULT::SUCCESS } diff --git a/go/scanner.go b/go/scanner.go index 60be3a79a..e765012d3 100644 --- a/go/scanner.go +++ b/go/scanner.go @@ -247,6 +247,13 @@ func (s *Scanner) Scan(buf []byte) (*ScanResults, error) { return scanResults, err } +// ProfilingInfo contains profiling information about a YARA rule. +// +// For each rule it contains: the rule's namespace, the rule's name, +// the time spent in matching patterns declared by the rule, and the time +// spent evaluating the rule's condition. +// +// See [Scanner.MostExpensiveRules]. type ProfilingInfo struct { Namespace string Rule string @@ -276,16 +283,28 @@ func mostExpensiveRulesCallback( }) } +// MostExpensiveRules returns information about the slowest rules and how much +// time they spent matching patterns and executing their conditions. +// +// In order to use this function the YARA-X C library must be built with +// support for rules profiling, which is done by enabling the `rules-profiling` +// feature. Otherwise, calling this function will cause a panic. 
func (s *Scanner) MostExpensiveRules(n int) []ProfilingInfo { profilingInfo := make([]ProfilingInfo, 0) mostExpensiveRules := cgo.NewHandle(&profilingInfo) defer mostExpensiveRules.Delete() - if C._yrx_scanner_iter_most_expensive_rules( + result := C._yrx_scanner_iter_most_expensive_rules( s.cScanner, C.size_t(n), C.YRX_MOST_EXPENSIVE_RULES_CALLBACK(C.mostExpensiveRulesCallback), - C.uintptr_t(mostExpensiveRules)) != C.SUCCESS { + C.uintptr_t(mostExpensiveRules)) + + if result == C.NOT_SUPPORTED { + panic("MostExpensiveRules requires that the YARA-X C library is built with the `rules-profiling` feature") + } + + if result != C.SUCCESS { panic("yrx_scanner_iter_most_expensive_rules failed") } diff --git a/go/scanner_test.go b/go/scanner_test.go index 25937a639..bd59e2883 100644 --- a/go/scanner_test.go +++ b/go/scanner_test.go @@ -91,18 +91,6 @@ func TestScannerTimeout(t *testing.T) { assert.ErrorIs(t, err, ErrTimeout) } -func TestScannerMostExpensiveRules(t *testing.T) { - r, _ := Compile("rule t { strings: $a = /a(.*)*a/ condition: $a }") - s := NewScanner(r) - _, err := s.Scan(bytes.Repeat([]byte("a"), 5000)) - assert.NoError(t, err) - profilingInfo := s.MostExpensiveRules(1) - assert.Equal(t, "t", profilingInfo[0].Rule) - assert.Equal(t, "default", profilingInfo[0].Namespace) - assert.Greater(t, profilingInfo[0].PatternMatchingTime, time.Duration(0)) - assert.Greater(t, profilingInfo[0].ConditionExecTime, time.Duration(0)) -} - func TestScannerMetadata(t *testing.T) { r, _ := Compile(`rule t { meta: diff --git a/lib/src/modules/cuckoo/mod.rs b/lib/src/modules/cuckoo/mod.rs index 741ed3034..a086384c2 100644 --- a/lib/src/modules/cuckoo/mod.rs +++ b/lib/src/modules/cuckoo/mod.rs @@ -1,312 +1,249 @@ -use std::cell::RefCell; - #[cfg(feature = "logging")] use log::error; -use serde_json::{Map, Value}; use crate::compiler::RegexpId; use crate::modules::prelude::*; use crate::modules::protos::cuckoo::*; +mod schema; #[cfg(test)] mod tests; +use std::cell::RefCell; +use 
std::rc::Rc; thread_local! { - static CUCKOO_REPORT: RefCell>> = const { RefCell::new(None) }; + static LOCAL_DATA: RefCell>> = const { RefCell::new(None) }; } -#[module_main] -fn main(_data: &[u8], meta: Option<&[u8]>) -> Cuckoo { - if let Some(meta) = meta { - match serde_json::from_slice::(meta) { - Ok(Value::Object(json)) => CUCKOO_REPORT.set(Some(json)), - Ok(_) => { - #[cfg(feature = "logging")] - error!("cuckoo report is not a valid JSON") - } - #[cfg(feature = "logging")] - Err(err) => error!("can't parse cuckoo report: {}", err), - #[cfg(not(feature = "logging"))] - Err(_) => {} - } - } - Cuckoo::new() +fn get_local() -> Option> { + LOCAL_DATA.with(|data| data.borrow().clone()) } -#[module_export(name = "network.dns_lookup")] -fn network_dns_lookup(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - let find_match = |objects: &Vec, field_name: &str| { - objects.iter().any(|object| { - object - .get(field_name) - .and_then(|val| val.as_str()) - .map(|val| ctx.regexp_matches(regexp_id, val.as_bytes())) - .unwrap_or(false) - }) - }; +fn set_local(value: schema::CuckooJson) { + LOCAL_DATA.with(|data| { + *data.borrow_mut() = Some(Rc::new(value)); + }); +} - // The top-level object contains a "network" key that contains - // network-related information. - let network = report.as_ref().and_then(|report| report.get("network")); +#[module_main] +fn main(_data: &[u8], meta: Option<&[u8]>) -> Cuckoo { + let parsed = + serde_json::from_slice::(meta.unwrap_or_default()); - // Recent versions of Cuckoo generate domain resolution information with - // this format: - // - // "domains": [ - // { - // "ip": "192.168.0.1", - // "domain": "foo.bar.com" - // } - // ] - // - // But older versions with this other format: - // - // "dns": [ - // { - // "ip": "192.168.0.1", - // "hostname": "foo.bar.com" - // } - // ] - // - // Additionally, the newer versions also have a "dns" field. 
So, let's try - // to locate the "domains" field first, if not found fall back to the older - // format. - if network - .and_then(|report| report.get("domains")) - .and_then(|domains| domains.as_array()) - .map(|domains| find_match(domains, "domain")) - .unwrap_or(false) - { - return true; + match parsed { + Ok(parsed) => { + set_local(parsed); } - - if network - .and_then(|report| report.get("dns")) - .and_then(|dns| dns.as_array()) - .map(|dns| find_match(dns, "hostname")) - .unwrap_or(false) - { - return true; + #[cfg(feature = "logging")] + Err(e) => { + error!("can't parse cuckoo report: {}", e); } + #[cfg(not(feature = "logging"))] + Err(_) => {} + }; - false - }) -} - -enum RequestType { - Get, - Post, - Both, + Cuckoo::new() } -fn http_request( +#[module_export(name = "network.dns_lookup")] +fn network_dns_lookup_r( ctx: &ScanContext, regexp_id: RegexpId, - request_type: RequestType, -) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - report - .as_ref() - .and_then(|report| report.get("network")) - .and_then(|network| network.get("http")) - .and_then(|http| http.as_array()) - .map(|http| { - http.iter().any(|request| { - let req_method = match request - .get("method") - .and_then(|req_method| req_method.as_str()) - { - Some(req_method) => req_method, - None => return false, - }; - - let req_uri = match request - .get("uri") - .and_then(|req_uri| req_uri.as_str()) - { - Some(req_uri) => req_uri, - None => return false, - }; - - match request_type { - RequestType::Get => { - if !req_method.eq_ignore_ascii_case("get") { - return false; - } - } - RequestType::Post => { - if !req_method.eq_ignore_ascii_case("post") { - return false; - } - } - RequestType::Both => { - if !req_method.eq_ignore_ascii_case("get") - && !req_method.eq_ignore_ascii_case("post") - { - return false; - } - } - } - - return ctx.regexp_matches(regexp_id, req_uri.as_bytes()); - }) +) -> Option { + Some( + get_local()? 
+ .network + .domains + .iter() + .flatten() + .filter(|domain| { + ctx.regexp_matches(regexp_id, domain.domain.as_bytes()) }) - .unwrap_or(false) - }) + .count() as _, + ) } #[module_export(name = "network.http_request")] -fn network_http_request(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - http_request(ctx, regexp_id, RequestType::Both) +fn network_http_request_r( + ctx: &ScanContext, + regexp_id: RegexpId, +) -> Option { + Some( + get_local()? + .network + .http + .iter() + .flatten() + .filter(|http| { + http.method.is_some() // ~> is request (is not response) + && ctx.regexp_matches(regexp_id, http.uri.as_bytes()) + }) + .count() as _, + ) } #[module_export(name = "network.http_get")] -fn network_http_get(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - http_request(ctx, regexp_id, RequestType::Get) +fn network_http_get_r(ctx: &ScanContext, regexp_id: RegexpId) -> Option { + Some( + get_local()? + .network + .http + .iter() + .flatten() + .filter(|http| { + http.method + .as_ref() + .map(|method| method.eq_ignore_ascii_case("get")) + .unwrap_or(false) + && ctx.regexp_matches(regexp_id, http.uri.as_bytes()) + }) + .count() as _, + ) } #[module_export(name = "network.http_post")] -fn network_http_post(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - http_request(ctx, regexp_id, RequestType::Post) -} - -#[module_export(name = "network.http_user_agent")] -fn network_http_user_agent(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - report - .as_ref() - .and_then(|report| report.get("network")) - .and_then(|network| network.get("http")) - .and_then(|http| http.as_array()) - .map(|http| { - http.iter() - .filter_map(|request| request.get("user-agent")) - .filter_map(|ua| ua.as_str()) - .any(|ua| ctx.regexp_matches(regexp_id, ua.as_bytes())) +fn network_http_post_r(ctx: &ScanContext, regexp_id: RegexpId) -> Option { + Some( + get_local()? 
+ .network + .http + .iter() + .flatten() + .filter(|http| { + http.method + .as_ref() + .map(|method| method.eq_ignore_ascii_case("post")) + .unwrap_or(false) + && ctx.regexp_matches(regexp_id, http.uri.as_bytes()) }) - .unwrap_or(false) - }) + .count() as _, + ) } -fn network_conn( +#[module_export(name = "network.http_user_agent")] +fn network_http_user_agent_r( ctx: &ScanContext, regexp_id: RegexpId, - conn: &str, - port: i64, -) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - report - .as_ref() - .and_then(|report| report.get("network")) - .and_then(|network| network.get(conn)) - .and_then(|connections| connections.as_array()) - .map(|connections| { - connections.iter().any(|conn| { - let dst_port = match conn - .get("dport") - .and_then(|dst_port| dst_port.as_i64()) - { - Some(dst_port) => dst_port, - None => return false, - }; - - let dst_addr = match conn - .get("dst") - .and_then(|dst_addr| dst_addr.as_str()) - { - Some(dst_addr) => dst_addr, - None => return false, - }; - - dst_port == port - && ctx.regexp_matches(regexp_id, dst_addr.as_bytes()) - }) +) -> Option { + Some( + get_local()? + .network + .http + .iter() + .flatten() + .flat_map(|http| http.user_agent.iter()) + .filter(|user_agent| { + ctx.regexp_matches(regexp_id, user_agent.as_bytes()) }) - .unwrap_or(false) - }) + .count() as _, + ) } #[module_export(name = "network.tcp")] -fn network_tcp(ctx: &ScanContext, regexp_id: RegexpId, port: i64) -> bool { - network_conn(ctx, regexp_id, "tcp", port) +fn network_tcp_ri( + ctx: &ScanContext, + dst_re: RegexpId, + port: i64, +) -> Option { + Some( + get_local()? 
+ .network + .tcp + .iter() + .flatten() + .filter(|tcp| { + tcp.dport == port as u64 + && tcp + .dst + .iter() + .chain(tcp.dst_domain.iter()) + .any(|dst| ctx.regexp_matches(dst_re, dst.as_bytes())) + }) + .count() as _, + ) } #[module_export(name = "network.udp")] -fn network_udp(ctx: &ScanContext, regexp_id: RegexpId, port: i64) -> bool { - network_conn(ctx, regexp_id, "udp", port) +fn network_udp_ri( + ctx: &ScanContext, + dst_re: RegexpId, + port: i64, +) -> Option { + Some( + get_local()? + .network + .udp + .iter() + .flatten() + .filter(|udp| { + udp.dport == port as u64 + && udp + .dst + .iter() + .chain(udp.dst_domain.iter()) + .any(|dst| ctx.regexp_matches(dst_re, dst.as_bytes())) + }) + .count() as _, + ) } #[module_export(name = "network.host")] -fn network_host(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - report - .as_ref() - .and_then(|report| report.get("network")) - .and_then(|network| network.get("hosts")) - .and_then(|hosts| hosts.as_array()) - .map(|hosts| { - hosts - .iter() - .filter_map(|host| host.as_str()) - .any(|host| ctx.regexp_matches(regexp_id, host.as_bytes())) - }) - .unwrap_or(false) - }) +fn network_host_r(ctx: &ScanContext, re: RegexpId) -> Option { + Some( + get_local()? + .network + .hosts + .iter() + .flatten() + .filter(|host| ctx.regexp_matches(re, host.as_bytes())) + .count() as _, + ) } #[module_export(name = "sync.mutex")] -fn sync_mutex(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - report - .as_ref() - .and_then(|report| report.get("behavior")) - .and_then(|behaviour| behaviour.get("summary")) - .and_then(|summary| summary.get("mutexes")) - .and_then(|mutexes| mutexes.as_array()) - .map(|mutexes| { - mutexes - .iter() - .filter_map(|m| m.as_str()) - .any(|m| ctx.regexp_matches(regexp_id, m.as_bytes())) - }) - .unwrap_or(false) - }) +fn sync_mutex_r(ctx: &ScanContext, mutex_re: RegexpId) -> Option { + Some( + get_local()? 
+ .behavior + .summary + .mutexes + .iter() + .flatten() + .filter(|mutex| ctx.regexp_matches(mutex_re, mutex.as_bytes())) + .count() as _, + ) } #[module_export(name = "filesystem.file_access")] -fn filesystem_file_access(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - report - .as_ref() - .and_then(|report| report.get("behavior")) - .and_then(|behaviour| behaviour.get("summary")) - .and_then(|summary| summary.get("files")) - .and_then(|files| files.as_array()) - .map(|files| { - files - .iter() - .filter_map(|file| file.as_str()) - .any(|file| ctx.regexp_matches(regexp_id, file.as_bytes())) - }) - .unwrap_or(false) - }) +fn filesystem_file_access_r( + ctx: &ScanContext, + regexp_id: RegexpId, +) -> Option { + Some( + get_local()? + .behavior + .summary + .files + .iter() + .flatten() + .filter(|file| ctx.regexp_matches(regexp_id, file.as_bytes())) + .count() as _, + ) } #[module_export(name = "registry.key_access")] -fn registry_key_access(ctx: &ScanContext, regexp_id: RegexpId) -> bool { - CUCKOO_REPORT.with_borrow(|report| { - report - .as_ref() - .and_then(|report| report.get("behavior")) - .and_then(|behaviour| behaviour.get("summary")) - .and_then(|summary| summary.get("keys")) - .and_then(|keys| keys.as_array()) - .map(|keys| { - keys.iter() - .filter_map(|key| key.as_str()) - .any(|key| ctx.regexp_matches(regexp_id, key.as_bytes())) - }) - .unwrap_or(false) - }) +fn registry_key_access_r( + ctx: &ScanContext, + regexp_id: RegexpId, +) -> Option { + Some( + get_local()? 
+ .behavior + .summary + .keys + .iter() + .flatten() + .filter(|key| ctx.regexp_matches(regexp_id, key.as_bytes())) + .count() as _, + ) } diff --git a/lib/src/modules/cuckoo/schema.rs b/lib/src/modules/cuckoo/schema.rs new file mode 100644 index 000000000..0502731c1 --- /dev/null +++ b/lib/src/modules/cuckoo/schema.rs @@ -0,0 +1,166 @@ +use std::fmt; + +use serde::de::Error; +use serde::{de::Visitor, Deserialize, Deserializer}; + +#[derive(serde::Deserialize, Debug)] +pub(super) struct DomainJson { + pub domain: String, +} + +#[derive(serde::Deserialize, Debug)] +pub(super) struct HttpJson { + #[serde(rename = "user-agent")] + pub user_agent: Option, + pub method: Option, // string ftw + pub uri: String, +} + +#[derive(serde::Deserialize, Debug)] +pub(super) struct TcpJson { + pub dst: Option, + pub dst_domain: Option, + pub dport: u64, +} + +#[derive(serde::Deserialize, Debug)] +pub(super) struct UdpJson { + pub dst: Option, + pub dst_domain: Option, + pub dport: u64, +} + +#[derive(/* serde::Deserialize, - custom */ Debug)] +pub(super) struct NetworkJson { + pub domains: Option>, + pub http: Option>, + pub tcp: Option>, + pub udp: Option>, + pub hosts: Option>, +} + +#[derive(serde::Deserialize, Debug)] +pub(super) struct SummaryJson { + pub mutexes: Option>, + pub files: Option>, + pub keys: Option>, +} + +#[derive(serde::Deserialize, Debug)] +pub(super) struct BehaviorJson { + pub summary: SummaryJson, +} + +#[derive(serde::Deserialize, Debug)] +pub(super) struct CuckooJson { + pub network: NetworkJson, + pub behavior: BehaviorJson, +} + +impl<'de> Deserialize<'de> for NetworkJson { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct MyVisitor; + + impl<'de> Visitor<'de> for MyVisitor { + type Value = NetworkJson; + + fn expecting(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.write_str("string or object") + } + + fn visit_map(self, mut map: A) -> Result + where + A: serde::de::MapAccess<'de>, + { + // Must 
not parse `old_domains` before the whole map is + // searched if there is a `domains` field, then the value for + // the key `old_domains` should be ignored - specifically, it + // is okay if the `old_domains` does not have the expected + // structure if `domains` is present. + let mut old_domains = None::; + let mut domains = None::; + + let mut http = None::>; + let mut tcp = None::>; + let mut udp = None::>; + let mut hosts = None::>; + + while let Some((key, val)) = + map.next_entry::()? + { + match key.as_str() { + "domains" => { + domains = Some(val); + } + "dns" => { + if domains.is_some() { + continue; // prefer "domains" over "dns" + } + old_domains = Some(val); + } + "http" => { + http = Some( + Deserialize::deserialize(val) + .map_err(Error::custom)?, + ); + } + "tcp" => { + tcp = Some( + Deserialize::deserialize(val) + .map_err(Error::custom)?, + ); + } + "udp" => { + udp = Some( + Deserialize::deserialize(val) + .map_err(Error::custom)?, + ); + } + "hosts" => { + hosts = Some( + Deserialize::deserialize(val) + .map_err(Error::custom)?, + ); + } + _ => {} + } + } + + #[derive(serde::Deserialize, Debug)] + struct OldDomainJson { + pub hostname: String, + } + + let domains: Option> = + match (domains, old_domains) { + (Some(domains), _) => { + Deserialize::deserialize(domains) + .map_err(Error::custom)? 
+ } + (None, Some(old_domains)) => { + let old_domains: Vec = + Deserialize::deserialize(old_domains) + .map_err(Error::custom)?; + + Some( + old_domains + .into_iter() + .map(|old| DomainJson { + domain: old.hostname, + }) + .collect(), + ) + } + (None, None) => None, // domains field is optional + }; + + Ok(NetworkJson { domains, http, tcp, udp, hosts }) + } + } + + deserializer.deserialize_any(MyVisitor) + } +} diff --git a/lib/src/modules/pe/parser.rs b/lib/src/modules/pe/parser.rs index b0585504e..f268ddc91 100644 --- a/lib/src/modules/pe/parser.rs +++ b/lib/src/modules/pe/parser.rs @@ -1752,9 +1752,9 @@ impl<'a> PE<'a> { /// arrays equivalent to the INT and IAT. /// /// Another differences between ordinal and delayed imports is that in - /// in delayed imports the INT and IAT can contain virtual addresses - /// instead of relative virtual address (RVAs). Whether they contain one - /// or the other depends on a bit in the `attributes` field in the + /// delayed imports the INT and IAT can contain virtual addresses instead + /// of relative virtual address (RVAs). Whether they contain one or the + /// other depends on a bit in the `attributes` field in the /// IMAGE_DELAYLOAD_DESCRIPTOR structure. fn parse_import_impl

( &self, @@ -1789,6 +1789,8 @@ impl<'a> PE<'a> { }), ); + let mut num_imported_funcs = 0; + for mut descriptor in import_descriptors.take(Self::MAX_PE_IMPORTS) { // If the values in the descriptor are virtual addresses, convert // them to relative virtual addresses (RVAs) by subtracting the @@ -1900,10 +1902,11 @@ impl<'a> PE<'a> { } if !funcs.is_empty() { + num_imported_funcs += funcs.len(); imported_funcs.push((dll_name, funcs)); } - if imported_funcs.len() >= Self::MAX_PE_IMPORTS { + if num_imported_funcs >= Self::MAX_PE_IMPORTS { break; } } diff --git a/py/README.md b/py/README.md index 3114bb608..17a32f7e6 100644 --- a/py/README.md +++ b/py/README.md @@ -6,7 +6,7 @@ ![GitHub Repo stars](https://img.shields.io/github/stars/VirusTotal/yara-x) The official Python library for [YARA-X](https://virustotal.github.io/yara-x). -Supports Python 3.8+ in Linux, MacOS and Windows. +Supports Python 3.9+ in Linux, MacOS and Windows. ```python import yara_x diff --git a/py/pyproject.toml b/py/pyproject.toml index 6065eb583..36b8fd16e 100644 --- a/py/pyproject.toml +++ b/py/pyproject.toml @@ -1,11 +1,11 @@ [build-system] -requires = ["maturin>=1.0,<2.0"] +requires = ["maturin==1.7.4"] build-backend = "maturin" [project] name = "yara-x" description = "Python bindings for YARA-X" -requires-python = ">=3.8" +requires-python = ">=3.9" readme = "README.md" keywords = ["pattern-matching", "cybersecurity", "forensics", "malware", "yara"] classifiers = [ diff --git a/site/content/blog/rules-profiling/index.md b/site/content/blog/rules-profiling/index.md new file mode 100644 index 000000000..1c45b9840 --- /dev/null +++ b/site/content/blog/rules-profiling/index.md @@ -0,0 +1,91 @@ +--- +title: "Profiling your YARA rules" +description: "How to obtain information about the performance of your YARA rules" +summary: "" +date: 2024-11-26T00:00:00+01:00 +lastmod: 2024-11-26T00:00:00+01:00 +draft: false +weight: 50 +categories: [ ] +tags: [ ] +contributors: [ "Victor M. 
Alvarez" ] +pinned: false +homepage: false +seo: + title: "Rules profiling" # custom title (optional) + description: "Describes the new rules profiling feature introduced in YARA-X 0.11.0" # custom description (recommended) + canonical: "" # custom canonical URL (optional) + noindex: false # false (default) or true +--- + +Not all YARA rules perform equally; some can significantly slow down scanning +throughput. When working with a large set of rules, identifying which ones are +causing performance bottlenecks can be challenging, especially without the right +tools. + +To address this, YARA-X 0.11.0 introduces a new feature designed to streamline +the process of identifying slow rules: the `--profiling` option for +the `yr scan` +command. + +## Enabling rules profiling + +Because this feature incurs a slight performance overhead, it is disabled by +default. To use it, you must build YARA-X with profiling support enabled. This +can be done using the following command: + +```shell +cargo build --release --features=rules-profiling +``` + +Once built with profiling support, you can activate the feature by adding the +`--profiling` flag to the scan command. For example: + +```shell +yr scan --profiling my_rules.yar target_file +``` + +## How it works + +When the `--profiling` option is used, the `scan` command will operate as usual +while also collecting performance data for your rules. After the scan is +complete, +the profiling results will be displayed, highlighting the slowest rules and +their execution times. 
A sample output is shown below: + +``` +«««««««««««« PROFILING INFORMATION »»»»»»»»»»»» + +Slowest rules: + +* rule : another_slow_rule + namespace : default + pattern matching : 5.790941033s + condition evaluation : 10.329µs + TOTAL : 5.790963123s + +* rule : some_slow_rule + namespace : default + pattern matching : 21.433µs + condition evaluation : 2.429054588s + TOTAL : 2.429076021s +``` + +The profiling output lists the slowest-performing rules, ordered by total +execution time in descending order (the slowest rule appears first). Each +rule's performance is broken down into two components: + +* Pattern matching time: The time spent searching for patterns specified in the + rule. +* Condition evaluation time: The time spent evaluating the rule's conditions. + +By reporting these metrics separately, the profiling feature helps you determine +whether a rule's slowness is due to inefficient pattern matching or complex +condition evaluation. + +Rules with a total execution time below 100ms are excluded from the profiling +report to keep the output concise. If no rules meet the threshold, the profiling +section will remain empty, indicating that your rules are efficiently optimized. + +This new feature empowers users to fine-tune their rule sets by identifying and +addressing performance bottlenecks with ease. I hope you find it useful. \ No newline at end of file diff --git a/site/content/docs/api/python.md b/site/content/docs/api/python.md index 99f0f58e7..80a8c0875 100644 --- a/site/content/docs/api/python.md +++ b/site/content/docs/api/python.md @@ -22,7 +22,7 @@ Python is a popular language among YARA users. They use Python for all kinds of automation tasks, and the YARA-X ecosystem wouldn't be complete without the possibility of using it from Python programs. -YARA-X offers support for Python 3.8 or later, in Linux, MacOS and Windows. +YARA-X offers support for Python 3.9 or later, in Linux, MacOS and Windows. 
## Installation diff --git a/site/hugo_stats.json b/site/hugo_stats.json index fd89f49f7..ccca8bba8 100644 --- a/site/hugo_stats.json +++ b/site/hugo_stats.json @@ -387,6 +387,9 @@ "dyn", "dyntype", "dysymtab", + "enabling-rules-profiling", + "enabling-the-profiling-feature", + "enabling-the-rules-profiling", "entitlement_hash", "entropyoffset-size", "entropystring", @@ -398,6 +401,7 @@ "example-10", "example-11", "example-12", + "example-13", "example-2", "example-3", "example-4", @@ -446,6 +450,7 @@ "hexinteger", "hexmessage-integer", "higher-overall-performance", + "how-it-works", "identifier", "identifier-1", "imphash", @@ -579,6 +584,7 @@ "stricter-escaped-characters-in-regular-expressions", "subsystem", "sym", + "sym_hash", "symbind", "symtab", "symtype",