diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 8c2eb4ed..379f7b95 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -31,15 +31,15 @@ jobs: run: pip install pytest pytest-cov - name: Install kestrel [all packages] run: make install - - name: Coverage for kestrel_core and kestrel_datasource_stixbundle + - name: Coverage for kestrel_core working-directory: ./packages/kestrel_core - run: pytest -vv --cov-report=xml --cov=kestrel --cov=kestrel_datasource_stixbundle - - name: Coverage for kestrel_datasource_stixshifter - working-directory: ./packages/kestrel_datasource_stixshifter - run: pytest -vv --cov-report=xml --cov=kestrel_datasource_stixshifter - - name: Coverage for kestrel_analytics_python - working-directory: ./packages/kestrel_analytics_python - run: pytest -vv --cov-report=xml --cov=kestrel_analytics_python + run: pytest -vv --cov-report=xml --cov=kestrel + - name: Coverage for kestrel_interface_opensearch + working-directory: ./packages/kestrel_interface_opensearch + run: pytest -vv --cov-report=xml --cov=kestrel_interface_opensearch + - name: Coverage for kestrel_interface_sqlalchemy + working-directory: ./packages/kestrel_interface_sqlalchemy + run: pytest -vv --cov-report=xml --cov=kestrel_interface_sqlalchemy - name: Coverage for kestrel_jupyter working-directory: ./packages/kestrel_jupyter run: pytest -vv --cov-report=xml $(ls src | grep -v '.egg-info' | xargs | sed -r 's/^| / --cov=/g') @@ -47,5 +47,5 @@ jobs: uses: codecov/codecov-action@v3 with: fail_ci_if_error: false - files: ./packages/kestrel_core/coverage.xml,./packages/kestrel_datasource_stixshifter/coverage.xml,./packages/kestrel_analytics_python/coverage.xml,./packages/kestrel_jupyter/coverage.xml + files: ./packages/kestrel_core/coverage.xml,./packages/kestrel_interface_opensearch/coverage.xml,./packages/kestrel_interface_sqlalchemy/coverage.xml,./packages/kestrel_jupyter/coverage.xml verbose: true diff 
--git a/Makefile b/Makefile index dabc3ad2..d83f4459 100644 --- a/Makefile +++ b/Makefile @@ -5,23 +5,15 @@ kestrel_core: cd packages/kestrel_core; pip install . ## Install STIX bundle data source package -kestrel_datasource_stixbundle: kestrel_core - cd packages/kestrel_datasource_stixbundle; pip install . +kestrel_interface_opensearch: kestrel_core + cd packages/kestrel_interface_opensearch; pip install . ## Install STIX-Shifter data source package -kestrel_datasource_stixshifter: kestrel_core - cd packages/kestrel_datasource_stixshifter; pip install . - -## Install docker analytics -kestrel_analytics_docker: kestrel_core - cd packages/kestrel_analytics_docker; pip install . - -## Install python analytics -kestrel_analytics_python: kestrel_core - cd packages/kestrel_analytics_python; pip install . +kestrel_interface_sqlalchemy: kestrel_core + cd packages/kestrel_interface_sqlalchemy; pip install . ## Install Kestrel kernel for Jupyter -kestrel_jupyter: kestrel_datasource_stixbundle kestrel_datasource_stixshifter kestrel_analytics_docker kestrel_analytics_python +kestrel_jupyter: kestrel_interface_opensearch kestrel_interface_sqlalchemy cd packages/kestrel_jupyter; pip install .; kestrel_jupyter_setup ## Install Kestrel kernel for Jupyter diff --git a/packages/kestrel_core/src/kestrel/config/kestrel.yaml b/packages/kestrel_core/src/kestrel/config/kestrel.yaml index ccdd38b1..328bbffa 100644 --- a/packages/kestrel_core/src/kestrel/config/kestrel.yaml +++ b/packages/kestrel_core/src/kestrel/config/kestrel.yaml @@ -1,70 +1,26 @@ # syntax default values language: - default_variable: "_" default_sort_order: "desc" - default_datasource_schema: "stixshifter" - default_analytics_schema: "python" - -# how a Kestrel session is executed -session: - cache_directory_prefix: "kestrel-session-" # under system temp directory - local_database_path: "local.db" - log_path: "session.log" - show_execution_summary: true - -# whether/how to prefetch all records/observations for entities 
-prefetch: - - # enable/disable prefetch for command - # - # If prefetch is enabled, Kestrel will send additional queries to the data - # source to search for related records regarding entities retrieved from the - # user-specified pattern, collecting more complete information (attributes, - # connections to other entities) of the entities from different records. - switch_per_command: - get: true - find: true - - # declare the list of entity types to not prefetch - # - # This can be used when a user finds prefetch hinders the performance with - # large amount of results for one or more generic type of entities. For - # example, the data source may have millions of records containing - # `C:\Windows\SYSTEM32\ntdll.dll` touched by all Windows processes in a short - # amount of time. Executing a Kestrel command `f = FIND file LINKED p` will - # retrieve the file from a process and then start prefetch to gain - # information/connections of the file from all processes. Retrieval of - # millions records will likely result in a performance issue, thus the user - # can put `file` in this list to disable prefetch for it. - excluded_entities: - - - # - file - # - user-account - # - x-oca-asset - - # Detailed logic to identify the same process from different records is more - # complex than many data source query language can express, so Kestrel - # retrieves potential same process candidate records and perform fine-grained - # process identification in Kestrel with these parameters. 
- process_identification: - pid_but_name_changed_time_begin_offset: -5 # seconds - pid_but_name_changed_time_end_offset: 5 # seconds - pid_and_name_time_begin_offset: -3600 # seconds - pid_and_name_time_end_offset: 3600 # seconds - pid_and_ppid_time_begin_offset: -3600 # seconds - pid_and_ppid_time_end_offset: 3600 # seconds - pid_and_name_and_ppid_time_begin_offset: -86400 # seconds - pid_and_name_and_ppid_time_end_offset: 86400 # seconds - -# option when generating STIX query -stixquery: - timerange_start_offset: -300 # seconds - timerange_stop_offset: 300 # seconds - support_id: false # STIX 2.0 does not support unique ID # debug options debug: env_var: "KESTREL_DEBUG" # debug mode if the environment variable exists - cache_directory_prefix: "kestrel-" # under system temp directory - session_exit_marker: "session.exited" - maximum_exited_session: 3 + cache_directory_path: "~/kestrel-debug-session" # put in user's home directory by default + +# default identifier attribute(s) of an entity across all datasource interfaces +# if multiple attributes are specified, they are combined with logical AND +# each datasource interface config can define the same section to override this +entity_identifier: + file: "hashes[?algorithm_id == 3]" # sha256 + group: uid + process: uid + endpoint: ip + device: ip + src_endpoint: + - ip + - port + dst_endpoint: + - ip + - port + certificate: serial_number + user: uid diff --git a/packages/kestrel_core/src/kestrel/config/utils.py b/packages/kestrel_core/src/kestrel/config/utils.py index 0b912e7a..90472e91 100644 --- a/packages/kestrel_core/src/kestrel/config/utils.py +++ b/packages/kestrel_core/src/kestrel/config/utils.py @@ -6,6 +6,7 @@ from typing import Mapping, Union from kestrel.utils import update_nested_dict, load_data_file +from kestrel.exceptions import InvalidYamlInConfig CONFIG_DIR_DEFAULT = Path.home() / ".config" / "kestrel" CONFIG_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "kestrel.yaml" @@ -14,6 +15,27 @@ _logger = 
logging.getLogger(__name__) +@typechecked +def load_leaf_yaml(config: Mapping, path_dir: str) -> Mapping:  # returns a new dict in which each "*.yaml" string leaf is replaced by the parsed content of that file; relative paths resolve against path_dir + new = {} + for k, v in config.items(): + if isinstance(v, Mapping): + new[k] = load_leaf_yaml(v, path_dir) + elif isinstance(v, str) and v.endswith(".yaml"): + try: + if os.path.isabs(v): + with open(v, "r") as fp: + new[k] = yaml.safe_load(fp.read()) + else: + with open(os.path.join(path_dir, v), "r") as fp: + new[k] = yaml.safe_load(fp.read()) + except Exception as err:  # not a bare except: KeyboardInterrupt/SystemExit must propagate unchanged + raise InvalidYamlInConfig(v) from err  # keep the root cause (missing file, YAML syntax error) as __cause__ + else: + new[k] = v + return new + + @typechecked def load_default_config() -> Mapping: _logger.debug(f"Loading default config file...") @@ -36,13 +58,14 @@ def load_user_config( with open(config_path, "r") as fp: _logger.debug(f"User configuration file found: {config_path}") config = yaml.safe_load(os.path.expandvars(fp.read())) + config = load_leaf_yaml(config, os.path.dirname(config_path)) except FileNotFoundError: _logger.debug(f"User configuration file not exist.") return config @typechecked -def load_config() -> Mapping: +def load_kestrel_config() -> Mapping: config_default = load_default_config() config_user = load_user_config(CONFIG_PATH_ENV_VAR, CONFIG_PATH_DEFAULT) _logger.debug(f"User configuration loaded: {config_user}") diff --git a/packages/kestrel_core/src/kestrel/exceptions.py b/packages/kestrel_core/src/kestrel/exceptions.py index cd088afe..837ca233 100644 --- a/packages/kestrel_core/src/kestrel/exceptions.py +++ b/packages/kestrel_core/src/kestrel/exceptions.py @@ -22,6 +22,10 @@ class InvalidDataSource(KestrelError): pass +class InvalidYamlInConfig(KestrelError): + pass + + class VariableNotFound(KestrelError): pass diff --git a/packages/kestrel_core/src/kestrel/frontend/compile.py b/packages/kestrel_core/src/kestrel/frontend/compile.py index cb1f897f..6bf12706 100644 --- a/packages/kestrel_core/src/kestrel/frontend/compile.py +++ b/packages/kestrel_core/src/kestrel/frontend/compile.py @@ -1,7 +1,7 @@ # Lark Transformer import logging -from datetime import 
datetime, timedelta +from datetime import datetime, timedelta, timezone from functools import reduce from dateutil.parser import parse as to_datetime @@ -158,14 +158,12 @@ def _add_reference_branches_for_filter(graph: IRGraph, filter_node: Filter): class _KestrelT(Transformer): def __init__( self, - default_variable=DEFAULT_VARIABLE, default_sort_order=DEFAULT_SORT_ORDER, token_prefix="", entity_map={}, property_map={}, ): # token_prefix is the modification by Lark when using `merge_transformers()` - self.default_variable = default_variable self.default_sort_order = default_sort_order self.token_prefix = token_prefix self.entity_map = entity_map @@ -352,7 +350,7 @@ def timespan_relative(self, args): delta = timedelta(minutes=num) elif unit == "SECOND": delta = timedelta(seconds=num) - stop = datetime.utcnow() + stop = datetime.now(timezone.utc) start = stop - delta return TimeRange(start, stop) diff --git a/packages/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml b/packages/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml index d4a1bf75..6d485ef5 100644 --- a/packages/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml +++ b/packages/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml @@ -147,11 +147,6 @@ endpoint: - source.mac - server.mac - destination.mac - port: - - client.port - - source.port - - server.port - - destination.port # dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint diff --git a/packages/kestrel_core/tests/test_config.py b/packages/kestrel_core/tests/test_config.py index 2fcec65a..00f9e0fd 100644 --- a/packages/kestrel_core/tests/test_config.py +++ b/packages/kestrel_core/tests/test_config.py @@ -1,4 +1,4 @@ -import kestrel.config.utils as cfg +from kestrel.config.utils import load_kestrel_config import os @@ -15,7 +15,7 @@ def test_env_vars_in_config(): with open(os.getenv("KESTREL_CONFIG"), "w") as fp: fp.write(test_config) - config = cfg.load_config() + config = load_kestrel_config() assert 
config["credentials"]["username"] == "test-user" assert config["credentials"]["password"] == "test-password" @@ -35,7 +35,7 @@ def test_env_vars_in_config_overwrite(): os.environ["KESTREL_CACHE_DIRECTORY_PREFIX"] = "Kestrel2.0-" with open(os.getenv("KESTREL_CONFIG"), "w") as fp: fp.write(test_config) - config = cfg.load_config() + config = load_kestrel_config() assert config["credentials"]["username"] == "test-user" assert config["credentials"]["password"] == "test-password" assert config["debug"]["cache_directory_prefix"] == "Kestrel2.0-" @@ -54,7 +54,28 @@ def test_empty_env_var_in_config(): os.environ["KESTREL_CACHE_DIRECTORY_PREFIX"] = "Kestrel2.0-" with open(os.getenv("KESTREL_CONFIG"), "w") as fp: fp.write(test_config) - config = cfg.load_config() + config = load_kestrel_config() assert config["credentials"]["username"] == "test-user" assert config["credentials"]["password"] == "test-password" - assert config["debug"]["cache_directory_prefix"] == "$I_DONT_EXIST" \ No newline at end of file + assert config["debug"]["cache_directory_prefix"] == "$I_DONT_EXIST" + +def test_yaml_load_in_config(tmp_path): + test_config = """--- +credentials: + username: ${TEST_USER} + password: ${TEST_PASSWORD} +loadtest: + xyz: + abc: abc.yaml + """ + os.environ["TEST_USER"] = "test-user" + os.environ["TEST_PASSWORD"] = "test-password" + os.environ["KESTREL_CONFIG"] = os.path.join(tmp_path, "config.yaml") + with open(os.getenv("KESTREL_CONFIG"), "w") as fp: + fp.write(test_config) + with open(os.path.join(tmp_path, "abc.yaml"), "w") as fp: + fp.write("test: fake-value") + config = load_kestrel_config() + assert config["credentials"]["username"] == "test-user" + assert config["credentials"]["password"] == "test-password" + assert config["loadtest"]["xyz"]["abc"]["test"] == "fake-value" diff --git a/packages/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py b/packages/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py index 
c001295f..04467fdd 100644 --- a/packages/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py +++ b/packages/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py @@ -1,13 +1,13 @@ import logging -from dataclasses import dataclass, field -from typing import Dict, Mapping, Optional +from dataclasses import dataclass +from typing import Dict, Optional -import yaml from mashumaro.mixins.json import DataClassJSONMixin from kestrel.config.utils import ( CONFIG_DIR_DEFAULT, load_user_config, + load_kestrel_config, ) from kestrel.exceptions import InterfaceNotConfigured from kestrel.mapping.data_model import load_default_mapping @@ -41,14 +41,11 @@ class DataSource(DataClassJSONMixin): index_pattern: str timestamp: str timestamp_format: str - data_model_mapping: Optional[str] = None # Filename for mapping - data_model_map: Mapping = field(default_factory=dict) + data_model_map: Optional[Dict] = None + entity_identifier: Optional[Dict] = None def __post_init__(self): - if self.data_model_mapping: - with open(self.data_model_mapping, "r") as fp: - self.data_model_map = yaml.safe_load(fp) - else: + if not self.data_model_map: # Default to the built-in ECS mapping self.data_model_map = load_default_mapping("ecs") @@ -65,6 +62,16 @@ def __post_init__(self): def load_config(): try: - return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + interface_config = Config( + **load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT) + ) + + # load default entity identifier from main Kestrel config + kestrel_config = load_kestrel_config() + for ds in interface_config.datasources.values(): + if not ds.entity_identifier: + ds.entity_identifier = kestrel_config["entity_identifier"] + + return interface_config except TypeError: raise InterfaceNotConfigured() diff --git a/packages/kestrel_interface_opensearch/tests/test_config.py b/packages/kestrel_interface_opensearch/tests/test_config.py index 7362a891..b17cc49b 100644 --- 
a/packages/kestrel_interface_opensearch/tests/test_config.py +++ b/packages/kestrel_interface_opensearch/tests/test_config.py @@ -35,7 +35,7 @@ def test_load_config(tmp_path): "index_pattern": "logs-*", "timestamp": "@timestamp", "timestamp_format": "%Y-%m-%d %H:%M:%S.%f", - "data_model_mapping": str(tmp_path / "mapping.yaml") + "data_model_map": "mapping.yaml" } } } @@ -51,3 +51,5 @@ def test_load_config(tmp_path): assert conn.url == config["connections"]["localhost"]["url"] assert read_config.connections["localhost"].url == config["connections"]["localhost"]["url"] assert read_config.datasources["some_ds"].index_pattern == config["datasources"]["some_ds"]["index_pattern"] + assert read_config.datasources["some_ds"].data_model_map["some.field"] == "other.field" + assert read_config.datasources["some_ds"].entity_identifier["process"] == "uid" diff --git a/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py b/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py index fba0f182..bb59fef0 100644 --- a/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py +++ b/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py @@ -1,13 +1,13 @@ import logging -from dataclasses import dataclass, field -from typing import Dict, Mapping, Optional +from dataclasses import dataclass +from typing import Dict, Optional -import yaml from mashumaro.mixins.json import DataClassJSONMixin from kestrel.config.utils import ( CONFIG_DIR_DEFAULT, load_user_config, + load_kestrel_config, ) from kestrel.exceptions import InterfaceNotConfigured from kestrel.mapping.data_model import load_default_mapping @@ -30,14 +30,11 @@ class DataSource(DataClassJSONMixin): table: str timestamp: str timestamp_format: str - data_model_mapping: Optional[str] = None # Filename for mapping - data_model_map: Mapping = field(default_factory=dict) + data_model_map: Optional[Dict] = None + entity_identifier: 
Optional[Dict] = None def __post_init__(self): - if self.data_model_mapping: - with open(self.data_model_mapping, "r") as fp: - self.data_model_map = yaml.safe_load(fp) - else: + if not self.data_model_map: # Default to the built-in ECS mapping self.data_model_map = load_default_mapping("ecs") # FIXME: need a default? @@ -54,6 +51,16 @@ def __post_init__(self): def load_config(): try: - return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + interface_config = Config( + **load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT) + ) + + # load default entity identifier from main Kestrel config + kestrel_config = load_kestrel_config() + for ds in interface_config.datasources.values(): + if not ds.entity_identifier: + ds.entity_identifier = kestrel_config["entity_identifier"] + + return interface_config except TypeError: raise InterfaceNotConfigured() diff --git a/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/example.yaml b/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/example.yaml index b3e465f7..864a4d68 100644 --- a/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/example.yaml +++ b/packages/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/example.yaml @@ -18,4 +18,5 @@ datasources: table: my_events # actual SQL table name timestamp: "@timestamp" timestamp_format: "%Y-%m-%d %H:%M:%S.%fZ" - data_model_mapping: "my_events_mapping.yaml" + data_model_map: "my_events_mapping.yaml" + entity_identifier: "my_events_entity_id.yaml" # overrides the default entity_identifier from kestrel.yaml diff --git a/packages/kestrel_interface_sqlalchemy/tests/test_config.py b/packages/kestrel_interface_sqlalchemy/tests/test_config.py index 0c85e82e..e7fa42c9 100644 --- a/packages/kestrel_interface_sqlalchemy/tests/test_config.py +++ b/packages/kestrel_interface_sqlalchemy/tests/test_config.py @@ -8,6 +8,29 @@ load_config, ) +def test_load_config_w_default_map(tmp_path): + config = { + "connections": { + 
"some-data-lake": { + "url": "presto://jdoe@example.com:8889/hive", + } + }, + "datasources": { + "cloud_table": { + "connection": "some-data-lake", + "table": "cloud_table", + "timestamp": "eventTime", + "timestamp_format": "%Y-%m-%d %H:%M:%S.%f", + } + } + } + config_file = tmp_path / "sqlalchemy.yaml" + with open(config_file, 'w') as fp: + yaml.dump(config, fp) + os.environ[PROFILE_PATH_ENV_VAR] = str(config_file) + read_config = load_config() + assert read_config.datasources["cloud_table"].data_model_map["process"]["name"] == "process.name" + assert read_config.datasources["cloud_table"].entity_identifier["process"] == "uid" def test_load_config(tmp_path): config = { @@ -25,13 +48,17 @@ def test_load_config(tmp_path): "table": "cloud_table", "timestamp": "eventTime", "timestamp_format": "%Y-%m-%d %H:%M:%S.%f", - "data_model_mapping": str(tmp_path / "mapping.yaml") + "data_model_map": str(tmp_path / "mapping.yaml"), + "entity_identifier": "eid.yaml" } } } map_file = tmp_path / "mapping.yaml" with open(map_file, 'w') as fp: fp.write("some.field: other.field\n") + eid_file = tmp_path / "eid.yaml" + with open(eid_file, 'w') as fp: + fp.write("process: pid\n") config_file = tmp_path / "sqlalchemy.yaml" with open(config_file, 'w') as fp: yaml.dump(config, fp) @@ -42,3 +69,5 @@ def test_load_config(tmp_path): assert read_config.connections["localhost"].url == config["connections"]["localhost"]["url"] assert read_config.datasources["cloud_table"].timestamp == config["datasources"]["cloud_table"]["timestamp"] assert read_config.datasources["cloud_table"].table == config["datasources"]["cloud_table"]["table"] + assert read_config.datasources["cloud_table"].data_model_map["some.field"] == "other.field" + assert read_config.datasources["cloud_table"].entity_identifier["process"] == "pid"