Skip to content

Commit

Permalink
Merge pull request #446 from crocs-muni/feat/full-dset-archive-download
Browse files Browse the repository at this point in the history
Add a way to download the full dataset archive (including PDFs) from the web.
  • Loading branch information
J08nY authored Oct 18, 2024
2 parents 17af7f6 + e7ba5ef commit 2a3d45c
Show file tree
Hide file tree
Showing 7 changed files with 339 additions and 74 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ df_2015_and_newer = df.loc[df.year_from > 2014]
df.year_from.value_counts().sort_index().plot.line()
```

<!-- ## Authors
## Authors

This work is being done at [CRoCS MUNI](https://crocs.fi.muni.cz/) by Adam Janovsky, Jan Jancar, Petr Svenda, Jiri Michalik, Lukasz Chmielewski and other contributors. This work was supported by the Internal grant agency of Masaryk University, CZ.02.2.69/0.0/0.0/19_073/0016943.

![](docs/_static/logolink_OP_VVV_hor_barva_eng.jpg) -->
![](docs/_static/logolink_OP_VVV_hor_barva_eng.jpg)
46 changes: 34 additions & 12 deletions notebooks/fips/vulnerabilities.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 1,
"id": "41674b9c",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-10-17T09:37:51.724995Z",
"start_time": "2024-10-17T09:37:51.033775Z"
}
},
"outputs": [],
"source": [
"from sec_certs.dataset.fips import FIPSDataset\n",
Expand All @@ -21,26 +26,43 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"id": "5ee5dca5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading FIPS dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 107M/107M [00:12<00:00, 8.88MB/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:10<00:00, 1.95it/s]\n",
"Building CVEDataset from jsons: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:15<00:00, 1.37it/s]\n",
"parsing cpe matching (by NIST) dictionary: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400508/400508 [00:26<00:00, 15291.66it/s]\n",
"Building-up lookup dictionaries for fast CVE matching: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 198239/198239 [00:09<00:00, 20313.13it/s]\n"
"Downloading FIPS Dataset: 61.7MB [00:12, 5.09MB/s]\n"
]
}
],
"source": [
"dset = FIPSDataset.from_web_latest()\n",
"cve_dset: CVEDataset = dset._prepare_cve_dataset()\n",
"cpe_dset: CPEDataset = dset._prepare_cpe_dataset()"
"#cve_dset: CVEDataset = dset._prepare_cve_dataset()\n",
"#cpe_dset: CPEDataset = dset._prepare_cpe_dataset()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "303824ee-a101-492d-8505-3e1f96a04d69",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/this/is/dummy/nonexisting/path/auxiliary_datasets/cpe_dataset.json')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dset.cpe_dataset_path"
]
},
{
Expand Down Expand Up @@ -568,7 +590,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.13 ('venv': venv)",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -582,7 +604,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
"version": "3.12.6"
},
"vscode": {
"interpreter": {
Expand Down
48 changes: 35 additions & 13 deletions src/sec_certs/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ class Configuration(BaseSettings):
description=" During validation we don't connect certificates with validation dates difference higher than _this_.",
)
n_threads: int = Field(
-1, description="How many threads to use for parallel computations. Set to -1 to use all logical cores.", ge=-1
-1,
description="How many threads to use for parallel computations. Set to -1 to use all logical cores.",
ge=-1,
)
cpe_matching_threshold: int = Field(
92,
Expand All @@ -40,12 +42,18 @@ class Configuration(BaseSettings):
le=100,
)
cpe_n_max_matches: int = Field(
99, description="Maximum number of candidate CPE items that may be related to given certificate, >0", gt=0
99,
description="Maximum number of candidate CPE items that may be related to given certificate, >0",
gt=0,
)
cc_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/cc/dataset.json",
description="URL from where to fetch the latest snapshot of fully processed CC dataset.",
)
cc_latest_full_archive: AnyHttpUrl = Field(
"https://sec-certs.org/cc/cc.tar.gz",
description="URL from where to fetch the latest full archive of fully processed CC dataset.",
)
cc_maintenances_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/cc/maintenance_updates.json",
description="URL from where to fetch the latest snapshot of CC maintenance updates",
Expand All @@ -55,25 +63,36 @@ class Configuration(BaseSettings):
description="URL from where to fetch the latest snapshot of the PP dataset.",
)
fips_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/fips/dataset.json", description="URL for the latest snapshot of FIPS dataset."
"https://sec-certs.org/fips/dataset.json",
description="URL for the latest snapshot of FIPS dataset.",
)
fips_latest_full_archive: AnyHttpUrl = Field(
"https://sec-certs.org/fips/fips.tar.gz",
description="URL from where to fetch the latest full archive of fully processed FIPS dataset.",
)
fips_iut_dataset: AnyHttpUrl = Field(
"https://sec-certs.org/fips/iut/dataset.json", description="URL for the dataset of FIPS IUT data."
"https://sec-certs.org/fips/iut/dataset.json",
description="URL for the dataset of FIPS IUT data.",
)
fips_iut_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/fips/iut/latest.json", description="URL for the latest snapshot of FIPS IUT data."
"https://sec-certs.org/fips/iut/latest.json",
description="URL for the latest snapshot of FIPS IUT data.",
)
fips_mip_dataset: AnyHttpUrl = Field(
"https://sec-certs.org/fips/mip/dataset.json", description="URL for the dataset of FIPS MIP data"
"https://sec-certs.org/fips/mip/dataset.json",
description="URL for the dataset of FIPS MIP data",
)
fips_mip_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/fips/mip/latest.json", description="URL for the latest snapshot of FIPS MIP data"
"https://sec-certs.org/fips/mip/latest.json",
description="URL for the latest snapshot of FIPS MIP data",
)
cpe_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/vuln/cpe/cpe.json.gz", description="URL for the latest snapshot of CPEDataset."
"https://sec-certs.org/vuln/cpe/cpe.json.gz",
description="URL for the latest snapshot of CPEDataset.",
)
cve_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/vuln/cve/cve.json.gz", description="URL for the latest snapshot of CVEDataset."
"https://sec-certs.org/vuln/cve/cve.json.gz",
description="URL for the latest snapshot of CVEDataset.",
)
cpe_match_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/vuln/cpe/cpe_match.json.gz",
Expand All @@ -91,14 +110,16 @@ class Configuration(BaseSettings):
ge=0,
)
ignore_first_page: bool = Field(
True, description="During keyword search, first page usually contains addresses - ignore it."
True,
description="During keyword search, first page usually contains addresses - ignore it.",
)
cc_reference_annotator_dir: Optional[Path] = Field( # noqa: UP007
None,
description="Path to directory with serialized reference annotator model. If set to `null`, tool will search default directory for the given dataset.",
)
cc_reference_annotator_should_train: bool = Field(
True, description="True if new reference annotator model shall be build, False otherwise."
True,
description="True if new reference annotator model shall be build, False otherwise.",
)
cc_matching_threshold: int = Field(
90,
Expand All @@ -109,14 +130,15 @@ class Configuration(BaseSettings):
cc_use_proxy: bool = Field(False, description="Download CC artifacts through the sec-certs.org proxy.")
fips_use_proxy: bool = Field(False, description="Download FIPS artifacts through the sec-certs.org proxy.")
enable_progress_bars: bool = Field(
True, description="If true, progress bars will be printed to stdout during computation."
True,
description="If true, progress bars will be printed to stdout during computation.",
)
nvd_api_key: Optional[str] = Field(None, description="NVD API key for access to CVEs and CPEs.") # noqa: UP007
preferred_source_nvd_datasets: Literal["sec-certs", "api"] = Field(
"sec-certs",
description="If set to `sec-certs`, will fetch CPE and CVE datasets from sec-certs.org."
+ " If set to `api`, will fetch these resources from NVD API. It is advised to set an"
+ " `nvd_api_key` when setting this to `nvd`.",
+ " `nvd_api_key` when setting this to `api`.",
)

def _get_nondefault_keys(self) -> set[str]:
Expand Down
1 change: 1 addition & 0 deletions src/sec_certs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
REF_EMBEDDING_METHOD = Literal["tf_idf", "transformer"]


# HACK: dummy placeholder path that is not expected to exist; ideally this constant should be removed.
DUMMY_NONEXISTING_PATH = Path("/this/is/dummy/nonexisting/path")

RESPONSE_OK = 200
Expand Down
Loading

0 comments on commit 2a3d45c

Please sign in to comment.