Skip to content

Commit

Permalink
Allow row selection based on attached database
Browse files Browse the repository at this point in the history
  • Loading branch information
dspinellis committed Jan 29, 2023
1 parent 87b7a14 commit 87a9b39
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 14 deletions.
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,16 @@ SELECT COUNT(*) FROM funder_awards;
SELECT COUNT(*) FROM work_references;
```

### Record selection from external database
The following command creates an SQLite database with all Crossref data
of works whose DOI appears in the attached database named `selected.db`.
```sh
alexandria3k --data-source Crossref 'April 2022 Public Data File from Crossref' \
--populate-db-path selected-works.db \
--attach-databases 'attached:selected.db' \
--row-selection "EXISTS (SELECT 1 FROM attached.selected_dois WHERE works.doi = selected_dois.doi)"
```

### Populate the database with author records from ORCID
Only records of authors identified in the publications through an
ORCID will be added.
Expand Down Expand Up @@ -332,7 +342,8 @@ SELECT author_affiliations.name FROM
## Command-line options reference
<!-- CLI start -->
```
usage: alexandria3k [-h] [-c COLUMNS [COLUMNS ...]] [-D DEBUG [DEBUG ...]]
usage: alexandria3k [-h] [-a ATTACH_DATABASES [ATTACH_DATABASES ...]]
[-c COLUMNS [COLUMNS ...]] [-D DEBUG [DEBUG ...]]
[-d DATA_SOURCE [DATA_SOURCE ...]] [-E OUTPUT_ENCODING]
[-F FIELD_SEPARATOR] [-H] [-i [INDEX ...]]
[-L LIST_SCHEMA] [-l LINKED_RECORDS] [-n] [-o OUTPUT] [-P]
Expand All @@ -344,6 +355,8 @@ alexandria3k: Publication metadata interface
optional arguments:
-h, --help show this help message and exit
-a ATTACH_DATABASES [ATTACH_DATABASES ...], --attach-databases ATTACH_DATABASES [ATTACH_DATABASES ...]
Databases to attach for the row selection query
-c COLUMNS [COLUMNS ...], --columns COLUMNS [COLUMNS ...]
Columns to populate using table.column or table.*
-D DEBUG [DEBUG ...], --debug DEBUG [DEBUG ...]
Expand Down Expand Up @@ -496,6 +509,17 @@ crossref_instance.populate(
)
```

### Record selection from external database
The following code creates an SQLite database with all Crossref data
of works whose DOI appears in the attached database named `selected.db`.
```py
crossref_instance.populate(
"selected-works.db",
condition="EXISTS (SELECT 1 FROM attached.selected_dois WHERE works.doi = selected_dois.doi)",
attach_databases=["attached:selected.db"]
)
```

### Populate the database from ORCID
Add tables containing author country and education organization.
Only records of authors identified in the publications through an
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "alexandria3k"
version = "2.3.3"
version = "2.4.0"
authors = [
{ name="Diomidis Spinellis", email="[email protected]" },
]
Expand Down
12 changes: 11 additions & 1 deletion src/alexandria3k/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,13 @@ def sql_value(database, statement):
def parse_cli_arguments(parser, args=None):
"""Parse command line arguments (or args e.g. when testing)"""

parser.add_argument(
"-a",
"--attach-databases",
nargs="+",
type=str,
help="Databases to attach for the row selection query",
)
parser.add_argument(
"-c",
"--columns",
Expand Down Expand Up @@ -431,7 +438,10 @@ def main():

if crossref_instance and args.populate_db_path:
crossref_instance.populate(
args.populate_db_path, args.columns, args.row_selection
args.populate_db_path,
args.columns,
args.row_selection,
args.attach_databases,
)
debug.log("files-read", f"{FileCache.file_reads} files read")
perf.log("Crossref table population")
Expand Down
45 changes: 34 additions & 11 deletions src/alexandria3k/crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,7 @@ class Crossref:
"""Create a Crossref meta-data object that supports queries over its
(virtual) table and the population of an SQLite database with its
data"""
# pylint: disable=too-many-instance-attributes

def __init__(self, crossref_directory, sample=lambda n: True):
# A named in-memory database; it can be attached by name to others
Expand All @@ -899,6 +900,7 @@ def __init__(self, crossref_directory, sample=lambda n: True):
self.population_columns = {}
self.query_and_population_columns = {}
self.index_manager = None
self.attached_databases = []

for table in tables:
self.vdb.execute(
Expand Down Expand Up @@ -929,9 +931,13 @@ def trace_query_columns(query):
self.query_columns.
See https://rogerbinns.github.io/apsw/tips.html#parsing-sql"""

def authorizer(op_code, table, column, _database, _trigger):
def authorizer(op_code, table, column, database, _trigger):
"""Query authorizer to monitor used columns"""
if op_code == apsw.SQLITE_READ and column:
if (
op_code == apsw.SQLITE_READ
and column
and database not in self.attached_databases
):
# print(f"AUTH: adding {table}.{column}")
Crossref.add_column(self.query_columns, table, column)
return apsw.SQLITE_OK
Expand Down Expand Up @@ -1026,7 +1032,13 @@ def get_query_column_names(self):
"""Return the column names associated with an executing query"""
return [description[0] for description in self.cursor.description]

def populate(self, database_path, columns=None, condition=None):
def populate(
self,
database_path,
columns=None,
condition=None,
attach_databases=None,
):
"""Populate the specified SQLite database.
The database is created if it does not exist.
If it exists, the populated tables are dropped
Expand All @@ -1046,12 +1058,9 @@ def populate(self, database_path, columns=None, condition=None):
will only get populated with the records associated with the
corresponding main table.
indexes is an array of table_name(indexed_column...) strings,
that specifies indexes to be created before populating the tables.
The indexes can be used to speed up the evaluation of the population
conditions.
Note that foreign key indexes will always be created and need
not be specified.
attach_databases is a list of "name:path" strings, each specifying
the name under which a database is attached and the path of that
database. The databases are attached and made available to the row
selection query.
"""

# pylint: disable=too-many-statements
Expand Down Expand Up @@ -1175,7 +1184,7 @@ def populate_table(table, partition_index, condition):
self.vdb.execute(log_sql(statement))
perf.log(f"Populate {table}")

def create_database_schema(columns):
def create_database_schema(columns, attach_databases):
"""Create the populated database, if needed"""
if not os.path.exists(database_path):
pdb = sqlite3.connect(database_path)
Expand All @@ -1186,6 +1195,18 @@ def create_database_schema(columns):
)
set_fast_writing(self.vdb)

for db_spec in attach_databases:
try:
(db_name, db_path) = db_spec.split(":")
except ValueError:
fail(
f"Invalid database specification: '{db_spec}'; expected name:path"
)
self.vdb.execute(
log_sql(f"ATTACH DATABASE '{db_path}' AS {db_name}")
)
self.attached_databases.append(db_name)

self.index_manager = IndexManager(self.vdb)

add_columns(
Expand Down Expand Up @@ -1262,7 +1283,9 @@ def create_matched_tables(matched_tables):

perf.log("Matched table creation")

create_database_schema(columns)
if attach_databases is None:
attach_databases = []
create_database_schema(columns, attach_databases)
# Populate all tables from the records of each file in sequence.
# This improves the locality of reference and through the constraint
# indexing and the file cache avoids opening, reading, decompressing,
Expand Down
39 changes: 39 additions & 0 deletions tests/test_crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from alexandria3k.file_cache import FileCache

DATABASE_PATH = "tests/tmp/crossref.db"
ATTACHED_DATABASE_PATH = "tests/tmp/attached.db"


class TestDoiNormalize(unittest.TestCase):
Expand Down Expand Up @@ -503,3 +504,41 @@ def test_work_column_subset_condition(self):
),
5,
)

class TestCrossrefPopulateAttachedDatabaseCondition(TestCrossrefPopulate):
    """Verify population with a row selection condition that refers to
    a table residing in an attached database"""

    # The single DOI stored in the attached database; the condition
    # passed to populate() selects only works whose DOI appears there.
    SELECTED_DOI = "10.1016/j.bjps.2022.04.046"

    @classmethod
    def setUpClass(cls):
        """Create the attached database holding the selected DOI and
        populate the test database with the works matching it."""
        ensure_unlinked(DATABASE_PATH)
        # Reset the counter, so that test_counts sees only this run's reads
        FileCache.file_reads = 0

        # Create and populate attached database
        ensure_unlinked(ATTACHED_DATABASE_PATH)
        attached = sqlite3.connect(ATTACHED_DATABASE_PATH)
        try:
            attached.execute("CREATE TABLE s_works(doi)")
            attached.execute(
                "INSERT INTO s_works VALUES(?)", (cls.SELECTED_DOI,)
            )
            attached.commit()
        finally:
            # Release the file handle even if the statements above fail,
            # so that the database file can be removed or re-created
            attached.close()

        # debug.set_flags(["log-sql"])
        cls.crossref = crossref.Crossref("tests/data/sample")
        cls.crossref.populate(
            DATABASE_PATH,
            ["works.doi"],
            "EXISTS (SELECT 1 FROM attached.s_works WHERE works.doi = s_works.doi)",
            [f"attached:{ATTACHED_DATABASE_PATH}"],
        )
        cls.con = sqlite3.connect(DATABASE_PATH)
        cls.cursor = cls.con.cursor()

    @classmethod
    def tearDownClass(cls):
        """Close the populated database and remove both database files."""
        cls.con.close()
        os.unlink(DATABASE_PATH)
        os.unlink(ATTACHED_DATABASE_PATH)

    def test_counts(self):
        """Exactly one work matches the DOI in the attached database."""
        self.assertEqual(self.record_count("works"), 1)
        self.assertEqual(FileCache.file_reads, 8)

0 comments on commit 87a9b39

Please sign in to comment.