Allow row selection based on attached database

dspinellis · Jan 29, 2023 · 87a9b39 · 87a9b39
1 parent 87b7a14
commit 87a9b39
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -246,6 +246,16 @@ SELECT COUNT(*) FROM funder_awards;
 SELECT COUNT(*) FROM work_references;
 ```
 
+### Record selection from external database
+The following command creates an SQLite database with all Crossref data
+of works whose DOI appears in the attached database named `selected.db`.
+```sh
+alexandria3k --data-source Crossref 'April 2022 Public Data File from Crossref' \
+   --populate-db-path selected-works.db \
+   --attach-databases 'attached:selected.db' \
+   --row-selection "EXISTS (SELECT 1 FROM attached.selected_dois WHERE works.doi = selected_dois.doi)"
+```
+
 ### Populate the database with author records from ORCID
 Only records of authors identified in the publications through an
 ORCID will be added.
@@ -332,7 +342,8 @@ SELECT author_affiliations.name FROM
 ## Command-line options reference
 <!-- CLI start -->
 ```
-usage: alexandria3k [-h] [-c COLUMNS [COLUMNS ...]] [-D DEBUG [DEBUG ...]]
+usage: alexandria3k [-h] [-a ATTACH_DATABASES [ATTACH_DATABASES ...]]
+                    [-c COLUMNS [COLUMNS ...]] [-D DEBUG [DEBUG ...]]
                     [-d DATA_SOURCE [DATA_SOURCE ...]] [-E OUTPUT_ENCODING]
                     [-F FIELD_SEPARATOR] [-H] [-i [INDEX ...]]
                     [-L LIST_SCHEMA] [-l LINKED_RECORDS] [-n] [-o OUTPUT] [-P]
@@ -344,6 +355,8 @@ alexandria3k: Publication metadata interface
 
 optional arguments:
   -h, --help            show this help message and exit
+  -a ATTACH_DATABASES [ATTACH_DATABASES ...], --attach-databases ATTACH_DATABASES [ATTACH_DATABASES ...]
+                        Databases to attach for the row selection query
   -c COLUMNS [COLUMNS ...], --columns COLUMNS [COLUMNS ...]
                         Columns to populate using table.column or table.*
   -D DEBUG [DEBUG ...], --debug DEBUG [DEBUG ...]
@@ -496,6 +509,17 @@ crossref_instance.populate(
 )
 ```
 
+### Record selection from external database
+The following code creates an SQLite database with all Crossref data
+of works whose DOI appears in the attached database named `selected.db`.
+```py
+crossref_instance.populate(
+    "selected-works.db",
+    condition="EXISTS (SELECT 1 FROM attached.selected_dois WHERE works.doi = selected_dois.doi)",
+    attach_databases=["attached:selected.db"],
+)
+```
+
 ### Populate the database from ORCID
 Add tables containing author country and education organization.
 Only records of authors identified in the publications through an

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "alexandria3k"
-version = "2.3.3"
+version = "2.4.0"
 authors = [
   { name="Diomidis Spinellis", email="[email protected]" },
 ]

diff --git a/src/alexandria3k/__main__.py b/src/alexandria3k/__main__.py
@@ -173,6 +173,13 @@ def sql_value(database, statement):
 def parse_cli_arguments(parser, args=None):
     """Parse command line arguments (or args e.g. when testing)"""
 
+    parser.add_argument(
+        "-a",
+        "--attach-databases",
+        nargs="+",
+        type=str,
+        help="Databases to attach for the row selection query",
+    )
     parser.add_argument(
         "-c",
         "--columns",
@@ -431,7 +438,10 @@ def main():
 
     if crossref_instance and args.populate_db_path:
         crossref_instance.populate(
-            args.populate_db_path, args.columns, args.row_selection
+            args.populate_db_path,
+            args.columns,
+            args.row_selection,
+            args.attach_databases,
         )
         debug.log("files-read", f"{FileCache.file_reads} files read")
         perf.log("Crossref table population")

diff --git a/src/alexandria3k/crossref.py b/src/alexandria3k/crossref.py
@@ -879,6 +879,7 @@ class Crossref:
     """Create a Crossref meta-data object that supports queries over its
     (virtual) table and the population of an SQLite database with its
     data"""
+    # pylint: disable=too-many-instance-attributes
 
     def __init__(self, crossref_directory, sample=lambda n: True):
         # A named in-memory database; it can be attached by name to others
@@ -899,6 +900,7 @@ def __init__(self, crossref_directory, sample=lambda n: True):
         self.population_columns = {}
         self.query_and_population_columns = {}
         self.index_manager = None
+        self.attached_databases = []
 
         for table in tables:
             self.vdb.execute(
@@ -929,9 +931,13 @@ def trace_query_columns(query):
             self.query_columns.
             See https://rogerbinns.github.io/apsw/tips.html#parsing-sql"""
 
-            def authorizer(op_code, table, column, _database, _trigger):
+            def authorizer(op_code, table, column, database, _trigger):
                 """Query authorizer to monitor used columns"""
-                if op_code == apsw.SQLITE_READ and column:
+                if (
+                    op_code == apsw.SQLITE_READ
+                    and column
+                    and database not in self.attached_databases
+                ):
                     # print(f"AUTH: adding {table}.{column}")
                     Crossref.add_column(self.query_columns, table, column)
                 return apsw.SQLITE_OK
@@ -1026,7 +1032,13 @@ def get_query_column_names(self):
         """Return the column names associated with an executing query"""
         return [description[0] for description in self.cursor.description]
 
-    def populate(self, database_path, columns=None, condition=None):
+    def populate(
+        self,
+        database_path,
+        columns=None,
+        condition=None,
+        attach_databases=None,
+    ):
         """Populate the specified SQLite database.
         The database is created if it does not exist.
         If it exists, the populated tables are dropped
@@ -1046,12 +1058,9 @@ def populate(self, database_path, columns=None, condition=None):
         will only get populated with the records associated with the
         corresponding main table.
 
-        indexes is an array of table_name(indexed_column...)  strings,
-        that specifies indexes to be created before populating the tables.
-        The indexes can be used to speed up the evaluation of the population
-        conditions.
-        Note that foreign key indexes will always be created and need
-        not be specified.
+        attach_databases is a list of colon-joined name:path strings,
+        each specifying a database name and its path.  These are attached
+        and made available to the row selection query.
         """
 
         # pylint: disable=too-many-statements
@@ -1175,7 +1184,7 @@ def populate_table(table, partition_index, condition):
             self.vdb.execute(log_sql(statement))
             perf.log(f"Populate {table}")
 
-        def create_database_schema(columns):
+        def create_database_schema(columns, attach_databases):
             """Create the populated database, if needed"""
             if not os.path.exists(database_path):
                 pdb = sqlite3.connect(database_path)
@@ -1186,6 +1195,18 @@ def create_database_schema(columns):
             )
             set_fast_writing(self.vdb)
 
+            for db_spec in attach_databases:
+                try:
+                    (db_name, db_path) = db_spec.split(":")
+                except ValueError:
+                    fail(
+                        f"Invalid database specification: '{db_spec}'; expected name:path"
+                    )
+                self.vdb.execute(
+                    log_sql(f"ATTACH DATABASE '{db_path}' AS {db_name}")
+                )
+                self.attached_databases.append(db_name)
+
             self.index_manager = IndexManager(self.vdb)
 
             add_columns(
@@ -1262,7 +1283,9 @@ def create_matched_tables(matched_tables):
 
             perf.log("Matched table creation")
 
-        create_database_schema(columns)
+        if attach_databases is None:
+            attach_databases = []
+        create_database_schema(columns, attach_databases)
         # Populate all tables from the records of each file in sequence.
         # This improves the locality of reference and through the constraint
         # indexing and the file cache avoids opening, reading, decompressing,

diff --git a/tests/test_crossref.py b/tests/test_crossref.py
@@ -31,6 +31,7 @@
 from alexandria3k.file_cache import FileCache
 
 DATABASE_PATH = "tests/tmp/crossref.db"
+ATTACHED_DATABASE_PATH = "tests/tmp/attached.db"
 
 
 class TestDoiNormalize(unittest.TestCase):
@@ -503,3 +504,41 @@ def test_work_column_subset_condition(self):
                 ),
                 5,
             )
+
+class TestCrossrefPopulateAttachedDatabaseCondition(TestCrossrefPopulate):
+    """Verify population with a row selection condition on an attached database"""
+
+    @classmethod
+    def setUpClass(cls):
+        ensure_unlinked(DATABASE_PATH)
+        FileCache.file_reads = 0
+
+        # Create and populate attached database
+        ensure_unlinked(ATTACHED_DATABASE_PATH)
+        attached = sqlite3.connect(ATTACHED_DATABASE_PATH)
+        attached.execute("CREATE TABLE s_works(doi)")
+        attached.execute("INSERT INTO s_works VALUES('10.1016/j.bjps.2022.04.046')")
+        attached.commit()
+        attached.close()
+
+        # debug.set_flags(["log-sql"])
+        cls.crossref = crossref.Crossref("tests/data/sample")
+        cls.crossref.populate(
+            DATABASE_PATH,
+            ["works.doi"],
+            "EXISTS (SELECT 1 FROM attached.s_works WHERE works.doi = s_works.doi)",
+            [f"attached:{ATTACHED_DATABASE_PATH}"]
+        )
+        cls.con = sqlite3.connect(DATABASE_PATH)
+        cls.cursor = cls.con.cursor()
+
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.con.close()
+        os.unlink(DATABASE_PATH)
+        os.unlink(ATTACHED_DATABASE_PATH)
+
+    def test_counts(self):
+        self.assertEqual(self.record_count("works"), 1)
+        self.assertEqual(FileCache.file_reads, 8)