diff --git a/.gitignore b/.gitignore index d18021646..2bb9b7ddd 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,6 @@ utilities/rdbmigration/.work **/.classpath **/.project **/bin/ +/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/logs/ +/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/data/ +/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/previous-harvest/ diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/diff-additions.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/diff-additions.config.xml new file mode 100644 index 000000000..5da668a40 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/diff-additions.config.xml @@ -0,0 +1,87 @@ + + + + + + + + harvested-data.model.xml + + + + previous-harvest.model.xml + + + + + filename=data/vivo-additions.rdf.xml + + + + + + + + + + + + + + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/diff-subtractions.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/diff-subtractions.config.xml new file mode 100644 index 000000000..2b1455071 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/diff-subtractions.config.xml @@ -0,0 +1,86 @@ + + + + + + + + previous-harvest.model.xml + + + + harvested-data.model.xml + + + + filename=data/vivo-subtractions.rdf.xml + + + + + + + + + + + + + + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/harvested-data.model.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/harvested-data.model.xml new file mode 100644 index 000000000..e722f84a1 --- /dev/null +++ 
b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/harvested-data.model.xml @@ -0,0 +1,111 @@ + + + + + + + tdb + data/harvested-data/ + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetch.bat b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetch.bat new file mode 100644 index 000000000..781a5e826 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetch.bat @@ -0,0 +1,93 @@ +@echo off + +IF exist data ( + rmdir /s /q data +) + +IF exist logs ( + rmdir /s /q logs +) + +REM set to the directory where the harvester was installed or unpacked +REM HARVESTER_INSTALL_DIR is set to the location of the installed harvester +REM If the deb file was used to install the harvester then the +REM directory should be set to /usr/share/vivo/harvester which is the +REM current location associated with the deb installation. +REM Since it is also possible the harvester was installed by +REM uncompressing the tar.gz the setting is available to be changed +REM and should agree with the installation location +set HARVESTER_INSTALL_DIR=C:\Users\KampeB\Dev\Harvester +set HARVEST_NAME=OpenAlex-Fetch +FOR %%A IN (%Date:/=%) DO SET Today=%%A + +REM set the CLASSPATH and HARVESTER_JAVA_OPTS to be used by all commands +set CLASSPATH=%HARVESTER_INSTALL_DIR%/build/harvester.jar;%HARVESTER_INSTALL_DIR%/build/dependency/* +set HARVESTER_JAVA_OPTS=-Xms1024M -Xmx2048M + +REM Execute Fetch +REM This stage of the script is where the information is gathered together into one local +REM place to facilitate the further steps of the harvest. The data is stored locally +REM in a format based off of the source. 
The format is a form of RDF but not in the VIVO ontology +echo Fetch from OpenAlex +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.fetch.JSONFetch -X openalexfetch.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Execute Translate +REM This is the part of the script where the input data is transformed into valid RDF +REM Translate will apply an xslt file to the fetched data which will result in the data +REM becoming valid RDF in the VIVO ontology +echo Translate data to valid RDF +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.translate.XSLTranslator -X xsltranslator.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Execute Transfer to import from record handler into local temp model +REM From this stage on the script places the data into a Jena model. A model is a +REM data storage structure similar to a database, but in RDF. +REM The harvester tool Transfer is used to move/add/remove/dump data in models. +echo Transfer RDF into temporary triple store +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -X transfer.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Perform an update +REM The harvester maintains copies of previous harvests in order to perform the same harvest twice +REM but only add the new statements, while removing the old statements that are no longer +REM contained in the input data. This is done in several steps of finding the old statements, +REM then the new statements, and then applying them to the Vivo main model. + +REM Find Subtractions +REM When making the previous harvest model agree with the current harvest, the statements that exist in +REM the previous harvest but not in the current harvest need to be identified for removal. 
+echo Find Subtractions +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.diff.Diff -X diff-subtractions.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Find Additions +REM When making the previous harvest model agree with the current harvest, the statements that exist in +REM the current harvest but not in the previous harvest need to be identified for addition. +echo Find Additions +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.diff.Diff -X diff-additions.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Apply Subtractions to Previous model +echo Apply Subtractions to Previous model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o previous-harvest.model.xml -r data/vivo-subtractions.rdf.xml -m + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Apply Additions to Previous model +echo Apply Additions to Previous model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o previous-harvest.model.xml -r data/vivo-additions.rdf.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Now that the changes have been applied to the previous harvest and the harvested data in vivo +REM agree with the previous harvest, the changes are now applied to the vivo model. 
+REM Apply Subtractions to VIVO model +echo Apply Subtractions to VIVO model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Apply Additions to VIVO model +echo Apply Additions to VIVO model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-additions.rdf.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +echo Harvest completed successfully diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetch.sh b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetch.sh new file mode 100644 index 000000000..b9178031e --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetch.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +export HARVESTER_INSTALL_DIR=/home/kampeb/vivo-fid-bau-1-12/openalex-harvester/ +export HARVEST_NAME=OpenAlex-Harvest +export DATE=`date +%Y-%m-%d'T'%T` + +# Add harvester binaries to path for execution +# The tools within this script refer to binaries supplied within the harvester +# Since they can be located in another directory their path should be +# included within the classpath and the path environment variables. +export PATH=$PATH:$HARVESTER_INSTALL_DIR/bin +export CLASSPATH=$HARVESTER_INSTALL_DIR/build/harvester.jar:$HARVESTER_INSTALL_DIR/build/dependency/* + +# Exit on first error +# The -e flag prevents the script from continuing even though a tool fails. +# Continuing after a tool failure is undesirable since the harvested +# data could be rendered corrupted and incompatible. +set -e + +# Supply the location of the detailed log file which is generated during the script. 
+# If there is an issue with a harvest, this file proves invaluable in finding +# a solution to the problem. It has become common practice in addressing a problem +# to request this file. The passwords and usernames are filtered out of this file +# to prevent these logs from containing sensitive information. +echo "Full Logging in $HARVEST_NAME.$DATE.log" +if [ ! -d logs ]; then + mkdir logs +fi +cd logs +touch $HARVEST_NAME.$DATE.log +ln -sf $HARVEST_NAME.$DATE.log $HARVEST_NAME.latest.log +cd .. + +#clear old data + +# For a fresh harvest, the removal of the previous information maintains data integrity. +# If you are continuing a partial run or wish to use the old and already retrieved +# data, you will want to comment out this line since it could prevent you from having +# the required harvest data. +rm -rf data + +# Execute Fetch +# This stage of the script is where the information is gathered together into one local +# place to facilitate the further steps of the harvest. The data is stored locally +# in a format based off of the source. The format is a form of RDF but not in the VIVO ontology +# The JSONFetch tool reads the JSON served by the URL given in its configuration XML +# file, selects the nodes matched by the configured JSONPath and writes each one to the +# raw record set as a flat RDF record. +echo Execute jsonfetch from OpenAlex.org +harvester-jsonfetch -w DEBUG -X openAlexfetch.config.xml + +# Execute Translate +# This is the part of the script where the input data is transformed into valid RDF +# Translate will apply an xslt file to the fetched data which will result in the data +# becoming valid RDF in the VIVO ontology +echo Execute translate +harvester-xsltranslator -X xsltranslator.config.xml + +# Execute Transfer to import from record handler into local temp model +# From this stage on the script places the data into a Jena model. A model is a +# data storage structure similar to a database, but in RDF.
+# The harvester tool Transfer is used to move/add/remove/dump data in models. +# For this call on the transfer tool: +# -s refers to the source translated records file, which was just produced by the translator step +# -o refers to the destination model for harvested data +# -d means that this call will also produce a text dump file in the specified location +echo Execute initial transfer to triple store +harvester-transfer -s translated-records.config.xml -o harvested-data.model.xml -d data/harvested-data/imported-records.rdf.xml + +# Perform an update +# The harvester maintains copies of previous harvests in order to perform the same harvest twice +# but only add the new statements, while removing the old statements that are no longer +# contained in the input data. This is done in several steps of finding the old statements, +# then the new statements, and then applying them to the Vivo main model. + +# Find Subtractions +# When making the previous harvest model agree with the current harvest, the statements that exist in +# the previous harvest but not in the current harvest need to be identified for removal. +echo Find Subtractions +harvester-diff -X diff-subtractions.config.xml + +# Find Additions +# When making the previous harvest model agree with the current harvest, the statements that exist in +# the current harvest but not in the previous harvest need to be identified for addition. +echo Find Additions +harvester-diff -X diff-additions.config.xml + +# Apply Subtractions to Previous model +echo Apply Subtractions to Previous model +harvester-transfer -o previous-harvest.model.xml -r data/vivo-subtractions.rdf.xml -m +# Apply Additions to Previous model +echo Apply Additions to Previous model +harvester-transfer -o previous-harvest.model.xml -r data/vivo-additions.rdf.xml + +# Now that the changes have been applied to the previous harvest and the harvested data in vivo +# agree with the previous harvest, the changes are now applied to the vivo model. 
+# Apply Subtractions to VIVO +echo Apply Subtractions to VIVO +harvester-transfer -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m +# Apply Additions to VIVO +echo Apply Additions to VIVO +harvester-transfer -o vivo.model.xml -r data/vivo-additions.rdf.xml + +#Output some counts +ORGS=`cat data/vivo-additions.rdf.xml | grep 'http://xmlns.com/foaf/0.1/Organization' | wc -l` +PEOPLE=`cat data/vivo-additions.rdf.xml | grep 'http://xmlns.com/foaf/0.1/Person' | wc -l` +POSITIONS=`cat data/vivo-additions.rdf.xml | grep 'positionForPerson' | wc -l` +echo "Imported $ORGS organizations, $PEOPLE people, and $POSITIONS positions" + +echo 'Harvest completed successfully' \ No newline at end of file diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetchPartly.sh b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetchPartly.sh new file mode 100644 index 000000000..888a2b0cd --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexFetchPartly.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +export HARVESTER_INSTALL_DIR=/home/kampeb/vivo-fid-bau-1-12/openalex-harvester/ +export HARVEST_NAME=OpenAlex-Harvest +export DATE=`date +%Y-%m-%d'T'%T` + +# Add harvester binaries to path for execution +# The tools within this script refer to binaries supplied within the harvester +# Since they can be located in another directory their path should be +# included within the classpath and the path environment variables. +export PATH=$PATH:$HARVESTER_INSTALL_DIR/bin +export CLASSPATH=$HARVESTER_INSTALL_DIR/build/harvester.jar:$HARVESTER_INSTALL_DIR/build/dependency/* + +# Exit on first error +# The -e flag prevents the script from continuing even though a tool fails. +# Continuing after a tool failure is undesirable since the harvested +# data could be rendered corrupted and incompatible.
+set -e + +# Supply the location of the detailed log file which is generated during the script. +# If there is an issue with a harvest, this file proves invaluable in finding +# a solution to the problem. It has become common practice in addressing a problem +# to request this file. The passwords and usernames are filtered out of this file +# to prevent these logs from containing sensitive information. +echo "Full Logging in $HARVEST_NAME.$DATE.log" +if [ ! -d logs ]; then + mkdir logs +fi +cd logs +touch $HARVEST_NAME.$DATE.log +ln -sf $HARVEST_NAME.$DATE.log $HARVEST_NAME.latest.log +cd .. + +#clear old data + +# For a fresh harvest, the removal of the previous information maintains data integrity. +# If you are continuing a partial run or wish to use the old and already retrieved +# data, you will want to comment out this line since it could prevent you from having +# the required harvest data. +#rm -rf data + +# Execute Fetch +# This stage of the script is where the information is gathered together into one local +# place to facilitate the further steps of the harvest. The data is stored locally +# in a format based off of the source. The format is a form of RDF but not in the VIVO ontology +# The JSONFetch tool reads the JSON served by the URL given in its configuration XML +# file, selects the nodes matched by the configured JSONPath and writes each one to the +# raw record set as a flat RDF record.
+#echo Execute jsonfetch from OpenAlex.org +#harvester-jsonfetch -w DEBUG -X openAlexfetch.config.xml + +# Execute Translate +# This is the part of the script where the input data is transformed into valid RDF +# Translate will apply an xslt file to the fetched data which will result in the data +# becoming valid RDF in the VIVO ontology +#echo Execute translate +#harvester-xsltranslator -X xsltranslator.config.xml + +# Execute Transfer to import from record handler into local temp model +# From this stage on the script places the data into a Jena model. A model is a +# data storage structure similar to a database, but in RDF. +# The harvester tool Transfer is used to move/add/remove/dump data in models. +# For this call on the transfer tool: +# -s refers to the source translated records file, which was just produced by the translator step +# -o refers to the destination model for harvested data +# -d means that this call will also produce a text dump file in the specified location +#echo Execute initial transfer to triple store +#harvester-transfer -s translated-records.config.xml -o harvested-data.model.xml -d data/harvested-data/imported-records.rdf.xml + +# Perform an update +# The harvester maintains copies of previous harvests in order to perform the same harvest twice +# but only add the new statements, while removing the old statements that are no longer +# contained in the input data. This is done in several steps of finding the old statements, +# then the new statements, and then applying them to the Vivo main model. + +# Find Subtractions +# When making the previous harvest model agree with the current harvest, the statements that exist in +# the previous harvest but not in the current harvest need to be identified for removal. 
+#echo Find Subtractions +#harvester-diff -X diff-subtractions.config.xml + +# Find Additions +# When making the previous harvest model agree with the current harvest, the statements that exist in +# the current harvest but not in the previous harvest need to be identified for addition. +#echo Find Additions +#harvester-diff -X diff-additions.config.xml + +# Apply Subtractions to Previous model +#echo Apply Subtractions to Previous model +#harvester-transfer -o previous-harvest.model.xml -r data/vivo-subtractions.rdf.xml -m +# Apply Additions to Previous model +#echo Apply Additions to Previous model +#harvester-transfer -o previous-harvest.model.xml -r data/vivo-additions.rdf.xml + +# Now that the changes have been applied to the previous harvest and the harvested data in vivo +# agree with the previous harvest, the changes are now applied to the vivo model. +# Apply Subtractions to VIVO +echo Apply Subtractions to VIVO +harvester-transfer -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m +# Apply Additions to VIVO +echo Apply Additions to VIVO +harvester-transfer -o vivo.model.xml -r data/vivo-additions.rdf.xml + +# Output some counts +ORGS=`cat data/vivo-additions.rdf.xml | grep 'http://xmlns.com/foaf/0.1/Organization' | wc -l` +PEOPLE=`cat data/vivo-additions.rdf.xml | grep 'http://xmlns.com/foaf/0.1/Person' | wc -l` +POSITIONS=`cat data/vivo-additions.rdf.xml | grep 'positionForPerson' | wc -l` +echo "Imported $ORGS organizations, $PEOPLE people, and $POSITIONS positions" + +echo 'Harvest completed successfully' diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexfetch.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexfetch.config.xml new file mode 100644 index 000000000..8f0db16db --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openAlexfetch.config.xml @@ -0,0
+1,27 @@ + + + + # harvesting publications from TIB – Leibniz Information Centre for Science and Technology. exchange the ROR ID to test with your institution. + https://api.openalex.org/works?filter=authorships.institutions.ror:https://ror.org/04aj4c181&per-page=200&mailto=forschungsatlas@tib.eu&cursor=* + + # harvesting of publications from the subfield "Semantic Web and Ontology Develoment", should result in ca. 9-10k publications + + + # harvesting of publications from the journal "Quantitative Science Studies". + + + # FIDBAU + + + raw-records.config.xml + http://vivo.example.com/harvest/aims_users/ + publication + uid + $.results + INFO + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openalex-to-vivo.datamap.xsl b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openalex-to-vivo.datamap.xsl new file mode 100644 index 000000000..f318495de --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/openalex-to-vivo.datamap.xsl @@ -0,0 +1,1513 @@ + + + + + + + https://forschungsatlas.fid-bau.de/individual/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + T00:00:00 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Authorship for + + + Authorship for + + + + + + + + + + + + , + + + + + + + + + + + + vCard for: + + + + + + + + + + + vCard name for: + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + journal-article + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/previous-harvest.model.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/previous-harvest.model.xml new file mode 100644 index 000000000..6542cb903 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/previous-harvest.model.xml @@ -0,0 +1,10 @@ + + + + tdb + previous-harvest + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/raw-records.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/raw-records.config.xml new file mode 100644 index 000000000..8fd6c045f --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/raw-records.config.xml @@ -0,0 +1,299 @@ + + + + + + + + + + + + + + + + + + + + + + + + org.vivoweb.harvester.util.repo.TextFileRecordHandler + data/raw-records + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/transfer.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/transfer.config.xml new file mode 100644 index 000000000..5a46d1df1 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/transfer.config.xml @@ -0,0 +1,38 @@ + + + + + INFO + + + translated-records.config.xml + + + harvested-data.model.xml + data/harvested-data/imported-records.rdf.xml + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/translated-records.config.xml 
b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/translated-records.config.xml new file mode 100644 index 000000000..4835eda84 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/translated-records.config.xml @@ -0,0 +1,299 @@ + + + + + + + + + + + + + + + + + + + + + + + + org.vivoweb.harvester.util.repo.TextFileRecordHandler + data/translated-records + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/usage.txt b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/usage.txt new file mode 100644 index 000000000..efb259765 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/usage.txt @@ -0,0 +1,20 @@ +usage: JSONFetch + -d,--description a descriptive name for the json + object [have multiple -d for more + names] + -f,--file file containing json + -h,--help Help Message + -i,--id a single id for the json object + [have multiple -i for more ids] + -n,--namespaceBase the base namespace to use for each + node created + -o,--output RecordHandler config file path + -O,--outputOverride override the RH_PARAM of output + recordhandler using VALUE + -p,--path a single path for the json object + [have multiple -p for more json + paths] + -u,--url url which produces json + -w,--wordiness Set the console log level + -X,--config XML Configuration File + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/vivo.model.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/vivo.model.xml new file mode 100644 index 000000000..274049f41 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/vivo.model.xml @@ -0,0 +1,104 @@ + + + + + tdb + /opt/data/forschungsatlas/tdbContentModels + 
http://vitro.mannlib.cornell.edu/default/vitro-kb-2 + \ No newline at end of file diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/xsltranslator.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/xsltranslator.config.xml new file mode 100644 index 000000000..d3a824fb2 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.13-1.15-examples/example-openalex/xsltranslator.config.xml @@ -0,0 +1,17 @@ + + + + INFO + + raw-records.config.xml + + + translated-records.config.xml + + + openalex-to-vivo.datamap.xsl + diff --git a/src/main/java/org/vivoweb/harvester/fetch/JSONFetch.java b/src/main/java/org/vivoweb/harvester/fetch/JSONFetch.java index 2c47bfaee..5c1dc0915 100644 --- a/src/main/java/org/vivoweb/harvester/fetch/JSONFetch.java +++ b/src/main/java/org/vivoweb/harvester/fetch/JSONFetch.java @@ -5,18 +5,18 @@ ******************************************************************************/ package org.vivoweb.harvester.fetch; -import java.io.IOException; +import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import net.minidev.json.JSONObject; import net.minidev.json.JSONArray; +import net.minidev.json.parser.JSONParser; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.vivoweb.harvester.util.FileAide; -import org.vivoweb.harvester.util.InitLog; -import org.vivoweb.harvester.util.SpecialEntities; -import org.vivoweb.harvester.util.WebAide; +import org.vivoweb.harvester.util.*; import org.vivoweb.harvester.util.args.ArgDef; import org.vivoweb.harvester.util.args.ArgList; import org.vivoweb.harvester.util.args.ArgParser; @@ -289,8 +289,9 @@ private String buildNodeTypeNS(String nodeName) { * @throws IOException error getting recrords */ public void execute() throws IOException { - + JSONParser parser = new 
JSONParser(); String jsonpath = new String(); + List nodes = null; try { XMLRecordOutputStream xmlRos = xmlRosBase.clone(); @@ -298,42 +299,60 @@ public void execute() throws IOException { // Get json contents as String, check for url first then a file - String jsonString = new String(); + String jsonString = null; if (this.strAddress == null) { - System.out.println(getParser().getUsage()); - System.exit(1); + System.out.println(getParser().getUsage()); + System.exit(1); } - if (this.strAddress.startsWith("http:")) { - jsonString = WebAide.getURLContents(this.strAddress); + if (this.strAddress.startsWith("http") && this.strAddress.contains("cursor=")) { + nodes = paging(); + } else if (this.strAddress.startsWith("http") && !this.strAddress.contains("cursor=")) { + log.debug("URL: " + this.strAddress); + jsonString = WebAide.getURLContents(this.strAddress); } else { - jsonString = FileAide.getTextContent(this.strAddress); + jsonString = FileAide.getTextContent(this.strAddress); } //log.info(jsonString); - for (int i=0; i < this.nodeNames.length ; i++) { + for (int i = 0; i < this.nodeNames.length; i++) { String name = this.nodeNames[i]; String id = this.idStrings[i]; jsonpath = this.pathStrings[i]; - log.info("Using path: "+ jsonpath); + log.info("Using path: " + jsonpath); JsonPath path = JsonPath.compile(jsonpath); - log.info("got jsonpath: "+ path.getPath()); - List nodes = path.read(jsonString); - log.info("name: "+ name); + log.info("got jsonpath: " + path.getPath()); + if (jsonString != null) { + log.info(jsonString); + nodes = path.read(jsonString); + } + log.info("name: " + name); //log.info("id: "+ id); log.info("num nodes: " + nodes.size()); int count = 0; - for (Object o: nodes) { - JSONObject jsonObject = (JSONObject) o; + Iterator itr = nodes.iterator(); + + while (itr.hasNext()) { + Double relevantScore = 0.0; + JSONObject jsonObject = (JSONObject) parser.parse(itr.next().toString()); + JSONArray conceptArray = (JSONArray) jsonObject.get("concepts"); + 
+ if (jsonObject.get("title") == null) { + itr.remove(); + } + } + + for (Object o : nodes) { + JSONObject jsonObject = (JSONObject) parser.parse(o.toString()); Iterator iter = jsonObject.keySet().iterator(); StringBuilder sb = new StringBuilder(); //log.info("fixedkey: "+ fixedkey); StringBuilder recID = new StringBuilder(); recID.append("node_-_"); - recID.append(String.valueOf(count)); + recID.append(count); - //log.trace("Creating RDF for "+name+": "+recID); + log.trace("Creating RDF for "+name+": "+recID); // Build RDF BEGIN // Header info String nodeNS = "node-" + name; @@ -363,68 +382,230 @@ public void execute() throws IOException { String key = (String) iter.next(); Object val = jsonObject.get(key); if (val == null) { - val = ""; + val = ""; } //log.info("val type for key: "+key+ ": "+val.getClass().getName()); - String fixedkey = key.replaceAll(" ","_"); - String field = nodeNS + ":" + fixedkey; - sb.append(getFieldXml(field, val)); + String fixedkey = key + .replaceAll(" |/", "_") + .replaceAll("\\(|\\)", "") + .replaceAll("/", "_"); + if (!Character.isDigit(fixedkey.charAt(0)) && !fixedkey.equals("abstract_inverted_index")) { + // Confident JSON node names contain "Event:A6bdb69a-e51d-42d7-bd25-62ec3c40b7e8" + if (fixedkey.contains(":")) + fixedkey = fixedkey.substring(fixedkey.indexOf(":") + 1); + String field = nodeNS + ":" + fixedkey; + sb.append(getFieldXml(field, val, fixedkey)); + } + } + // Record info END + sb.append(" \n"); + + // Footer info + sb.append(""); + // Build RDF END + + // Write RDF to RecordHandler + //log.trace("Adding record: " + fixedkey + "_" + recID); + //log.trace("data: "+ sb.toString()); + //log.info("rhOutput: "+ this.rhOutput); + //log.info("recID: "+recID); + this.rhOutput.addRecord(name + "_" + recID, sb.toString(), this.getClass()); + count++; } - // Record info END - sb.append(" \n"); - - // Footer info - sb.append(""); - // Build RDF END - - // Write RDF to RecordHandler - //log.trace("Adding record: " + fixedkey + 
"_" + recID); - //log.trace("data: "+ sb.toString()); - //log.info("rhOutput: "+ this.rhOutput); - //log.info("recID: "+recID); - this.rhOutput.addRecord(name + "_" + recID, sb.toString(), this.getClass()); - count++; + } + } catch(InvalidPathException e){ + log.error("Invalid JsonPath: " + jsonpath); + } catch(Exception e){ + log.error(e.getMessage()); + e.printStackTrace(); + throw new IOException(e); + } + } + + private ArrayList paging() throws IOException { + ArrayList listdata = new ArrayList<>(); + String cursor, url_without_cursor; + int per_page,count,dbTime, pages = 0, i=1; + String jsonString; + JsonPath path, metapath; + boolean displayed = false; + JSONArray resultPart = new JSONArray(); + + JSONObject jsonObject; + +// FileWriter file = new FileWriter("openalex.json"); +// file.write("results:"); + + cursor = (String) this.strAddress.subSequence(this.strAddress.indexOf("cursor=")+7, this.strAddress.length()); + url_without_cursor = (String) this.strAddress.subSequence(0, this.strAddress.indexOf("cursor=")); + log.debug("URL: "+this.strAddress); + + metapath = JsonPath.compile("$.meta"); + + while (cursor != null) { + jsonString = WebAide.getURLContents(url_without_cursor+"cursor="+cursor); + jsonObject = metapath.read(jsonString); + + // get next cursor till if there is a next one + if (jsonObject.get("next_cursor") != null) + cursor = jsonObject.get("next_cursor").toString(); + else + cursor = null; + +// log.debug("Next cursor: "+cursor); + + // get meta informations + if (!displayed) { + dbTime = Integer.parseInt(jsonObject.get("db_response_time_ms").toString()); + log.debug("DB response time [ms]: "+dbTime); + per_page = Integer.parseInt(jsonObject.get("per_page").toString()); + log.debug("Objects per page: "+per_page); + count = Integer.parseInt(jsonObject.get("count").toString()); + log.debug("Total amount of objects: "+count); + pages = (count + per_page - 1) / per_page; + displayed = true; + } + + if (cursor != null) { + log.debug("Page 
Number: "+i+"/"+pages); + // get data + path = JsonPath.compile(this.pathStrings[0]); + resultPart = path.read(jsonString); + for (int k=0; k"); + + // insert field value + if (val instanceof JSONArray) { + log.debug(field+" is an array with "+((JSONArray) val).size()+" elements") ; + XMLTagIndexing xmlTagIndexing = new XMLTagIndexing(); + xmlTagIndexing.setElementNo(0); + arrayHandlingV2(val, sb, xmlTagIndexing, fixedkey); + } if (val instanceof JSONObject) { + log.debug(field+" is an object with "+((JSONObject) val).size()+" elements") ; + objectHandling(val, sb); + } else if (val instanceof String || val instanceof Integer){ + sb.append(SpecialEntities.xmlEncode(val.toString().trim() + .replaceAll("\u201D", "'") + .replaceAll("\u201C","'"))); + } + // Field END + sb.append("\n"); + return sb.toString(); + } + + private void objectHandling(Object val, StringBuffer sb) { + Iterator objectIterator; + + JSONObject jsonObject = (JSONObject) val; + objectIterator = jsonObject.keySet().iterator(); + + while (objectIterator.hasNext()) { + + String key = (String) objectIterator.next(); + Object objVal = jsonObject.get(key); + if (objVal == null) { + objVal = ""; + } + + key = key.replaceAll("/","_"); +// .replaceAll("\\(","_") +// .replaceAll("\\)","_") +// .replaceAll("'","_") +// .replaceAll(",","_") +// .replaceAll(".","_"); + + if (!Character.isDigit(key.charAt(0))) { + + log.debug("field: "+key); +// sb.append(getTagName(field, objVal)); + sb.append(getFieldXml(key, objVal, key)); } } - } catch (InvalidPathException e) { - log.error("Invalid JsonPath: "+ jsonpath); - } catch(Exception e) { - log.error(e.getMessage()); - e.printStackTrace(); - throw new IOException(e); } - } - - public String getFieldXml(String field, Object val) { - StringBuffer sb = new StringBuffer(); - //log.debug("val type for field "+ field +": "+val.getClass().getName()); - sb.append(" <"); - sb.append(SpecialEntities.xmlEncode(field)); - sb.append(">"); - - // insert field value - if (val 
instanceof JSONArray) { - JSONArray array = (JSONArray) val; - log.debug("field is an array: "+ field); - Iterator iter = array.iterator(); - while (iter.hasNext()) { - Object obj = iter.next(); - log.debug("objtype: "+ obj.getClass().getName()); - log.debug("val: "+ array.toString()); - } - } else { - sb.append(SpecialEntities.xmlEncode(val.toString().trim())); - } - // Field END - sb.append("\n"); - return sb.toString(); - } + private void arrayHandlingV2(Object val, StringBuffer sb, XMLTagIndexing xmlTagIndexing, String fixedkey) { + JSONArray array = (JSONArray) val; + sb.append("\n"); + Iterator arrayIterator = array.iterator(); + log.debug("val: "+ val); + while (arrayIterator.hasNext()) { + Object obj = arrayIterator.next(); + + if (!xmlTagIndexing.isArrayIndexOpen()) { + xmlTagIndexing.setArrayIndexOpen(); + String lastChar = fixedkey.substring(fixedkey.length() - 1); + if (lastChar.equals("s")) + xmlTagIndexing.setXmlTagName(StringUtils.chop(fixedkey)); + else + xmlTagIndexing.setXmlTagName(fixedkey); + sb.append(" <"+xmlTagIndexing.getXmlTagName()+"_"+ xmlTagIndexing.getElementNo() +">"); + } + +// log.debug("val: "+ obj); + + if (obj instanceof JSONArray) { + log.debug("there is an JSON Array inside: "+ obj); + XMLTagIndexing xmlArrayIndexing = new XMLTagIndexing(); + xmlArrayIndexing.setElementNo(0); + + arrayHandlingV2(val, sb, xmlArrayIndexing, fixedkey); + } else if (obj instanceof JSONObject) { + log.debug("there is an JSON Object inside: "+ obj); + objectHandling(obj, sb); + } + else { + sb.append(obj.toString() + .replaceAll("&", "&") + .replaceAll("
","") + .replaceAll("<","") + .replaceAll(">","")); + } + if (xmlTagIndexing.isArrayIndexOpen()) { + sb.append("\n"); + xmlTagIndexing.increaseElementNo(); + xmlTagIndexing.setArrayIndexClosed(); + } + } + sb.append(" "); + } + + public String getTagName(String field, Object val) { + StringBuffer sb = new StringBuffer(); + log.debug("val type for tag "+ field +": "+val.getClass().getName()); + sb.append(" <"); + sb.append(SpecialEntities.xmlEncode(field)); + sb.append(">"); + + // insert field value + sb.append(SpecialEntities.xmlEncode(val.toString().trim())); + + // Field END + sb.append("\n"); + return sb.toString(); + } @Override public void writeRecord(String id, String data) throws IOException { diff --git a/src/main/java/org/vivoweb/harvester/util/XMLTagIndexing.java b/src/main/java/org/vivoweb/harvester/util/XMLTagIndexing.java new file mode 100644 index 000000000..d7af42176 --- /dev/null +++ b/src/main/java/org/vivoweb/harvester/util/XMLTagIndexing.java @@ -0,0 +1,40 @@ +package org.vivoweb.harvester.util; + +public class XMLTagIndexing { + + private String xmlTagName = null; + private int elementNo = 0; + private boolean arrayIndexOpen = false; + + public String getXmlTagName() { + return xmlTagName; + } + + public void setXmlTagName(String tagName) { + this.xmlTagName = tagName; + } + + public int getElementNo() { + return elementNo; + } + + public void setElementNo(int elementNo) { + this.elementNo = elementNo; + } + + public void increaseElementNo() { + this.elementNo++; + } + + public boolean isArrayIndexOpen() { + return arrayIndexOpen; + } + + public void setArrayIndexOpen() { + this.arrayIndexOpen = true; + } + + public void setArrayIndexClosed() { + this.arrayIndexOpen = false; + } +}