From 9a638a034f2d353c40758e56ba44bcfaaf16bf5c Mon Sep 17 00:00:00 2001
From: Patrick Meade
Date: Mon, 2 Oct 2023 12:35:23 -0500
Subject: [PATCH] Metrics Reporting with Prometheus (#268)

* Add prometheus metrics reporting
* Add prometheus metrics to LTA components
* Add prometheus metrics to LTA DB
* Reset prometheus client global registry in unit tests
* Convince flake8 to accept horrible import scoping
---
 .github/workflows/wipac_cicd.yml        |  16 ++--
 .github/workflows/wipac_flake8.yml      |  11 +++
 bin/bundler.sh                          |   1 +
 bin/deleter-nersc-return.sh             |   1 +
 bin/desy-move-verifier.sh               |   1 +
 bin/desy-verifier.sh                    |   1 +
 bin/gridftp-replicator.sh               |   1 +
 bin/locator.sh                          |   1 +
 bin/nersc-retriever.sh                  |   1 +
 bin/picker.sh                           |   1 +
 bin/pipe0-bundler.sh                    |   1 +
 bin/pipe0-deleter.sh                    |   1 +
 bin/pipe0-gridftp-replicator.sh         |   1 +
 bin/pipe0-nersc-deleter.sh              |   1 +
 bin/pipe0-nersc-mover.sh                |   1 +
 bin/pipe0-nersc-verifier.sh             |   1 +
 bin/pipe0-rate-limiter.sh               |   1 +
 bin/pipe0-site-move-verifier.sh         |   1 +
 bin/pipe1-deleter.sh                    |   1 +
 bin/pipe1-gridftp-replicator.sh         |   1 +
 bin/pipe1-site-move-verifier.sh         |   1 +
 bin/pipe1-transfer-request-finisher.sh  |   1 +
 bin/pipe1-unpacker.sh                   |   1 +
 bin/pipe2-bundler.sh                    |   1 +
 bin/pipe2-deleter.sh                    |   1 +
 bin/pipe2-desy-site-move-verifier.sh    |   1 +
 bin/pipe2-desy-verifier.sh              |   1 +
 bin/pipe2-gridftp-replicator.sh         |   1 +
 bin/pipe2-rate-limiter.sh               |   1 +
 bin/pipe2-site-move-verifier.sh         |   1 +
 bin/rate-limiter.sh                     |   1 +
 bin/rest-server.sh                      |   1 +
 bin/transfer-request-finisher.sh        |   1 +
 bin/unpacker.sh                         |   1 +
 lta/bundler.py                          |  42 ++++++----
 lta/component.py                        |   1 +
 lta/deleter.py                          |  42 ++++++----
 lta/desy_move_verifier.py               |  42 ++++++----
 lta/desy_verifier.py                    |  42 ++++++----
 lta/gridftp_replicator.py               |  42 ++++++----
 lta/locator.py                          |  42 ++++++----
 lta/nersc_mover.py                      |  42 ++++++----
 lta/nersc_retriever.py                  |  42 ++++++----
 lta/nersc_verifier.py                   |  42 ++++++----
 lta/picker.py                           |  42 ++++++----
 lta/rate_limiter.py                     |  42 ++++++----
 lta/rest_server.py                      | 101 +++++++++++++++++++++++-
 lta/site_move_verifier.py               |  42 ++++++----
 lta/transfer_request_finisher.py        |  42 ++++++----
 lta/unpacker.py                         |  13 +++
 requirements-dev.txt                    |  14 ++--
 requirements-monitoring.txt             |  10 +--
 requirements.txt                        |  10 +--
 tests/test_bundler.py                   |  36 ++++++---
 tests/test_deleter.py                   |  30 +++++--
 tests/test_desy_move_verifier.py        |  32 ++++++--
 tests/test_desy_verifier.py             |  32 ++++++--
 tests/test_locator.py                   |  32 ++++++--
 tests/test_nersc_mover.py               |  34 ++++++--
 tests/test_nersc_retriever.py           |  32 ++++++--
 tests/test_nersc_verifier.py            |  32 ++++++--
 tests/test_picker.py                    |  34 ++++++--
 tests/test_rate_limiter.py              |  32 ++++++--
 tests/test_rest_server.py               |  19 +++--
 tests/test_site_move_verifier.py        |  32 ++++++--
 tests/test_transfer_request_finisher.py |  32 ++++++--
 tests/test_unpacker.py                  |  17 ++++
 67 files changed, 876 insertions(+), 304 deletions(-)
 create mode 100644 .github/workflows/wipac_flake8.yml

diff --git a/.github/workflows/wipac_cicd.yml b/.github/workflows/wipac_cicd.yml
index f37137f..7a72830 100644
--- a/.github/workflows/wipac_cicd.yml
+++ b/.github/workflows/wipac_cicd.yml
@@ -4,14 +4,14 @@
 on: [push]

 jobs:
-  flake8:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: ${{ env.py_version }}
-      - uses: WIPACrepo/wipac-dev-flake8-action@v1.0
+  # flake8:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: ${{ env.py_version }}
+  #     - uses: WIPACrepo/wipac-dev-flake8-action@v1.0

   py-setup:
     runs-on: ubuntu-latest
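The diffs below add the same three module-level metrics to every LTA component and start a prometheus_client HTTP exporter before the work loop runs. A minimal sketch of that pattern, outside the patch: the metric names and label set ('component', 'level', 'type') are copied from the diffs, while the component name 'example' and the do-nothing work function are stand-ins.

import os
import time

from prometheus_client import Counter, Gauge, start_http_server

# same metric families the patch registers in each component module
failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])


def do_work_claim() -> None:
    """Stand-in for claiming and processing one unit of work."""
    try:
        time.sleep(0.1)  # pretend to do some work
        success_counter.labels(component='example', level='bundle', type='work').inc()
    except Exception:
        failure_counter.labels(component='example', level='bundle', type='exception').inc()
        raise


if __name__ == "__main__":
    # expose the metrics endpoint before any work happens, as the diffs below do
    start_http_server(int(os.environ.get("PROMETHEUS_METRICS_PORT", "8080")))
    load_level = 0
    while True:
        do_work_claim()
        load_level += 1
        load_gauge.labels(component='example', level='bundle', type='work').set(load_level)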
diff --git a/.github/workflows/wipac_flake8.yml b/.github/workflows/wipac_flake8.yml
new file mode 100644
index 0000000..1864310
--- /dev/null
+++ b/.github/workflows/wipac_flake8.yml
@@ -0,0 +1,11 @@
+name: 'WIPAC Dev Flake8 (Custom)'
+description: 'GitHub Action Package for Running Flake8'
+runs:
+  using: "composite"
+  steps:
+    - run: pip install --upgrade pip
+      shell: bash
+    - run: pip install flake8
+      shell: bash
+    - run: flake8 . --ignore=E203,E226,E228,E231,E402,E501,W503,W504 --benchmark
+      shell: bash
diff --git a/bin/bundler.sh b/bin/bundler.sh
index 0e598d8..21046b6 100755
--- a/bin/bundler.sh
+++ b/bin/bundler.sh
@@ -14,6 +14,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="created"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/deleter-nersc-return.sh b/bin/deleter-nersc-return.sh
index 8fa830b..5ba3917 100755
--- a/bin/deleter-nersc-return.sh
+++ b/bin/deleter-nersc-return.sh
@@ -15,6 +15,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="source-deleted"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
diff --git a/bin/desy-move-verifier.sh b/bin/desy-move-verifier.sh
index 13dd7b9..6e1c5d6 100644
--- a/bin/desy-move-verifier.sh
+++ b/bin/desy-move-verifier.sh
@@ -15,6 +15,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="taping"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/desy-verifier.sh b/bin/desy-verifier.sh
index 1b213c7..04652a8 100644
--- a/bin/desy-verifier.sh
+++ b/bin/desy-verifier.sh
@@ -12,6 +12,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="completed"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/gridftp-replicator.sh b/bin/gridftp-replicator.sh
index e2808a6..b36d2d6 100755
--- a/bin/gridftp-replicator.sh
+++ b/bin/gridftp-replicator.sh
@@ -16,6 +16,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="transferring"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/locator.sh b/bin/locator.sh
index 7c75b7c..97416b6 100755
--- a/bin/locator.sh
+++ b/bin/locator.sh
@@ -13,6 +13,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="located"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="NERSC"}
diff --git a/bin/nersc-retriever.sh b/bin/nersc-retriever.sh
index 1b79322..9b26625 100755
--- a/bin/nersc-retriever.sh
+++ b/bin/nersc-retriever.sh
@@ -13,6 +13,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="staged"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"
 export RSE_BASE_PATH=${RSE_BASE_PATH:="/global/cfs/cdirs/icecubed"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
diff --git a/bin/picker.sh b/bin/picker.sh
index ac13ec0..c434879 100755
--- a/bin/picker.sh
+++ b/bin/picker.sh
@@ -14,6 +14,7 @@ export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 export MAX_BUNDLE_SIZE=${MAX_BUNDLE_SIZE:="107374182400"} # 100 GiB
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="specified"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe0-bundler.sh b/bin/pipe0-bundler.sh
index cb411a7..a2d75fa 100755
--- a/bin/pipe0-bundler.sh
+++ b/bin/pipe0-bundler.sh
@@ -14,6 +14,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="created"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe0-deleter.sh b/bin/pipe0-deleter.sh
index 39eebb0..6ba0951 100755
--- a/bin/pipe0-deleter.sh
+++ b/bin/pipe0-deleter.sh
@@ -10,6 +10,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="source-deleted"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe0-gridftp-replicator.sh b/bin/pipe0-gridftp-replicator.sh
index 3d3b7d6..ee12d08 100755
--- a/bin/pipe0-gridftp-replicator.sh
+++ b/bin/pipe0-gridftp-replicator.sh
@@ -16,6 +16,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="transferring"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe0-nersc-deleter.sh b/bin/pipe0-nersc-deleter.sh
index 343a8b2..9b050d9 100755
--- a/bin/pipe0-nersc-deleter.sh
+++ b/bin/pipe0-nersc-deleter.sh
@@ -12,6 +12,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="deleted"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="FALSE"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="TRUE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe0-nersc-mover.sh b/bin/pipe0-nersc-mover.sh
index c6190aa..0bcdd06 100755
--- a/bin/pipe0-nersc-mover.sh
+++ b/bin/pipe0-nersc-mover.sh
@@ -13,6 +13,7 @@ export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 export MAX_COUNT=${MAX_COUNT:="2"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="verifying"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RSE_BASE_PATH=${RSE_BASE_PATH:="/global/cfs/cdirs/icecubed"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="FALSE"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="TRUE"}
diff --git a/bin/pipe0-nersc-verifier.sh b/bin/pipe0-nersc-verifier.sh
index befe461..4afc451 100755
--- a/bin/pipe0-nersc-verifier.sh
+++ b/bin/pipe0-nersc-verifier.sh
@@ -15,6 +15,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="completed"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="FALSE"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="TRUE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe0-rate-limiter.sh b/bin/pipe0-rate-limiter.sh
index 696018f..8d96974 100755
--- a/bin/pipe0-rate-limiter.sh
+++ b/bin/pipe0-rate-limiter.sh
@@ -12,6 +12,7 @@ export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 export OUTPUT_PATH=${OUTPUT_PATH:="/data/user/jadelta/ltatemp/bundler_out"}
 export OUTPUT_QUOTA=${OUTPUT_QUOTA:="12094627905536"} # 11 TiB
 export OUTPUT_STATUS=${OUTPUT_STATUS:="staged"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe0-site-move-verifier.sh b/bin/pipe0-site-move-verifier.sh
index b5552a7..5b88488 100755
--- a/bin/pipe0-site-move-verifier.sh
+++ b/bin/pipe0-site-move-verifier.sh
@@ -12,6 +12,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="taping"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="FALSE"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="TRUE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe1-deleter.sh b/bin/pipe1-deleter.sh
index 566338d..57b6f50 100755
--- a/bin/pipe1-deleter.sh
+++ b/bin/pipe1-deleter.sh
@@ -10,6 +10,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="deleted"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="FALSE"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="NERSC"}
diff --git a/bin/pipe1-gridftp-replicator.sh b/bin/pipe1-gridftp-replicator.sh
index 000a965..f67480f 100755
--- a/bin/pipe1-gridftp-replicator.sh
+++ b/bin/pipe1-gridftp-replicator.sh
@@ -17,6 +17,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="transferring"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
diff --git a/bin/pipe1-site-move-verifier.sh b/bin/pipe1-site-move-verifier.sh
index 8f4919c..900f688 100644
--- a/bin/pipe1-site-move-verifier.sh
+++ b/bin/pipe1-site-move-verifier.sh
@@ -10,6 +10,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="unpacking"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="FALSE"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="NERSC"}
diff --git a/bin/pipe1-transfer-request-finisher.sh b/bin/pipe1-transfer-request-finisher.sh
index af730aa..a715083 100755
--- a/bin/pipe1-transfer-request-finisher.sh
+++ b/bin/pipe1-transfer-request-finisher.sh
@@ -9,6 +9,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="finished"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="NERSC"}
diff --git a/bin/pipe1-unpacker.sh b/bin/pipe1-unpacker.sh
index ca9f906..26dfe40 100755
--- a/bin/pipe1-unpacker.sh
+++ b/bin/pipe1-unpacker.sh
@@ -13,6 +13,7 @@ export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="completed"}
 export PATH_MAP_JSON=${PATH_MAP_JSON:="path_map.json"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="FALSE"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="NERSC"}
diff --git a/bin/pipe2-bundler.sh b/bin/pipe2-bundler.sh
index 24a0780..9c2a304 100755
--- a/bin/pipe2-bundler.sh
+++ b/bin/pipe2-bundler.sh
@@ -14,6 +14,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="created"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe2-deleter.sh b/bin/pipe2-deleter.sh
index 374a7c4..8b2d4c1 100755
--- a/bin/pipe2-deleter.sh
+++ b/bin/pipe2-deleter.sh
@@ -10,6 +10,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="deleted"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe2-desy-site-move-verifier.sh b/bin/pipe2-desy-site-move-verifier.sh
index 22a710c..6e86d5f 100644
--- a/bin/pipe2-desy-site-move-verifier.sh
+++ b/bin/pipe2-desy-site-move-verifier.sh
@@ -12,6 +12,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="completed"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe2-desy-verifier.sh b/bin/pipe2-desy-verifier.sh
index 3d251aa..38c0fd1 100755
--- a/bin/pipe2-desy-verifier.sh
+++ b/bin/pipe2-desy-verifier.sh
@@ -12,6 +12,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="completed"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe2-gridftp-replicator.sh b/bin/pipe2-gridftp-replicator.sh
index 51e3237..b4cd92a 100755
--- a/bin/pipe2-gridftp-replicator.sh
+++ b/bin/pipe2-gridftp-replicator.sh
@@ -17,6 +17,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="transferring"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe2-rate-limiter.sh b/bin/pipe2-rate-limiter.sh
index 4c76df6..4c1fa24 100755
--- a/bin/pipe2-rate-limiter.sh
+++ b/bin/pipe2-rate-limiter.sh
@@ -12,6 +12,7 @@ export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 export OUTPUT_PATH=${OUTPUT_PATH:="/mnt/lfss/jade-lta/bundler_todesy"}
 export OUTPUT_QUOTA=${OUTPUT_QUOTA:="2199023255552"} # 2 TiB
 export OUTPUT_STATUS=${OUTPUT_STATUS:="staged"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/pipe2-site-move-verifier.sh b/bin/pipe2-site-move-verifier.sh
index 72992e5..8507dd9 100755
--- a/bin/pipe2-site-move-verifier.sh
+++ b/bin/pipe2-site-move-verifier.sh
@@ -16,6 +16,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="https://lta.icecube.aq:443"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="verifying"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/rate-limiter.sh b/bin/rate-limiter.sh
index 2ec85ca..c33549c 100755
--- a/bin/rate-limiter.sh
+++ b/bin/rate-limiter.sh
@@ -12,6 +12,7 @@ export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 export OUTPUT_PATH=${OUTPUT_PATH:="/mnt/lfss/jade-lta/bundler_out"}
 export OUTPUT_QUOTA=${OUTPUT_QUOTA:="12094627905536"} # 11 TiB
 export OUTPUT_STATUS=${OUTPUT_STATUS:="staged"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
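Every worker script above defaults PROMETHEUS_METRICS_PORT to 8080. The rest-server.sh diff that follows is the one exception (8090): the REST server already binds LTA_REST_PORT=8080, so a metrics exporter on the same port would collide when both run on one host. A quick way to confirm an exporter is up, with an illustrative host and port (not part of the patch):

import urllib.request

with urllib.request.urlopen("http://127.0.0.1:8080/metrics") as response:
    metrics_text = response.read().decode("utf-8")

# show only the LTA-specific series
for line in metrics_text.splitlines():
    if line.startswith("lta_"):
        print(line)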
diff --git a/bin/rest-server.sh b/bin/rest-server.sh
index cec14f7..a54ded9 100755
--- a/bin/rest-server.sh
+++ b/bin/rest-server.sh
@@ -10,5 +10,6 @@ export LTA_MONGODB_PORT=${LTA_MONGODB_PORT:="27017"}
 export LTA_REST_HOST=${LTA_REST_HOST:="127.0.0.1"}
 export LTA_REST_PORT=${LTA_REST_PORT:="8080"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8090"}
 export WIPACTEL_EXPORT_STDOUT=${WIPACTEL_EXPORT_STDOUT:="FALSE"}
 python -m lta.rest_server
diff --git a/bin/transfer-request-finisher.sh b/bin/transfer-request-finisher.sh
index 6f7d5e6..83e1e2b 100755
--- a/bin/transfer-request-finisher.sh
+++ b/bin/transfer-request-finisher.sh
@@ -9,6 +9,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="finished"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="False"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="WIPAC"}
diff --git a/bin/unpacker.sh b/bin/unpacker.sh
index 781464d..0057d35 100755
--- a/bin/unpacker.sh
+++ b/bin/unpacker.sh
@@ -12,6 +12,7 @@ export LTA_AUTH_OPENID_URL=${LTA_AUTH_OPENID_URL:="https://keycloak.icecube.wisc
 export LTA_REST_URL=${LTA_REST_URL:="http://127.0.0.1:8080"}
 # export OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:="https://telemetry.dev.icecube.aq/v1/traces"}
 export OUTPUT_STATUS=${OUTPUT_STATUS:="completed"}
+export PROMETHEUS_METRICS_PORT=${PROMETHEUS_METRICS_PORT:="8080"}
 export RUN_ONCE_AND_DIE=${RUN_ONCE_AND_DIE:="True"}
 export RUN_UNTIL_NO_WORK=${RUN_UNTIL_NO_WORK:="FALSE"}
 export SOURCE_SITE=${SOURCE_SITE:="NERSC"}
diff --git a/lta/bundler.py b/lta/bundler.py
index a37b2be..fa4da31 100644
--- a/lta/bundler.py
+++ b/lta/bundler.py
@@ -11,12 +11,13 @@
 from typing import Any, Dict, Optional
 from zipfile import ZIP_STORED, ZipFile

+from prometheus_client import Counter, Gauge, start_http_server
 from rest_tools.client import ClientCredentialsAuth, RestClient
-from wipac_dev_tools import from_environment
 import wipac_telemetry.tracing_tools as wtt

 from .component import COMMON_CONFIG, Component, now, work_loop
 from .crypto import lta_checksums
+from .lta_tools import from_environment
 from .lta_types import BundleType

 Logger = logging.Logger
@@ -37,6 +38,11 @@
     "WORK_TIMEOUT_SECONDS": "30",
 })

+# prometheus metrics
+failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
+load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
+success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])
+

 class Bundler(Component):
     """
@@ -77,12 +83,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]:
     async def _do_work(self) -> None:
         """Perform a work cycle for this component."""
         self.logger.info("Starting work on Bundles.")
+        load_level = -1
         work_claimed = True
         while work_claimed:
+            load_level += 1
             work_claimed = await self._do_work_claim()
             # if we are configured to run once and die, then die
             if self.run_once_and_die:
                 sys.exit()
+        load_gauge.labels(component='bundler', level='bundle', type='work').set(load_level)
         self.logger.info("Ending work on Bundles.")

     @wtt.spanned()
@@ -114,7 +123,9 @@ async def _do_work_claim(self) -> bool:
         # process the Bundle that we were given
         try:
             await self._do_work_bundle(fc_rc, lta_rc, bundle)
+            success_counter.labels(component='bundler', level='bundle', type='work').inc()
         except Exception as e:
+            failure_counter.labels(component='bundler', level='bundle', type='exception').inc()
             await self._quarantine_bundle(lta_rc, bundle, f"{e}")
             raise e
         # signal the work was processed successfully
@@ -297,12 +308,19 @@ async def _quarantine_bundle(self,
             self.logger.error(f'Unable to quarantine Bundle {bundle["uuid"]}: {e}.')


-def runner() -> None:
+async def main(bundler: Bundler) -> None:
+    """Execute the work loop of the Bundler component."""
+    LOG.info("Starting asynchronous code")
+    await work_loop(bundler)
+    LOG.info("Ending asynchronous code")
+
+
+def main_sync() -> None:
     """Configure a Bundler component from the environment and set it running."""
     # obtain our configuration from the environment
     config = from_environment(EXPECTED_CONFIG)
     # configure logging for the application
-    log_level = getattr(logging, str(config["LOG_LEVEL"]).upper())
+    log_level = getattr(logging, config["LOG_LEVEL"].upper())
     logging.basicConfig(
         format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}",
         level=log_level,
@@ -310,18 +328,14 @@ def runner() -> None:
         style="{",
     )
     # create our Bundler service
-    bundler = Bundler(config, LOG)  # type: ignore[arg-type]
+    LOG.info("Starting synchronous code")
+    bundler = Bundler(config, LOG)
     # let's get to work
-    bundler.logger.info("Adding tasks to asyncio loop")
-    loop = asyncio.get_event_loop()
-    loop.create_task(work_loop(bundler))
-
-
-def main() -> None:
-    """Configure a Bundler component from the environment and set it running."""
-    runner()
-    asyncio.get_event_loop().run_forever()
+    metrics_port = int(config["PROMETHEUS_METRICS_PORT"])
+    start_http_server(metrics_port)
+    asyncio.run(main(bundler))
+    LOG.info("Ending synchronous code")


 if __name__ == "__main__":
-    main()
+    main_sync()
diff --git a/lta/component.py b/lta/component.py
index 23fc16a..0b733ec 100644
--- a/lta/component.py
+++ b/lta/component.py
@@ -25,6 +25,7 @@
     "LTA_AUTH_OPENID_URL": None,
     "LTA_REST_URL": None,
     "OUTPUT_STATUS": None,
+    "PROMETHEUS_METRICS_PORT": "8080",
     "RUN_ONCE_AND_DIE": "False",
     "RUN_UNTIL_NO_WORK": "False",
     "SOURCE_SITE": None,
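Each component module registers Counters and a Gauge under the same names ('lta_failures', 'lta_successes', 'lta_load_level') in prometheus_client's process-global default REGISTRY. That is safe in production, where each component runs as its own process, but a test process that imports more than one module would hit duplicate-registration errors, which is what the commit bullet "Reset prometheus client global registry in unit tests" addresses. The test changes are not part of this excerpt; a sketch of one workable reset, assuming pytest, and noting that _collector_to_names is a private prometheus_client attribute:

import pytest
from prometheus_client import REGISTRY


@pytest.fixture(autouse=True)
def reset_prometheus_registry():
    """Unregister every collector after each test (hypothetical helper)."""
    yield
    # assumption: walking the private _collector_to_names mapping is one way
    # to find everything registered in the default registry
    for collector in list(REGISTRY._collector_to_names):
        REGISTRY.unregister(collector)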
Bundles.") @wtt.spanned() @@ -92,7 +101,9 @@ async def _do_work_claim(self) -> bool: # process the Bundle that we were given try: await self._delete_bundle(lta_rc, bundle) + success_counter.labels(component='deleter', level='bundle', type='work').inc() except Exception as e: + failure_counter.labels(component='deleter', level='bundle', type='exception').inc() await self._quarantine_bundle(lta_rc, bundle, f"{e}") raise e # if we were successful at processing work, let the caller know @@ -140,12 +151,19 @@ async def _quarantine_bundle(self, self.logger.error(f'Unable to quarantine Bundle {bundle["uuid"]}: {e}.') -def runner() -> None: +async def main(deleter: Deleter) -> None: + """Execute the work loop of the Deleter component.""" + LOG.info("Starting asynchronous code") + await work_loop(deleter) + LOG.info("Ending asynchronous code") + + +def main_sync() -> None: """Configure a Deleter component from the environment and set it running.""" # obtain our configuration from the environment config = from_environment(EXPECTED_CONFIG) # configure logging for the application - log_level = getattr(logging, str(config["LOG_LEVEL"]).upper()) + log_level = getattr(logging, config["LOG_LEVEL"].upper()) logging.basicConfig( format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}", level=log_level, @@ -153,18 +171,14 @@ def runner() -> None: style="{", ) # create our Deleter service - deleter = Deleter(config, LOG) # type: ignore[arg-type] + LOG.info("Starting synchronous code") + deleter = Deleter(config, LOG) # let's get to work - deleter.logger.info("Adding tasks to asyncio loop") - loop = asyncio.get_event_loop() - loop.create_task(work_loop(deleter)) - - -def main() -> None: - """Configure a Deleter component from the environment and set it running.""" - runner() - asyncio.get_event_loop().run_forever() + metrics_port = int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) + asyncio.run(main(deleter)) + LOG.info("Ending synchronous code") if __name__ == "__main__": - main() + main_sync() diff --git a/lta/desy_move_verifier.py b/lta/desy_move_verifier.py index 98d2595..ab9c6a8 100644 --- a/lta/desy_move_verifier.py +++ b/lta/desy_move_verifier.py @@ -7,13 +7,14 @@ import sys from typing import Any, Dict, Optional +from prometheus_client import Counter, Gauge, start_http_server from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment import wipac_telemetry.tracing_tools as wtt from .component import COMMON_CONFIG, Component, now, work_loop from .crypto import sha512sum from .joiner import join_smart, join_smart_url +from .lta_tools import from_environment from .lta_types import BundleType from .transfer.globus import SiteGlobusProxy from .transfer.gridftp import GridFTP @@ -31,6 +32,11 @@ "WORKBOX_PATH": None, }) +# prometheus metrics +failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type']) +load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type']) +success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type']) + class DesyMoveVerifier(Component): """ @@ -67,12 +73,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]: async def _do_work(self) -> None: """Perform a work cycle for this component.""" self.logger.info("Starting work on Bundles.") + load_level = -1 work_claimed = True while work_claimed: + load_level += 1 work_claimed = await self._do_work_claim() # if we are configured 
diff --git a/lta/desy_move_verifier.py b/lta/desy_move_verifier.py
index 98d2595..ab9c6a8 100644
--- a/lta/desy_move_verifier.py
+++ b/lta/desy_move_verifier.py
@@ -7,13 +7,14 @@
 import sys
 from typing import Any, Dict, Optional

+from prometheus_client import Counter, Gauge, start_http_server
 from rest_tools.client import ClientCredentialsAuth, RestClient
-from wipac_dev_tools import from_environment
 import wipac_telemetry.tracing_tools as wtt

 from .component import COMMON_CONFIG, Component, now, work_loop
 from .crypto import sha512sum
 from .joiner import join_smart, join_smart_url
+from .lta_tools import from_environment
 from .lta_types import BundleType
 from .transfer.globus import SiteGlobusProxy
 from .transfer.gridftp import GridFTP
@@ -31,6 +32,11 @@
     "WORKBOX_PATH": None,
 })

+# prometheus metrics
+failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
+load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
+success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])
+

 class DesyMoveVerifier(Component):
     """
@@ -67,12 +73,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]:
     async def _do_work(self) -> None:
         """Perform a work cycle for this component."""
         self.logger.info("Starting work on Bundles.")
+        load_level = -1
         work_claimed = True
         while work_claimed:
+            load_level += 1
             work_claimed = await self._do_work_claim()
             # if we are configured to run once and die, then die
             if self.run_once_and_die:
                 sys.exit()
+        load_gauge.labels(component='desy_move_verifier', level='bundle', type='work').set(load_level)
         self.logger.info("Ending work on Bundles.")

     @wtt.spanned()
@@ -99,7 +108,9 @@ async def _do_work_claim(self) -> bool:
         # process the Bundle that we were given
         try:
             await self._verify_bundle(lta_rc, bundle)
+            success_counter.labels(component='desy_move_verifier', level='bundle', type='work').inc()
         except Exception as e:
+            failure_counter.labels(component='desy_move_verifier', level='bundle', type='exception').inc()
             await self._quarantine_bundle(lta_rc, bundle, f"{e}")
             raise e
         # if we were successful at processing work, let the caller know
@@ -179,12 +190,19 @@ async def _verify_bundle(self, lta_rc: RestClient, bundle: BundleType) -> bool:
         return True


-def runner() -> None:
+async def main(desy_move_verifier: DesyMoveVerifier) -> None:
+    """Execute the work loop of the DesyMoveVerifier component."""
+    LOG.info("Starting asynchronous code")
+    await work_loop(desy_move_verifier)
+    LOG.info("Ending asynchronous code")
+
+
+def main_sync() -> None:
     """Configure a DesyMoveVerifier component from the environment and set it running."""
     # obtain our configuration from the environment
     config = from_environment(EXPECTED_CONFIG)
     # configure logging for the application
-    log_level = getattr(logging, str(config["LOG_LEVEL"]).upper())
+    log_level = getattr(logging, config["LOG_LEVEL"].upper())
     logging.basicConfig(
         format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}",
         level=log_level,
@@ -192,18 +210,14 @@ def runner() -> None:
         style="{",
     )
     # create our DesyMoveVerifier service
-    desy_move_verifier = DesyMoveVerifier(config, LOG)  # type: ignore[arg-type]
+    LOG.info("Starting synchronous code")
+    desy_move_verifier = DesyMoveVerifier(config, LOG)
     # let's get to work
-    desy_move_verifier.logger.info("Adding tasks to asyncio loop")
-    loop = asyncio.get_event_loop()
-    loop.create_task(work_loop(desy_move_verifier))
-
-
-def main() -> None:
-    """Configure a DesyMoveVerifier component from the environment and set it running."""
-    runner()
-    asyncio.get_event_loop().run_forever()
+    metrics_port = int(config["PROMETHEUS_METRICS_PORT"])
+    start_http_server(metrics_port)
+    asyncio.run(main(desy_move_verifier))
+    LOG.info("Ending synchronous code")


 if __name__ == "__main__":
-    main()
+    main_sync()
diff --git a/lta/desy_verifier.py b/lta/desy_verifier.py
index 021663b..dcbb05d 100644
--- a/lta/desy_verifier.py
+++ b/lta/desy_verifier.py
@@ -7,12 +7,13 @@
 import sys
 from typing import Any, Dict, Optional

+from prometheus_client import Counter, Gauge, start_http_server
 from rest_tools.client import ClientCredentialsAuth, RestClient
-from wipac_dev_tools import from_environment
 import wipac_telemetry.tracing_tools as wtt

 from .component import COMMON_CONFIG, Component, now, work_loop
 from .joiner import join_smart
+from .lta_tools import from_environment
 from .lta_types import BundleType

 Logger = logging.Logger
@@ -32,6 +33,11 @@
 # maximum number of Metadata UUIDs to work with at a time
 UPDATE_CHUNK_SIZE = 1000

+# prometheus metrics
+failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
+load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
+success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])
+

 class DesyVerifier(Component):
     """
@@ -70,12 +76,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]:
     async def _do_work(self) -> None:
         """Perform a work cycle for this component."""
         self.logger.info("Starting work on Bundles.")
+        load_level = -1
         work_claimed = True
         while work_claimed:
+            load_level += 1
             work_claimed = await self._do_work_claim()
             # if we are configured to run once and die, then die
             if self.run_once_and_die:
                 sys.exit()
+        load_gauge.labels(component='desy_verifier', level='bundle', type='work').set(load_level)
         self.logger.info("Ending work on Bundles.")

     @wtt.spanned()
@@ -103,8 +112,10 @@ async def _do_work_claim(self) -> bool:
         try:
             await self._add_bundle_to_file_catalog(lta_rc, bundle)
             await self._update_bundle_in_lta_db(lta_rc, bundle)
+            success_counter.labels(component='desy_verifier', level='bundle', type='work').inc()
             return True
         except Exception as e:
+            failure_counter.labels(component='desy_verifier', level='bundle', type='exception').inc()
             bundle_id = bundle["uuid"]
             right_now = now()
             patch_body = {
@@ -235,12 +246,19 @@ async def _update_files_in_file_catalog(self,
         return True


-def runner() -> None:
+async def main(desy_verifier: DesyVerifier) -> None:
+    """Execute the work loop of the DesyVerifier component."""
+    LOG.info("Starting asynchronous code")
+    await work_loop(desy_verifier)
+    LOG.info("Ending asynchronous code")
+
+
+def main_sync() -> None:
     """Configure a DesyVerifier component from the environment and set it running."""
     # obtain our configuration from the environment
     config = from_environment(EXPECTED_CONFIG)
     # configure logging for the application
-    log_level = getattr(logging, str(config["LOG_LEVEL"]).upper())
+    log_level = getattr(logging, config["LOG_LEVEL"].upper())
     logging.basicConfig(
         format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}",
         level=log_level,
@@ -248,18 +266,14 @@ def runner() -> None:
         style="{",
     )
     # create our DesyVerifier service
-    desy_verifier = DesyVerifier(config, LOG)  # type: ignore[arg-type]
+    LOG.info("Starting synchronous code")
+    desy_verifier = DesyVerifier(config, LOG)
     # let's get to work
-    desy_verifier.logger.info("Adding tasks to asyncio loop")
-    loop = asyncio.get_event_loop()
-    loop.create_task(work_loop(desy_verifier))
-
-
-def main() -> None:
-    """Configure a DesyVerifier component from the environment and set it running."""
-    runner()
-    asyncio.get_event_loop().run_forever()
+    metrics_port = int(config["PROMETHEUS_METRICS_PORT"])
+    start_http_server(metrics_port)
+    asyncio.run(main(desy_verifier))
+    LOG.info("Ending synchronous code")


 if __name__ == "__main__":
-    main()
+    main_sync()
diff --git a/lta/gridftp_replicator.py b/lta/gridftp_replicator.py
index d6be8fe..482c356 100644
--- a/lta/gridftp_replicator.py
+++ b/lta/gridftp_replicator.py
@@ -8,12 +8,13 @@
 import sys
 from typing import Any, Dict, Optional

+from prometheus_client import Counter, Gauge, start_http_server
 from rest_tools.client import ClientCredentialsAuth, RestClient
-from wipac_dev_tools import from_environment
 import wipac_telemetry.tracing_tools as wtt

 from .component import COMMON_CONFIG, Component, now, work_loop
 from .joiner import join_smart_url
+from .lta_tools import from_environment
 from .lta_types import BundleType
 from .rest_server import boolify
 from .transfer.globus import SiteGlobusProxy
@@ -38,6 +39,11 @@
     "WORK_TIMEOUT_SECONDS": "30",
 })

+# prometheus metrics
+failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
+load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
+success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])
+

 class GridFTPReplicator(Component):
     """
@@ -79,12 +85,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]:
     async def _do_work(self) -> None:
         """Perform a work cycle for this component."""
         self.logger.info("Starting work on Bundles.")
+        load_level = -1
         work_claimed = True
         while work_claimed:
+            load_level += 1
             work_claimed = await self._do_work_claim()
             # if we are configured to run once and die, then die
             if self.run_once_and_die:
                 sys.exit()
+        load_gauge.labels(component='gridftp_replicator', level='bundle', type='work').set(load_level)
         self.logger.info("Ending work on Bundles.")

     @wtt.spanned()
@@ -111,7 +120,9 @@ async def _do_work_claim(self) -> bool:
         # process the Bundle that we were given
         try:
             await self._replicate_bundle_to_destination_site(lta_rc, bundle)
+            success_counter.labels(component='gridftp_replicator', level='bundle', type='work').inc()
         except Exception as e:
+            failure_counter.labels(component='gridftp_replicator', level='bundle', type='exception').inc()
             await self._quarantine_bundle(lta_rc, bundle, f"{e}")
             return False
         # if we were successful at processing work, let the caller know
@@ -173,12 +184,19 @@ async def _replicate_bundle_to_destination_site(self, lta_rc: RestClient, bundle
         await lta_rc.request('PATCH', f'/Bundles/{bundle_id}', patch_body)


-def runner() -> None:
+async def main(gridftp_replicator: GridFTPReplicator) -> None:
+    """Execute the work loop of the GridFTPReplicator component."""
+    LOG.info("Starting asynchronous code")
+    await work_loop(gridftp_replicator)
+    LOG.info("Ending asynchronous code")
+
+
+def main_sync() -> None:
     """Configure a GridFTPReplicator component from the environment and set it running."""
     # obtain our configuration from the environment
     config = from_environment(EXPECTED_CONFIG)
     # configure logging for the application
-    log_level = getattr(logging, str(config["LOG_LEVEL"]).upper())
+    log_level = getattr(logging, config["LOG_LEVEL"].upper())
     logging.basicConfig(
         format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}",
         level=log_level,
@@ -186,18 +204,14 @@ def runner() -> None:
         style="{",
     )
     # create our GridFTPReplicator service
-    replicator = GridFTPReplicator(config, LOG)  # type: ignore[arg-type]
+    LOG.info("Starting synchronous code")
+    gridftp_replicator = GridFTPReplicator(config, LOG)
     # let's get to work
-    replicator.logger.info("Adding tasks to asyncio loop")
-    loop = asyncio.get_event_loop()
-    loop.create_task(work_loop(replicator))
-
-
-def main() -> None:
-    """Configure a GridFTPReplicator component from the environment and set it running."""
-    runner()
-    asyncio.get_event_loop().run_forever()
+    metrics_port = int(config["PROMETHEUS_METRICS_PORT"])
+    start_http_server(metrics_port)
+    asyncio.run(main(gridftp_replicator))
+    LOG.info("Ending synchronous code")


 if __name__ == "__main__":
-    main()
+    main_sync()
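Every module above also swaps `from wipac_dev_tools import from_environment` for a project-local `from .lta_tools import from_environment`; lta/lta_tools.py itself is not in this excerpt (the "import scoping" bullet in the commit message suggests why it moved). A minimal stand-in consistent with how it is called here, where a None default marks a required variable:

import os
from typing import Dict, Optional


def from_environment(expected_config: Dict[str, Optional[str]]) -> Dict[str, str]:
    """Resolve configuration from os.environ; None defaults are required keys."""
    config = {}
    for key, default in expected_config.items():
        value = os.environ.get(key, default)
        if value is None:
            raise OSError(f"missing required environment variable: {key}")
        config[key] = value
    return config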
"WORK_TIMEOUT_SECONDS": "30", }) +# prometheus metrics +failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type']) +load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type']) +success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type']) + def as_lta_record(catalog_record: Dict[str, Any]) -> Dict[str, Any]: """Cherry pick keys from a File Catalog record to include in Bundle metadata.""" @@ -90,12 +96,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]: async def _do_work(self) -> None: """Perform a work cycle for this component.""" self.logger.info("Starting work on TransferRequests.") + load_level = -1 work_claimed = True while work_claimed: + load_level += 1 work_claimed = await self._do_work_claim() # if we are configured to run once and die, then die if self.run_once_and_die: sys.exit() + load_gauge.labels(component='locator', level='transfer_request', type='work').set(load_level) self.logger.info("Ending work on TransferRequests.") @wtt.spanned() @@ -122,7 +131,9 @@ async def _do_work_claim(self) -> bool: # process the TransferRequest that we were given try: await self._do_work_transfer_request(lta_rc, tr) + success_counter.labels(component='locator', level='transfer_request', type='work').inc() except Exception as e: + failure_counter.labels(component='locator', level='transfer_request', type='exception').inc() await self._quarantine_transfer_request(lta_rc, tr, f"{e}") raise e # if we were successful at processing work, let the caller know @@ -278,12 +289,19 @@ def _reduce_unique_archive_uuid(self, return bundle_uuids -def runner() -> None: +async def main(locator: Locator) -> None: + """Execute the work loop of the Locator component.""" + LOG.info("Starting asynchronous code") + await work_loop(locator) + LOG.info("Ending asynchronous code") + + +def main_sync() -> None: """Configure a Locator component from the environment and set it running.""" # obtain our configuration from the environment config = from_environment(EXPECTED_CONFIG) # configure logging for the application - log_level = getattr(logging, str(config["LOG_LEVEL"]).upper()) + log_level = getattr(logging, config["LOG_LEVEL"].upper()) logging.basicConfig( format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}", level=log_level, @@ -291,18 +309,14 @@ def runner() -> None: style="{", ) # create our Locator service - locator = Locator(config, LOG) # type: ignore[arg-type] + LOG.info("Starting synchronous code") + locator = Locator(config, LOG) # let's get to work - locator.logger.info("Adding tasks to asyncio loop") - loop = asyncio.get_event_loop() - loop.create_task(work_loop(locator)) - - -def main() -> None: - """Configure a Locator component from the environment and set it running.""" - runner() - asyncio.get_event_loop().run_forever() + metrics_port = int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) + asyncio.run(main(locator)) + LOG.info("Ending synchronous code") if __name__ == "__main__": - main() + main_sync() diff --git a/lta/nersc_mover.py b/lta/nersc_mover.py index 05aa13d..c5eaf92 100644 --- a/lta/nersc_mover.py +++ b/lta/nersc_mover.py @@ -8,11 +8,12 @@ import sys from typing import Any, Dict, List, Optional +from prometheus_client import Counter, Gauge, start_http_server from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment import wipac_telemetry.tracing_tools as wtt from 
diff --git a/lta/nersc_mover.py b/lta/nersc_mover.py
index 05aa13d..c5eaf92 100644
--- a/lta/nersc_mover.py
+++ b/lta/nersc_mover.py
@@ -8,11 +8,12 @@
 import sys
 from typing import Any, Dict, List, Optional

+from prometheus_client import Counter, Gauge, start_http_server
 from rest_tools.client import ClientCredentialsAuth, RestClient
-from wipac_dev_tools import from_environment
 import wipac_telemetry.tracing_tools as wtt

 from .component import COMMON_CONFIG, Component, now, work_loop
+from .lta_tools import from_environment
 from .lta_types import BundleType

 Logger = logging.Logger
@@ -29,6 +30,11 @@
     "WORK_TIMEOUT_SECONDS": "30",
 })

+# prometheus metrics
+failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
+load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
+success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])
+

 class NerscMover(Component):
     """
@@ -77,12 +83,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]:
     async def _do_work(self) -> None:
         """Perform a work cycle for this component."""
         self.logger.info("Starting work on Bundles.")
+        load_level = -1
         work_claimed = True
         while work_claimed:
+            load_level += 1
             work_claimed = await self._do_work_claim()
             # if we are configured to run once and die, then die
             if self.run_once_and_die:
                 sys.exit()
+        load_gauge.labels(component='nersc_mover', level='bundle', type='work').set(load_level)
         self.logger.info("Ending work on Bundles.")

     @wtt.spanned()
@@ -117,8 +126,10 @@ async def _do_work_claim(self) -> bool:
         # process the Bundle that we were given
         try:
             await self._write_bundle_to_hpss(lta_rc, bundle)
+            success_counter.labels(component='nersc_mover', level='bundle', type='work').inc()
             return True
         except Exception as e:
+            failure_counter.labels(component='nersc_mover', level='bundle', type='exception').inc()
             bundle_id = bundle["uuid"]
             right_now = now()
             patch_body = {
@@ -193,12 +204,19 @@ async def _execute_hsi_command(self, lta_rc: RestClient, bundle: BundleType, arg
         return True


-def runner() -> None:
+async def main(nersc_mover: NerscMover) -> None:
+    """Execute the work loop of the NerscMover component."""
+    LOG.info("Starting asynchronous code")
+    await work_loop(nersc_mover)
+    LOG.info("Ending asynchronous code")
+
+
+def main_sync() -> None:
     """Configure a NerscMover component from the environment and set it running."""
     # obtain our configuration from the environment
     config = from_environment(EXPECTED_CONFIG)
     # configure logging for the application
-    log_level = getattr(logging, str(config["LOG_LEVEL"]).upper())
+    log_level = getattr(logging, config["LOG_LEVEL"].upper())
     logging.basicConfig(
         format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}",
         level=log_level,
@@ -206,18 +224,14 @@ def runner() -> None:
         style="{",
     )
     # create our NerscMover service
-    nersc_mover = NerscMover(config, LOG)  # type: ignore[arg-type]
+    LOG.info("Starting synchronous code")
+    nersc_mover = NerscMover(config, LOG)
     # let's get to work
-    nersc_mover.logger.info("Adding tasks to asyncio loop")
-    loop = asyncio.get_event_loop()
-    loop.create_task(work_loop(nersc_mover))
-
-
-def main() -> None:
-    """Configure a NerscMover component from the environment and set it running."""
-    runner()
-    asyncio.get_event_loop().run_forever()
+    metrics_port = int(config["PROMETHEUS_METRICS_PORT"])
+    start_http_server(metrics_port)
+    asyncio.run(main(nersc_mover))
+    LOG.info("Ending synchronous code")


 if __name__ == "__main__":
-    main()
+    main_sync()
diff --git a/lta/nersc_retriever.py b/lta/nersc_retriever.py
index 2777388..bace599 100644
--- a/lta/nersc_retriever.py
+++ b/lta/nersc_retriever.py
@@ -8,11 +8,12 @@
 import sys
 from typing import Any, Dict, List, Optional

+from prometheus_client import Counter, Gauge, start_http_server
 from rest_tools.client import ClientCredentialsAuth, RestClient
-from wipac_dev_tools import from_environment
 import wipac_telemetry.tracing_tools as wtt

 from .component import COMMON_CONFIG, Component, now, work_loop
+from .lta_tools import from_environment
 from .lta_types import BundleType

 Logger = logging.Logger
@@ -28,6 +29,11 @@
     "WORK_TIMEOUT_SECONDS": "30",
 })

+# prometheus metrics
+failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
+load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
+success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])
+

 class NerscRetriever(Component):
     """
@@ -75,12 +81,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]:
     async def _do_work(self) -> None:
         """Perform a work cycle for this component."""
         self.logger.info("Starting work on Bundles.")
+        load_level = -1
         work_claimed = True
         while work_claimed:
+            load_level += 1
             work_claimed = await self._do_work_claim()
             # if we are configured to run once and die, then die
             if self.run_once_and_die:
                 sys.exit()
+        load_gauge.labels(component='nersc_retriever', level='bundle', type='work').set(load_level)
         self.logger.info("Ending work on Bundles.")

     @wtt.spanned()
@@ -115,8 +124,10 @@ async def _do_work_claim(self) -> bool:
         # process the Bundle that we were given
         try:
             await self._read_bundle_from_hpss(lta_rc, bundle)
+            success_counter.labels(component='nersc_retriever', level='bundle', type='work').inc()
             return True
         except Exception as e:
+            failure_counter.labels(component='nersc_retriever', level='bundle', type='exception').inc()
             bundle_id = bundle["uuid"]
             right_now = now()
             patch_body = {
@@ -183,12 +194,19 @@ async def _execute_hsi_command(self, lta_rc: RestClient, bundle: BundleType, arg
         return True


-def runner() -> None:
+async def main(nersc_retriever: NerscRetriever) -> None:
+    """Execute the work loop of the NerscRetriever component."""
+    LOG.info("Starting asynchronous code")
+    await work_loop(nersc_retriever)
+    LOG.info("Ending asynchronous code")
+
+
+def main_sync() -> None:
     """Configure a NerscRetriever component from the environment and set it running."""
     # obtain our configuration from the environment
     config = from_environment(EXPECTED_CONFIG)
     # configure logging for the application
-    log_level = getattr(logging, str(config["LOG_LEVEL"]).upper())
+    log_level = getattr(logging, config["LOG_LEVEL"].upper())
     logging.basicConfig(
         format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}",
         level=log_level,
@@ -196,18 +214,14 @@ def runner() -> None:
         style="{",
     )
     # create our NerscRetriever service
-    nersc_retriever = NerscRetriever(config, LOG)  # type: ignore[arg-type]
+    LOG.info("Starting synchronous code")
+    nersc_retriever = NerscRetriever(config, LOG)
     # let's get to work
-    nersc_retriever.logger.info("Adding tasks to asyncio loop")
-    loop = asyncio.get_event_loop()
-    loop.create_task(work_loop(nersc_retriever))
-
-
-def main() -> None:
-    """Configure a NerscRetriever component from the environment and set it running."""
-    runner()
-    asyncio.get_event_loop().run_forever()
+    metrics_port = int(config["PROMETHEUS_METRICS_PORT"])
+    start_http_server(metrics_port)
+    asyncio.run(main(nersc_retriever))
+    LOG.info("Ending synchronous code")


 if __name__ == "__main__":
-    main()
+    main_sync()
diff --git a/lta/nersc_verifier.py b/lta/nersc_verifier.py
index ad8c1e4..46652bd 100644
--- a/lta/nersc_verifier.py
+++ b/lta/nersc_verifier.py
@@ -8,11 +8,12 @@
 import sys
 from typing import Any, Dict, Optional

+from prometheus_client import Counter, Gauge, start_http_server
 from rest_tools.client import ClientCredentialsAuth, RestClient
-from wipac_dev_tools import from_environment
 import wipac_telemetry.tracing_tools as wtt

 from .component import COMMON_CONFIG, Component, now, work_loop
+from .lta_tools import from_environment
 from .lta_types import BundleType

 Logger = logging.Logger
@@ -33,6 +34,11 @@
 # maximum number of Metadata UUIDs to work with at a time
 UPDATE_CHUNK_SIZE = 1000

+# prometheus metrics
+failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type'])
+load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type'])
+success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type'])
+

 class NerscVerifier(Component):
     """
@@ -82,12 +88,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]:
     async def _do_work(self) -> None:
         """Perform a work cycle for this component."""
         self.logger.info("Starting work on Bundles.")
+        load_level = -1
         work_claimed = True
         while work_claimed:
+            load_level += 1
             work_claimed = await self._do_work_claim()
             # if we are configured to run once and die, then die
             if self.run_once_and_die:
                 sys.exit()
+        load_gauge.labels(component='nersc_verifier', level='bundle', type='work').set(load_level)
         self.logger.info("Ending work on Bundles.")

     @wtt.spanned()
@@ -124,8 +133,10 @@ async def _do_work_claim(self) -> bool:
             if await self._verify_bundle_in_hpss(lta_rc, bundle):
                 await self._add_bundle_to_file_catalog(lta_rc, bundle)
                 await self._update_bundle_in_lta_db(lta_rc, bundle)
+                success_counter.labels(component='nersc_verifier', level='bundle', type='work').inc()
             return True
         except Exception as e:
+            failure_counter.labels(component='nersc_verifier', level='bundle', type='exception').inc()
             bundle_uuid = bundle["uuid"]
             right_now = now()
             patch_body = {
@@ -368,12 +379,19 @@ async def _verify_bundle_in_hpss(self, lta_rc: RestClient, bundle: BundleType) -
         return True


-def runner() -> None:
+async def main(nersc_verifier: NerscVerifier) -> None:
+    """Execute the work loop of the NerscVerifier component."""
+    LOG.info("Starting asynchronous code")
+    await work_loop(nersc_verifier)
+    LOG.info("Ending asynchronous code")
+
+
+def main_sync() -> None:
     """Configure a NerscVerifier component from the environment and set it running."""
     # obtain our configuration from the environment
     config = from_environment(EXPECTED_CONFIG)
     # configure logging for the application
-    log_level = getattr(logging, str(config["LOG_LEVEL"]).upper())
+    log_level = getattr(logging, config["LOG_LEVEL"].upper())
     logging.basicConfig(
         format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}",
         level=log_level,
@@ -381,18 +399,14 @@ def runner() -> None:
         style="{",
     )
     # create our NerscVerifier service
-    nersc_verifier = NerscVerifier(config, LOG)  # type: ignore[arg-type]
+    LOG.info("Starting synchronous code")
+    nersc_verifier = NerscVerifier(config, LOG)
     # let's get to work
-    nersc_verifier.logger.info("Adding tasks to asyncio loop")
-    loop = asyncio.get_event_loop()
-    loop.create_task(work_loop(nersc_verifier))
-
-
-def main() -> None:
-    """Configure a NerscVerifier component from the environment and set it running."""
-    runner()
-    asyncio.get_event_loop().run_forever()
+    metrics_port = int(config["PROMETHEUS_METRICS_PORT"])
+    start_http_server(metrics_port)
+    asyncio.run(main(nersc_verifier))
+    LOG.info("Ending synchronous code")


 if __name__ == "__main__":
-    main()
+    main_sync()
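In each _do_work above, load_level starts at -1 and is incremented once per claim attempt, so the gauge ends a cycle holding the number of work items claimed (0 when the first claim comes back empty). The counters and gauge can be read back in-process with prometheus_client's get_sample_value, here with the picker's labels as an example:

from prometheus_client import REGISTRY

claimed = REGISTRY.get_sample_value(
    'lta_load_level',
    {'component': 'picker', 'level': 'transfer_request', 'type': 'work'},
)
# counters expose a _total sample, e.g. 'lta_successes_total'
succeeded = REGISTRY.get_sample_value(
    'lta_successes_total',
    {'component': 'picker', 'level': 'transfer_request', 'type': 'work'},
)
print(f"claimed last cycle: {claimed}, succeeded since start: {succeeded}")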
100644 --- a/lta/picker.py +++ b/lta/picker.py @@ -8,11 +8,12 @@ from typing import Any, Dict, List, Optional, Tuple from binpacking import to_constant_volume # type: ignore +from prometheus_client import Counter, Gauge, start_http_server from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment import wipac_telemetry.tracing_tools as wtt from .component import COMMON_CONFIG, Component, now, work_loop +from .lta_tools import from_environment from .lta_types import BundleType, TransferRequestType Logger = logging.Logger @@ -33,6 +34,11 @@ "WORK_TIMEOUT_SECONDS": "30", }) +# prometheus metrics +failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type']) +load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type']) +success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type']) + class Picker(Component): """ @@ -72,12 +78,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]: async def _do_work(self) -> None: """Perform a work cycle for this component.""" self.logger.info("Starting work on TransferRequests.") + load_level = -1 work_claimed = True while work_claimed: + load_level += 1 work_claimed = await self._do_work_claim() # if we are configured to run once and die, then die if self.run_once_and_die: sys.exit() + load_gauge.labels(component='picker', level='transfer_request', type='work').set(load_level) self.logger.info("Ending work on TransferRequests.") @wtt.spanned() @@ -104,7 +113,9 @@ async def _do_work_claim(self) -> bool: # process the TransferRequest that we were given try: await self._do_work_transfer_request(lta_rc, tr) + success_counter.labels(component='picker', level='transfer_request', type='work').inc() except Exception as e: + failure_counter.labels(component='picker', level='transfer_request', type='exception').inc() self.logger.info(f"There was an error while processing the transfer request: {e}") self.logger.info("Will now attempt to send the transfer request to 'quarantined' status.") await self._quarantine_transfer_request(lta_rc, tr, f"{e}") @@ -239,12 +250,19 @@ async def _quarantine_transfer_request(self, self.logger.error(f'Unable to quarantine TransferRequest {tr["uuid"]}: {e}.') -def runner() -> None: +async def main(picker: Picker) -> None: + """Execute the work loop of the Picker component.""" + LOG.info("Starting asynchronous code") + await work_loop(picker) + LOG.info("Ending asynchronous code") + + +def main_sync() -> None: """Configure a Picker component from the environment and set it running.""" # obtain our configuration from the environment config = from_environment(EXPECTED_CONFIG) # configure logging for the application - log_level = getattr(logging, str(config["LOG_LEVEL"]).upper()) + log_level = getattr(logging, config["LOG_LEVEL"].upper()) logging.basicConfig( format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}", level=log_level, @@ -252,18 +270,14 @@ def runner() -> None: style="{", ) # create our Picker service - picker = Picker(config, LOG) # type: ignore[arg-type] + LOG.info("Starting synchronous code") + picker = Picker(config, LOG) # let's get to work - picker.logger.info("Adding tasks to asyncio loop") - loop = asyncio.get_event_loop() - loop.create_task(work_loop(picker)) - - -def main() -> None: - """Configure a Picker component from the environment and set it running.""" - runner() - asyncio.get_event_loop().run_forever() + metrics_port = 
int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) + asyncio.run(main(picker)) + LOG.info("Ending synchronous code") if __name__ == "__main__": - main() + main_sync() diff --git a/lta/rate_limiter.py b/lta/rate_limiter.py index caf9cc2..40eb470 100644 --- a/lta/rate_limiter.py +++ b/lta/rate_limiter.py @@ -8,11 +8,12 @@ import sys from typing import Any, Dict, List, Optional, Tuple +from prometheus_client import Counter, Gauge, start_http_server from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment import wipac_telemetry.tracing_tools as wtt from .component import COMMON_CONFIG, Component, now, work_loop +from .lta_tools import from_environment from .lta_types import BundleType Logger = logging.Logger @@ -28,6 +29,11 @@ "WORK_TIMEOUT_SECONDS": "30", }) +# prometheus metrics +failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type']) +load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type']) +success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type']) + class RateLimiter(Component): """ @@ -95,12 +101,15 @@ def _get_files_and_size(self, path: str) -> Tuple[List[str], int]: async def _do_work(self) -> None: """Perform a work cycle for this component.""" self.logger.info("Starting work on Bundles.") + load_level = -1 work_claimed = True while work_claimed: + load_level += 1 work_claimed = await self._do_work_claim() # if we are configured to run once and die, then die if self.run_once_and_die: sys.exit() + load_gauge.labels(component='rate_limiter', level='bundle', type='work').set(load_level) self.logger.info("Ending work on Bundles.") @wtt.spanned() @@ -127,7 +136,9 @@ async def _do_work_claim(self) -> bool: # process the Bundle that we were given try: await self._stage_bundle(lta_rc, bundle) + success_counter.labels(component='rate_limiter', level='bundle', type='work').inc() except Exception as e: + failure_counter.labels(component='rate_limiter', level='bundle', type='exception').inc() await self._quarantine_bundle(lta_rc, bundle, f"{e}") raise e # even if we were successful, take a break between bundles @@ -207,12 +218,19 @@ async def _unclaim_bundle(self, lta_rc: RestClient, bundle: BundleType) -> bool: return True -def runner() -> None: +async def main(rate_limiter: RateLimiter) -> None: + """Execute the work loop of the RateLimiter component.""" + LOG.info("Starting asynchronous code") + await work_loop(rate_limiter) + LOG.info("Ending asynchronous code") + + +def main_sync() -> None: """Configure a RateLimiter component from the environment and set it running.""" # obtain our configuration from the environment config = from_environment(EXPECTED_CONFIG) # configure logging for the application - log_level = getattr(logging, str(config["LOG_LEVEL"]).upper()) + log_level = getattr(logging, config["LOG_LEVEL"].upper()) logging.basicConfig( format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}", level=log_level, @@ -220,18 +238,14 @@ def runner() -> None: style="{", ) # create our RateLimiter service - rate_limiter = RateLimiter(config, LOG) # type: ignore[arg-type] + LOG.info("Starting synchronous code") + rate_limiter = RateLimiter(config, LOG) # let's get to work - rate_limiter.logger.info("Adding tasks to asyncio loop") - loop = asyncio.get_event_loop() - loop.create_task(work_loop(rate_limiter)) - - -def main() -> None: - """Configure a RateLimiter component from 
the environment and set it running.""" - runner() - asyncio.get_event_loop().run_forever() + metrics_port = int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) + asyncio.run(main(rate_limiter)) + LOG.info("Ending synchronous code") if __name__ == "__main__": - main() + main_sync() diff --git a/lta/rest_server.py b/lta/rest_server.py index 2abca5b..6ec1ea1 100644 --- a/lta/rest_server.py +++ b/lta/rest_server.py @@ -14,6 +14,7 @@ from uuid import uuid1 from motor.motor_tornado import MotorClient, MotorDatabase # type: ignore +from prometheus_client import Counter, start_http_server import pymongo from pymongo import MongoClient from rest_tools.utils.json_util import json_decode @@ -37,8 +38,11 @@ 'LTA_MONGODB_PORT': '27017', 'LTA_REST_HOST': 'localhost', 'LTA_REST_PORT': '8080', + 'PROMETHEUS_METRICS_PORT': '8090', } +LOG = logging.getLogger(__name__) + # ----------------------------------------------------------------------------- AFTER = pymongo.ReturnDocument.AFTER @@ -53,6 +57,12 @@ # ----------------------------------------------------------------------------- +# prometheus metrics +request_counter = Counter('lta_requests', 'LTA DB requests', ['method', 'route']) +response_counter = Counter('lta_responses', 'LTA DB responses', ['method', 'response', 'route']) + +# ----------------------------------------------------------------------------- + if bool(os.environ.get('CI_TEST_ENV', False)): def lta_auth(**_auth: Any) -> Callable[..., Any]: def make_wrapper(method: Callable[..., Any]) -> Any: @@ -112,12 +122,16 @@ class BundlesActionsBulkCreateHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /Bundles/actions/bulk_create.""" + request_counter.labels(method='POST', route='/Bundles/actions/bulk_create').inc() req = json_decode(self.request.body) if 'bundles' not in req: + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_create').inc() raise tornado.web.HTTPError(400, reason="missing bundles field") if not isinstance(req['bundles'], list): + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_create').inc() raise tornado.web.HTTPError(400, reason="bundles field is not a list") if not req['bundles']: + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_create').inc() raise tornado.web.HTTPError(400, reason="bundles field is empty") for xfer_bundle in req["bundles"]: @@ -141,6 +155,7 @@ async def post(self) -> None: self.set_status(201) self.write({'bundles': uuids, 'count': create_count}) + response_counter.labels(method='POST', response='201', route='/Bundles/actions/bulk_create').inc() class BundlesActionsBulkDeleteHandler(BaseLTAHandler): @@ -149,12 +164,16 @@ class BundlesActionsBulkDeleteHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /Bundles/actions/bulk_delete.""" + request_counter.labels(method='POST', route='/Bundles/actions/bulk_delete').inc() req = json_decode(self.request.body) if 'bundles' not in req: + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_delete').inc() raise tornado.web.HTTPError(400, reason="missing bundles field") if not isinstance(req['bundles'], list): + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_delete').inc() raise tornado.web.HTTPError(400, reason="bundles field is not a list") if not req['bundles']: + 
response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_delete').inc() raise tornado.web.HTTPError(400, reason="bundles field is empty") results = [] @@ -168,6 +187,7 @@ async def post(self) -> None: results.append(uuid) self.write({'bundles': results, 'count': len(results)}) + response_counter.labels(method='POST', response='200', route='/Bundles/actions/bulk_delete').inc() class BundlesActionsBulkUpdateHandler(BaseLTAHandler): @@ -176,16 +196,22 @@ class BundlesActionsBulkUpdateHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /Bundles/actions/bulk_update.""" + request_counter.labels(method='POST', route='/Bundles/actions/bulk_update').inc() req = json_decode(self.request.body) if 'update' not in req: + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_update').inc() raise tornado.web.HTTPError(400, reason="missing update field") if not isinstance(req['update'], dict): + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_update').inc() raise tornado.web.HTTPError(400, reason="update field is not an object") if 'bundles' not in req: + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_update').inc() raise tornado.web.HTTPError(400, reason="missing bundles field") if not isinstance(req['bundles'], list): + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_update').inc() raise tornado.web.HTTPError(400, reason="bundles field is not a list") if not req['bundles']: + response_counter.labels(method='POST', response='400', route='/Bundles/actions/bulk_update').inc() raise tornado.web.HTTPError(400, reason="bundles field is empty") results = [] @@ -200,6 +226,7 @@ async def post(self) -> None: results.append(uuid) self.write({'bundles': results, 'count': len(results)}) + response_counter.labels(method='POST', response='200', route='/Bundles/actions/bulk_update').inc() class BundlesHandler(BaseLTAHandler): @@ -208,6 +235,7 @@ class BundlesHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def get(self) -> None: """Handle GET /Bundles.""" + request_counter.labels(method='GET', route='/Bundles').inc() location = self.get_query_argument("location", default=None) request = self.get_query_argument("request", default=None) status = self.get_query_argument("status", default=None) @@ -241,6 +269,7 @@ async def get(self) -> None: 'results': results, } self.write(ret) + response_counter.labels(method='GET', response='200', route='/Bundles').inc() class BundlesActionsPopHandler(BaseLTAHandler): @@ -249,13 +278,16 @@ class BundlesActionsPopHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /Bundles/actions/pop.""" + request_counter.labels(method='POST', route='/Bundles/actions/pop').inc() dest: Optional[str] = self.get_argument('dest', default=None) source: Optional[str] = self.get_argument('source', default=None) status: str = self.get_argument('status') if (not dest) and (not source): + response_counter.labels(method='POST', response='400', route='/Bundles/actions/pop').inc() raise tornado.web.HTTPError(400, reason="missing source and dest fields") pop_body = json_decode(self.request.body) if 'claimant' not in pop_body: + response_counter.labels(method='POST', response='400', route='/Bundles/actions/pop').inc() raise tornado.web.HTTPError(400, reason="missing
claimant field") claimant = pop_body["claimant"] # find and claim a bundle for the specified source @@ -290,6 +322,7 @@ async def post(self) -> None: else: logging.info(f"Bundle {bundle['uuid']} claimed by {claimant}") self.write({'bundle': bundle}) + response_counter.labels(method='GET', response='200', route='/Bundles/actions/pop').inc() class BundlesSingleHandler(BaseLTAHandler): @@ -298,6 +331,7 @@ class BundlesSingleHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def get(self, bundle_id: str) -> None: """Handle GET /Bundles/{uuid}.""" + request_counter.labels(method='GET', route='/Bundles/{uuid}').inc() query = {"uuid": bundle_id} projection = { "_id": False, @@ -307,14 +341,18 @@ async def get(self, bundle_id: str) -> None: ret = await self.db.Bundles.find_one(filter=query, projection=projection) logging.debug("MONGO-END: db.Bundles.find_one(filter, projection)") if not ret: + response_counter.labels(method='GET', response='404', route='/Bundles/{uuid}').inc() raise tornado.web.HTTPError(404, reason="not found") self.write(ret) + response_counter.labels(method='GET', response='200', route='/Bundles/{uuid}').inc() @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def patch(self, bundle_id: str) -> None: """Handle PATCH /Bundles/{uuid}.""" + request_counter.labels(method='PATCH', route='/Bundles/{uuid}').inc() req = json_decode(self.request.body) if 'uuid' in req and req['uuid'] != bundle_id: + response_counter.labels(method='PATCH', response='400', route='/Bundles/{uuid}').inc() raise tornado.web.HTTPError(400, reason="bad request") query = {"uuid": bundle_id} update_doc = {"$set": req} @@ -325,19 +363,23 @@ async def patch(self, bundle_id: str) -> None: return_document=AFTER) logging.debug("MONGO-END: db.Bundles.find_one_and_update(filter, update, projection, return_document)") if not ret: + response_counter.labels(method='PATCH', response='404', route='/Bundles/{uuid}').inc() raise tornado.web.HTTPError(404, reason="not found") logging.info(f"patched Bundle {bundle_id} with {req}") self.write(ret) + response_counter.labels(method='PATCH', response='200', route='/Bundles/{uuid}').inc() @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def delete(self, bundle_id: str) -> None: """Handle DELETE /Bundles/{uuid}.""" + request_counter.labels(method='DELETE', route='/Bundles/{uuid}').inc() query = {"uuid": bundle_id} logging.debug(f"MONGO-START: db.Bundles.delete_one(filter={query})") await self.db.Bundles.delete_one(filter=query) logging.debug("MONGO-END: db.Bundles.delete_one(filter)") logging.info(f"deleted Bundle {bundle_id}") self.set_status(204) + response_counter.labels(method='DELETE', response='204', route='/Bundles/{uuid}').inc() # ----------------------------------------------------------------------------- @@ -347,7 +389,9 @@ class MainHandler(BaseLTAHandler): def get(self) -> None: """Handle GET /.""" + request_counter.labels(method='GET', route='/').inc() self.write({}) + response_counter.labels(method='GET', response='200', route='/').inc() # ----------------------------------------------------------------------------- @@ -358,6 +402,7 @@ class MetadataActionsBulkCreateHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /Metadata/actions/bulk_create.""" + request_counter.labels(method='POST', route='/Metadata/actions/bulk_create').inc() bundle_uuid = self.get_argument("bundle_uuid", type=str) files = self.get_argument("files", type=list, 
forbiddens=[[]]) @@ -382,6 +427,7 @@ async def post(self) -> None: self.set_status(201) self.write({'metadata': uuids, 'count': create_count}) + response_counter.labels(method='POST', response='201', route='/Metadata/actions/bulk_create').inc() class MetadataActionsBulkDeleteHandler(BaseLTAHandler): @@ -390,6 +436,7 @@ class MetadataActionsBulkDeleteHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /Metadata/actions/bulk_delete.""" + request_counter.labels(method='POST', route='/Metadata/actions/bulk_delete').inc() metadata = self.get_argument("metadata", type=list, forbiddens=[[]]) count = 0 @@ -405,6 +452,7 @@ async def post(self) -> None: count = count + ret.deleted_count self.write({'metadata': metadata, 'count': count}) + response_counter.labels(method='POST', response='200', route='/Metadata/actions/bulk_delete').inc() class MetadataHandler(BaseLTAHandler): @@ -413,6 +461,7 @@ class MetadataHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def get(self) -> None: """Handle GET /Metadata.""" + request_counter.labels(method='GET', route='/Metadata').inc() bundle_uuid = self.get_query_argument("bundle_uuid", default=None) limit = int(cast(str, self.get_query_argument("limit", default="1000"))) skip = int(cast(str, self.get_query_argument("skip", default="0"))) @@ -436,10 +485,12 @@ async def get(self) -> None: 'results': results, } self.write(ret) + response_counter.labels(method='GET', response='200', route='/Metadata').inc() @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def delete(self) -> None: """Handle DELETE /Metadata?bundle_uuid={uuid}.""" + request_counter.labels(method='DELETE', route='/Metadata?bundle_uuid={uuid}').inc() bundle_uuid = self.get_argument("bundle_uuid", type=str) query = {"bundle_uuid": bundle_uuid} logging.debug(f"MONGO-START: db.Metadata.delete_many(filter={query})") @@ -447,6 +498,7 @@ async def delete(self) -> None: logging.debug("MONGO-END: db.Metadata.delete_many(filter)") logging.info(f"deleted all Metadata records for Bundle {bundle_uuid}") self.set_status(204) + response_counter.labels(method='DELETE', response='204', route='/Metadata?bundle_uuid={uuid}').inc() class MetadataSingleHandler(BaseLTAHandler): @@ -455,24 +507,29 @@ class MetadataSingleHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def get(self, metadata_id: str) -> None: """Handle GET /Metadata/{uuid}.""" + request_counter.labels(method='GET', route='/Metadata/{uuid}').inc() query = {"uuid": metadata_id} projection = {"_id": False} logging.debug(f"MONGO-START: db.Metadata.find_one(filter={query}, projection={projection})") ret = await self.db.Metadata.find_one(filter=query, projection=projection) logging.debug("MONGO-END: db.Metadata.find_one(filter, projection)") if not ret: + response_counter.labels(method='GET', response='404', route='/Metadata/{uuid}').inc() raise tornado.web.HTTPError(404, reason="not found") self.write(ret) + response_counter.labels(method='GET', response='200', route='/Metadata/{uuid}').inc() @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def delete(self, metadata_id: str) -> None: """Handle DELETE /Metadata/{uuid}.""" + request_counter.labels(method='DELETE', route='/Metadata/{uuid}').inc() query = {"uuid": metadata_id} logging.debug(f"MONGO-START: db.Metadata.delete_one(filter={query})") await self.db.Metadata.delete_one(filter=query) logging.debug("MONGO-END: 
db.Metadata.delete_one(filter)") logging.info(f"deleted Bundle {metadata_id}") self.set_status(204) + response_counter.labels(method='DELETE', response='204', route='/Metadata/{uuid}').inc() # ----------------------------------------------------------------------------- @@ -483,6 +540,7 @@ class TransferRequestsHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def get(self) -> None: """Handle GET /TransferRequests.""" + request_counter.labels(method='GET', route='/TransferRequests').inc() ret = [] logging.debug(f"MONGO-START: db.TransferRequests.find(filter={ALL_DOCUMENTS}, projection={REMOVE_ID})") async for row in self.db.TransferRequests.find(filter=ALL_DOCUMENTS, @@ -490,28 +548,39 @@ async def get(self) -> None: ret.append(row) logging.debug("MONGO-END*: db.TransferRequests.find(filter, projection)") self.write({'results': ret}) + response_counter.labels(method='GET', response='200', route='/TransferRequests').inc() @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /TransferRequests.""" + request_counter.labels(method='POST', route='/TransferRequests').inc() req = json_decode(self.request.body) if 'source' not in req: + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="missing source field") if 'dest' not in req: + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="missing dest field") if 'path' not in req: + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="missing path field") if not isinstance(req['source'], str): + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="source field is not a string") if not isinstance(req['dest'], str): + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="dest field is not a string") if not isinstance(req['path'], str): + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="path field is not a string") if not req['source']: + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="source field is empty") if not req['dest']: + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="dest field is empty") if not req['path']: + response_counter.labels(method='POST', response='400', route='/TransferRequests').inc() raise tornado.web.HTTPError(400, reason="path field is empty") right_now = now() # https://www.youtube.com/watch?v=He0p5I0b8j8 @@ -529,6 +598,7 @@ async def post(self) -> None: logging.info(f"created TransferRequest {req['uuid']}") self.set_status(201) self.write({'TransferRequest': req['uuid']}) + response_counter.labels(method='POST', response='201', route='/TransferRequests').inc() class TransferRequestSingleHandler(BaseLTAHandler): @@ -537,19 +607,24 @@ class TransferRequestSingleHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def get(self, request_id: str) -> None: """Handle GET /TransferRequests/{uuid}.""" + request_counter.labels(method='GET', route='/TransferRequests/{uuid}').inc() query = {'uuid': request_id} 
logging.debug(f"MONGO-START: db.TransferRequests.find_one(filter={query}, projection={REMOVE_ID}") ret = await self.db.TransferRequests.find_one(filter=query, projection=REMOVE_ID) logging.debug("MONGO-END: db.TransferRequests.find_one(filter, projection)") if not ret: + response_counter.labels(method='GET', response='404', route='/TransferRequests/{uuid}').inc() raise tornado.web.HTTPError(404, reason="not found") self.write(ret) + response_counter.labels(method='GET', response='200', route='/TransferRequests/{uuid}').inc() @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def patch(self, request_id: str) -> None: """Handle PATCH /TransferRequests/{uuid}.""" + request_counter.labels(method='PATCH', route='/TransferRequests/{uuid}').inc() req = json_decode(self.request.body) if 'uuid' in req and req['uuid'] != request_id: + response_counter.labels(method='PATCH', response='400', route='/TransferRequests/{uuid}').inc() raise tornado.web.HTTPError(400, reason="bad request") sbtr = self.db.TransferRequests query = {"uuid": request_id} @@ -561,19 +636,23 @@ async def patch(self, request_id: str) -> None: return_document=AFTER) logging.debug("MONGO-END: db.TransferRequests.find_one_and_update(filter, update, projection, return_document") if not ret: + response_counter.labels(method='PATCH', response='404', route='/TransferRequests/{uuid}').inc() raise tornado.web.HTTPError(404, reason="not found") logging.info(f"patched TransferRequest {request_id} with {req}") self.write({}) + response_counter.labels(method='PATCH', response='200', route='/TransferRequests/{uuid}').inc() @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def delete(self, request_id: str) -> None: """Handle DELETE /TransferRequests/{uuid}.""" + request_counter.labels(method='DELETE', route='/TransferRequests/{uuid}').inc() query = {"uuid": request_id} logging.debug(f"MONGO-START: db.TransferRequests.delete_one(filter={query})") await self.db.TransferRequests.delete_one(filter=query) logging.debug("MONGO-END: db.TransferRequests.delete_one(filter)") logging.info(f"deleted TransferRequest {request_id}") self.set_status(204) + response_counter.labels(method='DELETE', response='204', route='/TransferRequests/{uuid}').inc() class TransferRequestActionsPopHandler(BaseLTAHandler): @@ -582,9 +661,11 @@ class TransferRequestActionsPopHandler(BaseLTAHandler): @lta_auth(prefix=LTA_AUTH_PREFIX, roles=LTA_AUTH_ROLES) async def post(self) -> None: """Handle POST /TransferRequests/actions/pop.""" + request_counter.labels(method='POST', route='/TransferRequests/actions/pop').inc() source = self.get_argument("source", type=str) pop_body = json_decode(self.request.body) if 'claimant' not in pop_body: + response_counter.labels(method='POST', response='400', route='/TransferRequests/actions/pop').inc() raise tornado.web.HTTPError(400, reason="missing claimant field") claimant = pop_body["claimant"] # find and claim a transfer request for the specified source @@ -616,6 +697,7 @@ async def post(self) -> None: else: logging.info(f"TransferRequest {tr['uuid']} claimed by {claimant}") self.write({'transfer_request': tr}) + response_counter.labels(method='POST', response='200', route='/TransferRequests/actions/pop').inc() # ----------------------------------------------------------------------------- @@ -727,8 +809,18 @@ def start(debug: bool = False) -> RestServer: return server -def main() -> None: +async def main() -> None: + """Just loop forever while the REST server processes requests.""" + while True: + LOG.info("Sleeping 
for 60 seconds") + await asyncio.sleep(60) + + +def main_sync() -> None: """Configure logging and start a LTA DB service.""" + # obtain our configuration from the environment + config = from_environment(EXPECTED_CONFIG) + # configure logging for the application log_level = getattr(logging, os.getenv("LOG_LEVEL", default="DEBUG")) logging.basicConfig( format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}", @@ -737,9 +829,10 @@ def main() -> None: style="{", ) start(debug=True) - loop = asyncio.get_event_loop() - loop.run_forever() + metrics_port = int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) + asyncio.run(main()) if __name__ == '__main__': - main() + main_sync() diff --git a/lta/site_move_verifier.py b/lta/site_move_verifier.py index d7a560a..574f11f 100644 --- a/lta/site_move_verifier.py +++ b/lta/site_move_verifier.py @@ -8,13 +8,14 @@ import sys from typing import Any, Dict, List, Optional +from prometheus_client import Counter, Gauge, start_http_server from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment import wipac_telemetry.tracing_tools as wtt from .component import COMMON_CONFIG, Component, now, work_loop from .crypto import sha512sum from .joiner import join_smart +from .lta_tools import from_environment from .lta_types import BundleType from .rest_server import boolify @@ -34,6 +35,11 @@ OLD_MTIME_EPOCH_SEC = 30 * 60 # 30 MINUTES * 60 SEC_PER_MIN +# prometheus metrics +failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type']) +load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type']) +success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type']) + def as_nonempty_columns(s: str) -> List[str]: """Split the provided string into columns and return the non-empty ones.""" @@ -105,12 +111,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]: async def _do_work(self) -> None: """Perform a work cycle for this component.""" self.logger.info("Starting work on Bundles.") + load_level = -1 work_claimed = True while work_claimed: + load_level += 1 work_claimed = await self._do_work_claim() # if we are configured to run once and die, then die if self.run_once_and_die: sys.exit() + load_gauge.labels(component='site_move_verifier', level='bundle', type='work').set(load_level) self.logger.info("Ending work on Bundles.") @wtt.spanned() @@ -137,7 +146,9 @@ async def _do_work_claim(self) -> bool: # process the Bundle that we were given try: await self._verify_bundle(lta_rc, bundle) + success_counter.labels(component='site_move_verifier', level='bundle', type='work').inc() except Exception as e: + failure_counter.labels(component='site_move_verifier', level='bundle', type='exception').inc() await self._quarantine_bundle(lta_rc, bundle, f"{e}") raise e # if we were successful at processing work, let the caller know @@ -217,12 +228,19 @@ def _execute_myquota(self) -> Optional[str]: return completed_process.stdout.decode("utf-8") -def runner() -> None: +async def main(site_move_verifier: SiteMoveVerifier) -> None: + """Execute the work loop of the SiteMoveVerifier component.""" + LOG.info("Starting asynchronous code") + await work_loop(site_move_verifier) + LOG.info("Ending asynchronous code") + + +def main_sync() -> None: """Configure a SiteMoveVerifier component from the environment and set it running.""" # obtain our configuration from the environment config = 
from_environment(EXPECTED_CONFIG) # configure logging for the application - log_level = getattr(logging, str(config["LOG_LEVEL"]).upper()) + log_level = getattr(logging, config["LOG_LEVEL"].upper()) logging.basicConfig( format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}", level=log_level, @@ -230,18 +248,14 @@ def runner() -> None: style="{", ) # create our SiteMoveVerifier service - site_move_verifier = SiteMoveVerifier(config, LOG) # type: ignore[arg-type] + LOG.info("Starting synchronous code") + site_move_verifier = SiteMoveVerifier(config, LOG) # let's get to work - site_move_verifier.logger.info("Adding tasks to asyncio loop") - loop = asyncio.get_event_loop() - loop.create_task(work_loop(site_move_verifier)) - - -def main() -> None: - """Configure a SiteMoveVerifier component from the environment and set it running.""" - runner() - asyncio.get_event_loop().run_forever() + metrics_port = int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) + asyncio.run(main(site_move_verifier)) + LOG.info("Ending synchronous code") if __name__ == "__main__": - main() + main_sync() diff --git a/lta/transfer_request_finisher.py b/lta/transfer_request_finisher.py index 1fecba1..1886ca5 100644 --- a/lta/transfer_request_finisher.py +++ b/lta/transfer_request_finisher.py @@ -6,11 +6,12 @@ import sys from typing import Any, Dict, Optional, Union +from prometheus_client import Counter, Gauge, start_http_server from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment import wipac_telemetry.tracing_tools as wtt from .component import COMMON_CONFIG, Component, now, work_loop +from .lta_tools import from_environment from .lta_types import BundleType Logger = logging.Logger @@ -23,6 +24,11 @@ "WORK_TIMEOUT_SECONDS": "30", }) +# prometheus metrics +failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type']) +load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type']) +success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type']) + class TransferRequestFinisher(Component): """ @@ -59,12 +65,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]: async def _do_work(self) -> None: """Perform a work cycle for this component.""" self.logger.info("Starting work on Bundles.") + load_level = -1 work_claimed = True while work_claimed: + load_level += 1 work_claimed = await self._do_work_claim() # if we are configured to run once and die, then die if self.run_once_and_die: sys.exit() + load_gauge.labels(component='transfer_request_finisher', level='bundle', type='work').set(load_level) self.logger.info("Ending work on Bundles.") @wtt.spanned() @@ -143,6 +152,7 @@ async def _update_transfer_request(self, lta_rc: RestClient, bundle: BundleType) } self.logger.info(f"PATCH /TransferRequests/{request_uuid} - '{patch_body}'") await lta_rc.request('PATCH', f'/TransferRequests/{request_uuid}', patch_body) + success_counter.labels(component='transfer_request_finisher', level='transfer_request', type='work').inc() # update each of the constituent bundles to status "finished" for bundle_id in results: patch_body = { @@ -155,14 +165,22 @@ async def _update_transfer_request(self, lta_rc: RestClient, bundle: BundleType) } self.logger.info(f"PATCH /Bundles/{bundle_id} - '{patch_body}'") await lta_rc.request('PATCH', f'/Bundles/{bundle_id}', patch_body) + success_counter.labels(component='transfer_request_finisher', 
level='bundle', type='work').inc() + + +async def main(transfer_request_finisher: TransferRequestFinisher) -> None: + """Execute the work loop of the TransferRequestFinisher component.""" + LOG.info("Starting asynchronous code") + await work_loop(transfer_request_finisher) + LOG.info("Ending asynchronous code") -def runner() -> None: +def main_sync() -> None: """Configure a TransferRequestFinisher component from the environment and set it running.""" # obtain our configuration from the environment config = from_environment(EXPECTED_CONFIG) # configure logging for the application - log_level = getattr(logging, str(config["LOG_LEVEL"]).upper()) + log_level = getattr(logging, config["LOG_LEVEL"].upper()) logging.basicConfig( format="{asctime} [{threadName}] {levelname:5} ({filename}:{lineno}) - {message}", level=log_level, @@ -170,18 +188,14 @@ def runner() -> None: style="{", ) # create our TransferRequestFinisher service - transfer_request_finisher = TransferRequestFinisher(config, LOG) # type: ignore[arg-type] + LOG.info("Starting synchronous code") + transfer_request_finisher = TransferRequestFinisher(config, LOG) # let's get to work - transfer_request_finisher.logger.info("Adding tasks to asyncio loop") - loop = asyncio.get_event_loop() - loop.create_task(work_loop(transfer_request_finisher)) - - -def main() -> None: - """Configure a TransferRequestFinisher component from the environment and set it running.""" - runner() - asyncio.get_event_loop().run_forever() + metrics_port = int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) + asyncio.run(main(transfer_request_finisher)) + LOG.info("Ending synchronous code") if __name__ == "__main__": - main() + main_sync() diff --git a/lta/unpacker.py b/lta/unpacker.py index 3525cd4..79b4263 100644 --- a/lta/unpacker.py +++ b/lta/unpacker.py @@ -11,6 +11,7 @@ from typing import Any, cast, Dict, Optional from zipfile import ZipFile +from prometheus_client import Counter, Gauge, start_http_server from rest_tools.client import ClientCredentialsAuth import wipac_telemetry.tracing_tools as wtt @@ -37,6 +38,11 @@ "WORK_TIMEOUT_SECONDS": "30", }) +# prometheus metrics +failure_counter = Counter('lta_failures', 'lta processing failures', ['component', 'level', 'type']) +load_gauge = Gauge('lta_load_level', 'lta work processed', ['component', 'level', 'type']) +success_counter = Counter('lta_successes', 'lta processing successes', ['component', 'level', 'type']) + class Unpacker(Component): """ @@ -82,12 +88,15 @@ def _expected_config(self) -> Dict[str, Optional[str]]: async def _do_work(self) -> None: """Perform a work cycle for this component.""" self.logger.info("Starting work on Bundles.") + load_level = -1 work_claimed = True while work_claimed: + load_level += 1 work_claimed = await self._do_work_claim() # if we are configured to run once and die, then die if self.run_once_and_die: sys.exit() + load_gauge.labels(component='unpacker', level='bundle', type='work').set(load_level) self.logger.info("Ending work on Bundles.") @wtt.spanned() @@ -114,7 +123,9 @@ async def _do_work_claim(self) -> bool: # process the Bundle that we were given try: await self._do_work_bundle(lta_rc, bundle) + success_counter.labels(component='unpacker', level='bundle', type='work').inc() except Exception as e: + failure_counter.labels(component='unpacker', level='bundle', type='exception').inc() await self._quarantine_bundle(lta_rc, bundle, f"{e}") raise e # signal the work was processed successfully @@ -374,6 +385,8 @@ def main_sync() -> None: 
LOG.info("Starting synchronous code") unpacker = Unpacker(config, LOG) # let's get to work + metrics_port = int(config["PROMETHEUS_METRICS_PORT"]) + start_http_server(metrics_port) asyncio.run(main(unpacker)) LOG.info("Ending synchronous code") diff --git a/requirements-dev.txt b/requirements-dev.txt index 4dadddc..61a63e5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -34,7 +34,7 @@ coverage[toml]==7.3.1 # via pytest-cov crayons==0.4.0 # via pycycle -cryptography==41.0.3 +cryptography==41.0.4 # via pyjwt deprecated==1.2.14 # via @@ -52,7 +52,7 @@ googleapis-common-protos==1.56.2 # via # opentelemetry-exporter-jaeger-proto-grpc # opentelemetry-exporter-otlp-proto-http -grpcio==1.57.0 +grpcio==1.58.0 # via opentelemetry-exporter-jaeger-proto-grpc humanfriendly==10.0 # via coloredlogs @@ -132,7 +132,7 @@ pymongo==4.5.0 # motor pypng==0.20220715.0 # via qrcode -pytest==7.4.1 +pytest==7.4.2 # via # lta (setup.py) # pycycle @@ -173,18 +173,18 @@ tornado==6.3.3 # via # lta (setup.py) # wipac-rest-tools -types-requests==2.31.0.2 +types-requests==2.31.0.6 # via lta (setup.py) types-urllib3==1.26.25.14 # via types-requests -typing-extensions==4.7.1 +typing-extensions==4.8.0 # via # mypy # opentelemetry-sdk # qrcode # wipac-dev-tools # wipac-telemetry -urllib3==2.0.4 +urllib3==2.0.5 # via # requests # wipac-rest-tools @@ -199,7 +199,7 @@ wipac-telemetry==0.2.7 # via lta (setup.py) wrapt==1.15.0 # via deprecated -zipp==3.16.2 +zipp==3.17.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements-monitoring.txt b/requirements-monitoring.txt index 3be4d20..a6bfe98 100644 --- a/requirements-monitoring.txt +++ b/requirements-monitoring.txt @@ -34,7 +34,7 @@ colorama==0.4.6 # via lta (setup.py) coloredlogs==15.0.1 # via wipac-telemetry -cryptography==41.0.3 +cryptography==41.0.4 # via pyjwt deprecated==1.2.14 # via @@ -54,7 +54,7 @@ googleapis-common-protos==1.56.2 # via # opentelemetry-exporter-jaeger-proto-grpc # opentelemetry-exporter-otlp-proto-http -grpcio==1.57.0 +grpcio==1.58.0 # via opentelemetry-exporter-jaeger-proto-grpc humanfriendly==10.0 # via coloredlogs @@ -136,13 +136,13 @@ tornado==6.3.3 # via # lta (setup.py) # wipac-rest-tools -typing-extensions==4.7.1 +typing-extensions==4.8.0 # via # opentelemetry-sdk # qrcode # wipac-dev-tools # wipac-telemetry -urllib3==2.0.4 +urllib3==2.0.5 # via # elasticsearch # requests @@ -162,7 +162,7 @@ yarl==1.9.2 # via # aiohttp # elasticsearch -zipp==3.16.2 +zipp==3.17.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements.txt b/requirements.txt index 7edf42c..5ceab7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ colorama==0.4.6 # via lta (setup.py) coloredlogs==15.0.1 # via wipac-telemetry -cryptography==41.0.3 +cryptography==41.0.4 # via pyjwt deprecated==1.2.14 # via @@ -36,7 +36,7 @@ googleapis-common-protos==1.56.2 # via # opentelemetry-exporter-jaeger-proto-grpc # opentelemetry-exporter-otlp-proto-http -grpcio==1.57.0 +grpcio==1.58.0 # via opentelemetry-exporter-jaeger-proto-grpc humanfriendly==10.0 # via coloredlogs @@ -112,13 +112,13 @@ tornado==6.3.3 # via # lta (setup.py) # wipac-rest-tools -typing-extensions==4.7.1 +typing-extensions==4.8.0 # via # opentelemetry-sdk # qrcode # wipac-dev-tools # wipac-telemetry -urllib3==2.0.4 +urllib3==2.0.5 # via # requests # wipac-rest-tools @@ -133,7 +133,7 @@ wipac-telemetry==0.2.7 # via lta (setup.py) wrapt==1.15.0 # via 
deprecated -zipp==3.16.2 +zipp==3.17.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/tests/test_bundler.py b/tests/test_bundler.py index f9df6b9..82a6a93 100644 --- a/tests/test_bundler.py +++ b/tests/test_bundler.py @@ -1,6 +1,18 @@ # test_bundler.py """Unit tests for lta/bundler.py.""" +# ----------------------------------------------------------------------------- +# reset prometheus registry for unit tests +from prometheus_client import REGISTRY +collectors = list(REGISTRY._collector_to_names.keys()) +for collector in collectors: + REGISTRY.unregister(collector) +from prometheus_client import gc_collector, platform_collector, process_collector +process_collector.ProcessCollector() +platform_collector.PlatformCollector() +gc_collector.GCCollector() +# ----------------------------------------------------------------------------- + import os from typing import Dict from unittest.mock import AsyncMock, call, mock_open, patch @@ -11,7 +23,7 @@ from pytest_mock import MockerFixture from tornado.web import HTTPError -from lta.bundler import Bundler, main +from lta.bundler import Bundler, main_sync TestConfig = Dict[str, str] @@ -39,6 +51,7 @@ def config() -> TestConfig: "MYSQL_PORT": "23306", "MYSQL_USER": "jade-user", "OUTPUT_STATUS": "created", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -104,9 +117,9 @@ def test_do_status(config: TestConfig, mocker: MockerFixture) -> None: @pytest.mark.asyncio -async def test_script_main(config: TestConfig, - mocker: MockerFixture, - monkeypatch: MonkeyPatch) -> None: +async def test_script_main_sync(config: TestConfig, + mocker: MockerFixture, + monkeypatch: MonkeyPatch) -> None: """ Verify Bundler component behavior when run as a script. 
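Aside: the hunk that follows swaps the event-loop mocks for a mock of asyncio.run(); awaiting mock_run.call_args.args[0] afterwards consumes the coroutine that main_sync() handed to the mocked asyncio.run(), so pytest does not warn about a never-awaited coroutine. A minimal, self-contained sketch of the same pattern, with illustrative names that are not part of this patch:

import asyncio
from unittest.mock import patch

import pytest

async def main() -> None:
    """Stand-in for a component's asynchronous entry point."""

def main_sync() -> None:
    """Stand-in for a component's synchronous entry point."""
    asyncio.run(main())

@pytest.mark.asyncio
async def test_main_sync() -> None:
    with patch("asyncio.run") as mock_run:
        main_sync()
        mock_run.assert_called()
        # consume the coroutine that main_sync() created for asyncio.run()
        await mock_run.call_args.args[0]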
@@ -115,11 +128,14 @@ async def test_script_main(config: TestConfig, """ for key in config.keys(): monkeypatch.setenv(key, config[key]) - mock_event_loop = mocker.patch("asyncio.get_event_loop") - mock_work_loop = mocker.patch("lta.bundler.work_loop") - main() - mock_event_loop.assert_called() - mock_work_loop.assert_called() + mock_run = mocker.patch("asyncio.run") + mock_main = mocker.patch("lta.bundler.main") + mock_shs = mocker.patch("lta.bundler.start_http_server") + main_sync() + mock_shs.assert_called() + mock_main.assert_called() + mock_run.assert_called() + await mock_run.call_args.args[0] @pytest.mark.asyncio @@ -146,6 +162,7 @@ async def test_bundler_logs_configuration(mocker: MockerFixture) -> None: "MYSQL_PORT": "23306", "MYSQL_USER": "logme-jade-user", "OUTPUT_STATUS": "created", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -175,6 +192,7 @@ async def test_bundler_logs_configuration(mocker: MockerFixture) -> None: call('MYSQL_PORT = 23306'), call('MYSQL_USER = logme-jade-user'), call('OUTPUT_STATUS = created'), + call('PROMETHEUS_METRICS_PORT = 8080'), call('RUN_ONCE_AND_DIE = False'), call('RUN_UNTIL_NO_WORK = False'), call('SOURCE_SITE = WIPAC'), diff --git a/tests/test_deleter.py b/tests/test_deleter.py index cc3f75a..f660218 100644 --- a/tests/test_deleter.py +++ b/tests/test_deleter.py @@ -1,6 +1,18 @@ # test_deleter.py """Unit tests for lta/deleter.py.""" +# ----------------------------------------------------------------------------- +# reset prometheus registry for unit tests +from prometheus_client import REGISTRY +collectors = list(REGISTRY._collector_to_names.keys()) +for collector in collectors: + REGISTRY.unregister(collector) +from prometheus_client import gc_collector, platform_collector, process_collector +process_collector.ProcessCollector() +platform_collector.PlatformCollector() +gc_collector.GCCollector() +# ----------------------------------------------------------------------------- + from typing import Dict from unittest.mock import AsyncMock, call, MagicMock @@ -9,7 +21,7 @@ from pytest_mock import MockerFixture from tornado.web import HTTPError -from lta.deleter import main, Deleter +from lta.deleter import main_sync, Deleter TestConfig = Dict[str, str] @@ -28,6 +40,7 @@ def config() -> TestConfig: "LTA_AUTH_OPENID_URL": "localhost:12345", "LTA_REST_URL": "localhost:12347", "OUTPUT_STATUS": "source-deleted", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -76,6 +89,7 @@ async def test_deleter_logs_configuration(mocker: MockerFixture) -> None: "LTA_AUTH_OPENID_URL": "localhost:12345", "LTA_REST_URL": "localhost:12347", "OUTPUT_STATUS": "source-deleted", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -96,6 +110,7 @@ async def test_deleter_logs_configuration(mocker: MockerFixture) -> None: call('LTA_AUTH_OPENID_URL = localhost:12345'), call('LTA_REST_URL = localhost:12347'), call('OUTPUT_STATUS = source-deleted'), + call('PROMETHEUS_METRICS_PORT = 8080'), call('RUN_ONCE_AND_DIE = False'), call('RUN_UNTIL_NO_WORK = False'), call('SOURCE_SITE = WIPAC'), @@ -118,11 +133,14 @@ async def test_script_main(config: TestConfig, """ for key in config.keys(): monkeypatch.setenv(key, config[key]) - mock_event_loop = mocker.patch("asyncio.get_event_loop") - mock_work_loop = mocker.patch("lta.deleter.work_loop") - main() - 
mock_event_loop.assert_called() - mock_work_loop.assert_called() + mock_run = mocker.patch("asyncio.run") + mock_main = mocker.patch("lta.deleter.main") + mock_shs = mocker.patch("lta.deleter.start_http_server") + main_sync() + mock_shs.assert_called() + mock_main.assert_called() + mock_run.assert_called() + await mock_run.call_args.args[0] @pytest.mark.asyncio diff --git a/tests/test_desy_move_verifier.py b/tests/test_desy_move_verifier.py index 11c6e34..7c13957 100644 --- a/tests/test_desy_move_verifier.py +++ b/tests/test_desy_move_verifier.py @@ -1,6 +1,18 @@ # test_desy_move_verifier.py """Unit tests for lta/desy_move_verifier.py.""" +# ----------------------------------------------------------------------------- +# reset prometheus registry for unit tests +from prometheus_client import REGISTRY +collectors = list(REGISTRY._collector_to_names.keys()) +for collector in collectors: + REGISTRY.unregister(collector) +from prometheus_client import gc_collector, platform_collector, process_collector +process_collector.ProcessCollector() +platform_collector.PlatformCollector() +gc_collector.GCCollector() +# ----------------------------------------------------------------------------- + from typing import Dict from unittest.mock import AsyncMock, call @@ -9,7 +21,7 @@ from pytest_mock import MockerFixture from tornado.web import HTTPError -from lta.desy_move_verifier import main, DesyMoveVerifier +from lta.desy_move_verifier import main_sync, DesyMoveVerifier TestConfig = Dict[str, str] @@ -29,6 +41,7 @@ def config() -> TestConfig: "LTA_AUTH_OPENID_URL": "localhost:12345", "LTA_REST_URL": "localhost:12347", "OUTPUT_STATUS": "taping", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -82,6 +95,7 @@ async def test_desy_move_verifier_logs_configuration(mocker: MockerFixture) -> N "LTA_AUTH_OPENID_URL": "localhost:12345", "LTA_REST_URL": "localhost:12347", "OUTPUT_STATUS": "taping", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -104,6 +118,7 @@ async def test_desy_move_verifier_logs_configuration(mocker: MockerFixture) -> N call('LTA_AUTH_OPENID_URL = localhost:12345'), call('LTA_REST_URL = localhost:12347'), call('OUTPUT_STATUS = taping'), + call('PROMETHEUS_METRICS_PORT = 8080'), call('RUN_ONCE_AND_DIE = False'), call('RUN_UNTIL_NO_WORK = False'), call('SOURCE_SITE = WIPAC'), @@ -116,7 +131,7 @@ async def test_desy_move_verifier_logs_configuration(mocker: MockerFixture) -> N @pytest.mark.asyncio -async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: +async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: """ Verify DesyMoveVerifier component behavior when run as a script. 
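Aside: the registry-reset preamble copied into the top of each test module above keeps prometheus_client's process-global REGISTRY from raising "Duplicated timeseries" errors when several component modules are imported in one pytest session. A sketch of the same steps as a reusable helper; it would still have to run before the component module is imported, which is exactly why the patch inlines it instead:

from prometheus_client import REGISTRY, gc_collector, platform_collector, process_collector

def reset_default_registry() -> None:
    """Unregister every collector, then restore the three stock collectors."""
    # _collector_to_names is a private attribute; this mirrors the patch's preamble.
    for collector in list(REGISTRY._collector_to_names.keys()):
        REGISTRY.unregister(collector)
    process_collector.ProcessCollector()
    platform_collector.PlatformCollector()
    gc_collector.GCCollector()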
@@ -125,11 +140,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc """ for key in config.keys(): monkeypatch.setenv(key, config[key]) - mock_event_loop = mocker.patch("asyncio.get_event_loop") - mock_work_loop = mocker.patch("lta.desy_move_verifier.work_loop") - main() - mock_event_loop.assert_called() - mock_work_loop.assert_called() + mock_run = mocker.patch("asyncio.run") + mock_main = mocker.patch("lta.desy_move_verifier.main") + mock_shs = mocker.patch("lta.desy_move_verifier.start_http_server") + main_sync() + mock_shs.assert_called() + mock_main.assert_called() + mock_run.assert_called() + await mock_run.call_args.args[0] @pytest.mark.asyncio diff --git a/tests/test_desy_verifier.py b/tests/test_desy_verifier.py index 8d27dc0..8cfc1cf 100644 --- a/tests/test_desy_verifier.py +++ b/tests/test_desy_verifier.py @@ -1,6 +1,18 @@ # test_desy_verifier.py """Unit tests for lta/desy_verifier.py.""" +# ----------------------------------------------------------------------------- +# reset prometheus registry for unit tests +from prometheus_client import REGISTRY +collectors = list(REGISTRY._collector_to_names.keys()) +for collector in collectors: + REGISTRY.unregister(collector) +from prometheus_client import gc_collector, platform_collector, process_collector +process_collector.ProcessCollector() +platform_collector.PlatformCollector() +gc_collector.GCCollector() +# ----------------------------------------------------------------------------- + from typing import Dict from unittest.mock import AsyncMock, call from uuid import uuid1 @@ -10,7 +22,7 @@ from pytest_mock import MockerFixture from tornado.web import HTTPError -from lta.desy_verifier import main, DesyVerifier +from lta.desy_verifier import main_sync, DesyVerifier TestConfig = Dict[str, str] @@ -33,6 +45,7 @@ def config() -> TestConfig: "LTA_AUTH_OPENID_URL": "localhost:12345", "LTA_REST_URL": "localhost:12347", "OUTPUT_STATUS": "completed", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -88,6 +101,7 @@ async def test_desy_verifier_logs_configuration(mocker: MockerFixture) -> None: "LTA_AUTH_OPENID_URL": "localhost:12345", "LTA_REST_URL": "localhost:12347", "OUTPUT_STATUS": "completed", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "WIPAC", @@ -114,6 +128,7 @@ async def test_desy_verifier_logs_configuration(mocker: MockerFixture) -> None: call('LTA_AUTH_OPENID_URL = localhost:12345'), call('LTA_REST_URL = localhost:12347'), call('OUTPUT_STATUS = completed'), + call('PROMETHEUS_METRICS_PORT = 8080'), call('RUN_ONCE_AND_DIE = False'), call('RUN_UNTIL_NO_WORK = False'), call('SOURCE_SITE = WIPAC'), @@ -127,7 +142,7 @@ async def test_desy_verifier_logs_configuration(mocker: MockerFixture) -> None: @pytest.mark.asyncio -async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: +async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: """ Verify DesyVerifier component behavior when run as a script. 
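Aside: once start_http_server(metrics_port) is running, each component serves a plaintext exposition at /metrics, and prometheus_client appends _total to counter names on export. A quick way to spot-check the new metrics against a locally running component, assuming the default port 8080 from the bin/ scripts:

import urllib.request

# Fetch the plaintext exposition from a running component.
with urllib.request.urlopen("http://127.0.0.1:8080/metrics") as resp:
    text = resp.read().decode("utf-8")

# Counters gain a `_total` suffix on export, e.g.:
#   lta_successes_total{component="picker",level="transfer_request",type="work"} 3.0
for line in text.splitlines():
    if line.startswith(("lta_successes_total", "lta_failures_total", "lta_load_level")):
        print(line)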
@@ -136,11 +151,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc """ for key in config.keys(): monkeypatch.setenv(key, config[key]) - mock_event_loop = mocker.patch("asyncio.get_event_loop") - mock_work_loop = mocker.patch("lta.desy_verifier.work_loop") - main() - mock_event_loop.assert_called() - mock_work_loop.assert_called() + mock_run = mocker.patch("asyncio.run") + mock_main = mocker.patch("lta.desy_verifier.main") + mock_shs = mocker.patch("lta.desy_verifier.start_http_server") + main_sync() + mock_shs.assert_called() + mock_main.assert_called() + mock_run.assert_called() + await mock_run.call_args.args[0] @pytest.mark.asyncio diff --git a/tests/test_locator.py b/tests/test_locator.py index 4cdd443..1cf4fc3 100644 --- a/tests/test_locator.py +++ b/tests/test_locator.py @@ -1,6 +1,18 @@ # test_locator.py """Unit tests for lta/locator.py.""" +# ----------------------------------------------------------------------------- +# reset prometheus registry for unit tests +from prometheus_client import REGISTRY +collectors = list(REGISTRY._collector_to_names.keys()) +for collector in collectors: + REGISTRY.unregister(collector) +from prometheus_client import gc_collector, platform_collector, process_collector +process_collector.ProcessCollector() +platform_collector.PlatformCollector() +gc_collector.GCCollector() +# ----------------------------------------------------------------------------- + from math import floor from secrets import token_hex from typing import Any, Dict, List, Union @@ -12,7 +24,7 @@ from pytest_mock import MockerFixture from tornado.web import HTTPError -from lta.locator import as_lta_record, main, Locator +from lta.locator import as_lta_record, main_sync, Locator TestConfig = Dict[str, str] @@ -35,6 +47,7 @@ def config() -> TestConfig: "LTA_REST_URL": "localhost:12347", "LTA_SITE_CONFIG": "examples/site.json", "OUTPUT_STATUS": "located", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "NERSC", @@ -100,7 +113,7 @@ def test_do_status(config: TestConfig, mocker: MockerFixture) -> None: @pytest.mark.asyncio -async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: +async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: """ Verify Locator component behavior when run as a script. 
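Aside: in each component's _do_work() above, load_level starts at -1 and is incremented before every _do_work_claim() call, so the gauge ends the cycle holding the number of claims that actually returned work (the final, empty claim is not counted). A distilled version of that loop, with hypothetical names:

from prometheus_client import Gauge

demo_gauge = Gauge('demo_load_level', 'work processed', ['component'])

def do_work(claims: list) -> None:
    # `claims` stands in for successive _do_work_claim() results;
    # the last entry is False, which ends the loop.
    results = iter(claims)
    load_level = -1
    work_claimed = True
    while work_claimed:
        load_level += 1
        work_claimed = next(results)
    demo_gauge.labels(component='demo').set(load_level)

do_work([True, True, False])  # gauge ends at 2: two pieces of work were claimed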
@@ -109,11 +122,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc """ for key in config.keys(): monkeypatch.setenv(key, config[key]) - mock_event_loop = mocker.patch("asyncio.get_event_loop") - mock_work_loop = mocker.patch("lta.locator.work_loop") - main() - mock_event_loop.assert_called() - mock_work_loop.assert_called() + mock_run = mocker.patch("asyncio.run") + mock_main = mocker.patch("lta.locator.main") + mock_shs = mocker.patch("lta.locator.start_http_server") + main_sync() + mock_shs.assert_called() + mock_main.assert_called() + mock_run.assert_called() + await mock_run.call_args.args[0] @pytest.mark.asyncio @@ -135,6 +151,7 @@ async def test_locator_logs_configuration(mocker: MockerFixture) -> None: "LTA_REST_URL": "logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/", "LTA_SITE_CONFIG": "examples/site.json", "OUTPUT_STATUS": "located", + "PROMETHEUS_METRICS_PORT": "8080", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", "SOURCE_SITE": "NERSC", @@ -159,6 +176,7 @@ async def test_locator_logs_configuration(mocker: MockerFixture) -> None: call('LTA_REST_URL = logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/'), call('LTA_SITE_CONFIG = examples/site.json'), call('OUTPUT_STATUS = located'), + call('PROMETHEUS_METRICS_PORT = 8080'), call('RUN_ONCE_AND_DIE = False'), call('RUN_UNTIL_NO_WORK = False'), call('SOURCE_SITE = NERSC'), diff --git a/tests/test_nersc_mover.py b/tests/test_nersc_mover.py index c6c38b9..6341c4f 100644 --- a/tests/test_nersc_mover.py +++ b/tests/test_nersc_mover.py @@ -1,6 +1,18 @@ # test_nersc_mover.py """Unit tests for lta/nersc_mover.py.""" +# ----------------------------------------------------------------------------- +# reset prometheus registry for unit tests +from prometheus_client import REGISTRY +collectors = list(REGISTRY._collector_to_names.keys()) +for collector in collectors: + REGISTRY.unregister(collector) +from prometheus_client import gc_collector, platform_collector, process_collector +process_collector.ProcessCollector() +platform_collector.PlatformCollector() +gc_collector.GCCollector() +# ----------------------------------------------------------------------------- + from typing import Dict from unittest.mock import AsyncMock, call, MagicMock @@ -9,7 +21,7 @@ from pytest_mock import MockerFixture from tornado.web import HTTPError -from lta.nersc_mover import main, NerscMover +from lta.nersc_mover import main_sync, NerscMover from .test_util import ObjectLiteral TestConfig = Dict[str, str] @@ -28,8 +40,9 @@ def config() -> TestConfig: "LOG_LEVEL": "DEBUG", "LTA_AUTH_OPENID_URL": "localhost:12345", "LTA_REST_URL": "localhost:12347", - "OUTPUT_STATUS": "verifying", "MAX_COUNT": "5", + "OUTPUT_STATUS": "verifying", + "PROMETHEUS_METRICS_PORT": "8080", "RSE_BASE_PATH": "/path/to/rse", "RUN_ONCE_AND_DIE": "False", "RUN_UNTIL_NO_WORK": "False", @@ -95,7 +108,7 @@ def test_do_status(config: TestConfig, mocker: MockerFixture) -> None: @pytest.mark.asyncio -async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: +async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None: """ Verify NerscMover component behavior when run as a script. 
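Aside: the rest_server.py handlers above increment request_counter on entry and response_counter on every exit path by hand. A hypothetical decorator, not part of this patch, could derive the same labels from the handler itself; a sketch assuming Tornado's standard RequestHandler API, with demo metric names:

import functools
from typing import Any, Callable

import tornado.web
from prometheus_client import Counter

request_counter = Counter('demo_requests', 'requests', ['method', 'route'])
response_counter = Counter('demo_responses', 'responses', ['method', 'response', 'route'])

def counted(route: str) -> Callable[..., Any]:
    """Count one request on entry and one response on every exit path."""
    def wrap(method: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(method)
        async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            verb = self.request.method
            request_counter.labels(method=verb, route=route).inc()
            try:
                result = await method(self, *args, **kwargs)
            except tornado.web.HTTPError as e:
                # count the error response, then let Tornado render it
                response_counter.labels(method=verb, response=str(e.status_code), route=route).inc()
                raise
            response_counter.labels(method=verb, response=str(self.get_status()), route=route).inc()
            return result
        return wrapper
    return wrap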
@@ -104,11 +117,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc
     """
     for key in config.keys():
        monkeypatch.setenv(key, config[key])
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    mock_work_loop = mocker.patch("lta.nersc_mover.work_loop")
-    main()
-    mock_event_loop.assert_called()
-    mock_work_loop.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.nersc_mover.main")
+    mock_shs = mocker.patch("lta.nersc_mover.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
@@ -127,6 +143,7 @@ async def test_nersc_mover_logs_configuration(mocker: MockerFixture) -> None:
         "LTA_REST_URL": "logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/",
         "MAX_COUNT": "9001",
         "OUTPUT_STATUS": "verifying",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RSE_BASE_PATH": "/log/me/path/to/rse",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
@@ -150,6 +167,7 @@ async def test_nersc_mover_logs_configuration(mocker: MockerFixture) -> None:
         call('LTA_REST_URL = logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/'),
         call('MAX_COUNT = 9001'),
         call('OUTPUT_STATUS = verifying'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RSE_BASE_PATH = /log/me/path/to/rse'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
diff --git a/tests/test_nersc_retriever.py b/tests/test_nersc_retriever.py
index b8a5a1a..8bc29ea 100644
--- a/tests/test_nersc_retriever.py
+++ b/tests/test_nersc_retriever.py
@@ -1,6 +1,18 @@
 # test_nersc_retriever.py
 """Unit tests for lta/nersc_retriever.py."""

+# -----------------------------------------------------------------------------
+# reset prometheus registry for unit tests
+from prometheus_client import REGISTRY
+collectors = list(REGISTRY._collector_to_names.keys())
+for collector in collectors:
+    REGISTRY.unregister(collector)
+from prometheus_client import gc_collector, platform_collector, process_collector
+process_collector.ProcessCollector()
+platform_collector.PlatformCollector()
+gc_collector.GCCollector()
+# -----------------------------------------------------------------------------
+
 from typing import Dict
 from unittest.mock import AsyncMock, call, MagicMock
@@ -9,7 +21,7 @@
 from pytest_mock import MockerFixture
 from tornado.web import HTTPError

-from lta.nersc_retriever import main, NerscRetriever
+from lta.nersc_retriever import main_sync, NerscRetriever
 from .test_util import ObjectLiteral

 TestConfig = Dict[str, str]
@@ -30,6 +42,7 @@ def config() -> TestConfig:
         "LTA_REST_URL": "localhost:12347",
         "MAX_COUNT": "5",
         "OUTPUT_STATUS": "staged",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RSE_BASE_PATH": "/path/to/rse",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
@@ -95,7 +108,7 @@ def test_do_status(config: TestConfig, mocker: MockerFixture) -> None:


 @pytest.mark.asyncio
-async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
+async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
     """
     Verify NerscRetriever component behavior when run as a script.
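
One subtlety in these rewritten tests: mocker.patch substitutes an AsyncMock for the async main() automatically (unittest.mock has done this for async functions since Python 3.8), so calling it still produces a real coroutine object. The patched asyncio.run receives that coroutine but never awaits it, hence the trailing await in each test:

    # pytest-mock test context assumed, mirroring the hunks above and below
    mock_run = mocker.patch("asyncio.run")                # run() becomes inert
    mock_main = mocker.patch("lta.nersc_retriever.main")  # auto-selected AsyncMock
    main_sync()                                           # hands run() a coroutine
    await mock_run.call_args.args[0]                      # consume it, so pytest never
                                                          # warns "never awaited"
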
@@ -104,11 +117,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc
     """
     for key in config.keys():
        monkeypatch.setenv(key, config[key])
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    mock_work_loop = mocker.patch("lta.nersc_retriever.work_loop")
-    main()
-    mock_event_loop.assert_called()
-    mock_work_loop.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.nersc_retriever.main")
+    mock_shs = mocker.patch("lta.nersc_retriever.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
@@ -127,6 +143,7 @@ async def test_nersc_retriever_logs_configuration(mocker: MockerFixture) -> None
         "LTA_REST_URL": "logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/",
         "MAX_COUNT": "9001",
         "OUTPUT_STATUS": "staged",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RSE_BASE_PATH": "/log/me/path/to/rse",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
@@ -150,6 +167,7 @@ async def test_nersc_retriever_logs_configuration(mocker: MockerFixture) -> None
         call('LTA_REST_URL = logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/'),
         call('MAX_COUNT = 9001'),
         call('OUTPUT_STATUS = staged'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RSE_BASE_PATH = /log/me/path/to/rse'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
diff --git a/tests/test_nersc_verifier.py b/tests/test_nersc_verifier.py
index c04f337..2f5763b 100644
--- a/tests/test_nersc_verifier.py
+++ b/tests/test_nersc_verifier.py
@@ -1,6 +1,18 @@
 # test_nersc_verifier.py
 """Unit tests for lta/nersc_verifier.py."""

+# -----------------------------------------------------------------------------
+# reset prometheus registry for unit tests
+from prometheus_client import REGISTRY
+collectors = list(REGISTRY._collector_to_names.keys())
+for collector in collectors:
+    REGISTRY.unregister(collector)
+from prometheus_client import gc_collector, platform_collector, process_collector
+process_collector.ProcessCollector()
+platform_collector.PlatformCollector()
+gc_collector.GCCollector()
+# -----------------------------------------------------------------------------
+
 from typing import Dict
 from unittest.mock import AsyncMock, call, MagicMock
 from uuid import uuid1
@@ -10,7 +22,7 @@
 from pytest_mock import MockerFixture
 from tornado.web import HTTPError

-from lta.nersc_verifier import main, NerscVerifier
+from lta.nersc_verifier import main_sync, NerscVerifier
 from .test_util import ObjectLiteral

 TestConfig = Dict[str, str]
@@ -33,6 +45,7 @@
         "LTA_AUTH_OPENID_URL": "localhost:12345",
         "LTA_REST_URL": "localhost:12347",
         "OUTPUT_STATUS": "completed",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -86,6 +99,7 @@ async def test_nersc_verifier_logs_configuration(mocker: MockerFixture) -> None:
         "LTA_AUTH_OPENID_URL": "localhost:12345",
         "LTA_REST_URL": "logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/",
         "OUTPUT_STATUS": "completed",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -110,6 +124,7 @@ async def test_nersc_verifier_logs_configuration(mocker: MockerFixture) -> None:
         call('LTA_AUTH_OPENID_URL = localhost:12345'),
         call('LTA_REST_URL = logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/'),
         call('OUTPUT_STATUS = completed'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
         call('SOURCE_SITE = WIPAC'),
@@ -122,7 +137,7 @@ async def test_nersc_verifier_logs_configuration(mocker: MockerFixture) -> None:


 @pytest.mark.asyncio
-async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
+async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
     """
     Verify NerscVerifier component behavior when run as a script.
@@ -131,11 +146,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc
     """
     for key in config.keys():
        monkeypatch.setenv(key, config[key])
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    mock_work_loop = mocker.patch("lta.nersc_verifier.work_loop")
-    main()
-    mock_event_loop.assert_called()
-    mock_work_loop.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.nersc_verifier.main")
+    mock_shs = mocker.patch("lta.nersc_verifier.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
diff --git a/tests/test_picker.py b/tests/test_picker.py
index 3b14d5f..87d7092 100644
--- a/tests/test_picker.py
+++ b/tests/test_picker.py
@@ -1,6 +1,18 @@
 # test_picker.py
 """Unit tests for lta/picker.py."""

+# -----------------------------------------------------------------------------
+# reset prometheus registry for unit tests
+from prometheus_client import REGISTRY
+collectors = list(REGISTRY._collector_to_names.keys())
+for collector in collectors:
+    REGISTRY.unregister(collector)
+from prometheus_client import gc_collector, platform_collector, process_collector
+process_collector.ProcessCollector()
+platform_collector.PlatformCollector()
+gc_collector.GCCollector()
+# -----------------------------------------------------------------------------
+
 from secrets import token_hex
 from typing import Any, Dict, List, Union
 from unittest.mock import AsyncMock, call, MagicMock
@@ -11,7 +23,7 @@
 from pytest_mock import MockerFixture
 from tornado.web import HTTPError

-from lta.picker import CREATE_CHUNK_SIZE, main, Picker
+from lta.picker import CREATE_CHUNK_SIZE, main_sync, Picker

 TestConfig = Dict[str, str]
@@ -34,8 +46,9 @@ def config() -> TestConfig:
         "LOG_LEVEL": "DEBUG",
         "LTA_AUTH_OPENID_URL": "localhost:12345",
         "LTA_REST_URL": "localhost:12347",
-        "OUTPUT_STATUS": "specified",
         "MAX_BUNDLE_SIZE": "107374182400",  # 100 GiB
+        "OUTPUT_STATUS": "specified",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -101,7 +114,7 @@ def test_do_status(config: TestConfig, mocker: MockerFixture) -> None:


 @pytest.mark.asyncio
-async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
+async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
     """
     Verify Picker component behavior when run as a script.
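
With PROMETHEUS_METRICS_PORT now part of every component's expected configuration, a quick way to eyeball what a running component will expose is to render the default registry directly; this prints the same text exposition format served on /metrics (exact metric names depend on which collectors and component metrics are registered):

    from prometheus_client import REGISTRY, generate_latest

    print(generate_latest(REGISTRY).decode("utf-8"))
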
@@ -110,11 +123,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc
     """
     for key in config.keys():
        monkeypatch.setenv(key, config[key])
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    mock_work_loop = mocker.patch("lta.picker.work_loop")
-    main()
-    mock_event_loop.assert_called()
-    mock_work_loop.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.picker.main")
+    mock_shs = mocker.patch("lta.picker.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
@@ -136,6 +152,7 @@ async def test_picker_logs_configuration(mocker: MockerFixture) -> None:
         "LTA_REST_URL": "logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/",
         "MAX_BUNDLE_SIZE": "107374182400",  # 100 GiB
         "OUTPUT_STATUS": "specified",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -160,6 +177,7 @@ async def test_picker_logs_configuration(mocker: MockerFixture) -> None:
         call('LTA_REST_URL = logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/'),
         call('MAX_BUNDLE_SIZE = 107374182400'),
         call('OUTPUT_STATUS = specified'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
         call('SOURCE_SITE = WIPAC'),
diff --git a/tests/test_rate_limiter.py b/tests/test_rate_limiter.py
index ad5c0d3..74a1c45 100644
--- a/tests/test_rate_limiter.py
+++ b/tests/test_rate_limiter.py
@@ -1,6 +1,18 @@
 # test_rate_limiter.py
 """Unit tests for lta/rate_limiter.py."""

+# -----------------------------------------------------------------------------
+# reset prometheus registry for unit tests
+from prometheus_client import REGISTRY
+collectors = list(REGISTRY._collector_to_names.keys())
+for collector in collectors:
+    REGISTRY.unregister(collector)
+from prometheus_client import gc_collector, platform_collector, process_collector
+process_collector.ProcessCollector()
+platform_collector.PlatformCollector()
+gc_collector.GCCollector()
+# -----------------------------------------------------------------------------
+
 from typing import Dict
 from unittest.mock import AsyncMock, call, MagicMock
@@ -9,7 +21,7 @@
 from pytest_mock import MockerFixture
 from tornado.web import HTTPError

-from lta.rate_limiter import main, RateLimiter
+from lta.rate_limiter import main_sync, RateLimiter

 TestConfig = Dict[str, str]
@@ -30,6 +42,7 @@
         "OUTPUT_PATH": "/path/to/icecube/replicator/inbox",
         "OUTPUT_QUOTA": "12094627905536",  # 11 TiB
         "OUTPUT_STATUS": "staged",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -84,6 +97,7 @@ async def test_rate_limiter_logs_configuration(mocker: MockerFixture) -> None:
         "OUTPUT_PATH": "/path/to/icecube/replicator/inbox",
         "OUTPUT_QUOTA": "12094627905536",  # 11 TiB
         "OUTPUT_STATUS": "staged",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -106,6 +120,7 @@ async def test_rate_limiter_logs_configuration(mocker: MockerFixture) -> None:
         call('OUTPUT_PATH = /path/to/icecube/replicator/inbox'),
         call('OUTPUT_QUOTA = 12094627905536'),
         call('OUTPUT_STATUS = staged'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
         call('SOURCE_SITE = WIPAC'),
@@ -117,7 +132,7 @@ async def test_rate_limiter_logs_configuration(mocker: MockerFixture) -> None:


 @pytest.mark.asyncio
-async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
+async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
     """
     Verify RateLimiter component behavior when run as a script.
@@ -126,11 +141,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc
     """
     for key in config.keys():
        monkeypatch.setenv(key, config[key])
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    mock_work_loop = mocker.patch("lta.rate_limiter.work_loop")
-    main()
-    mock_event_loop.assert_called()
-    mock_work_loop.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.rate_limiter.main")
+    mock_shs = mocker.patch("lta.rate_limiter.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
diff --git a/tests/test_rest_server.py b/tests/test_rest_server.py
index c484f02..4865a63 100644
--- a/tests/test_rest_server.py
+++ b/tests/test_rest_server.py
@@ -18,7 +18,7 @@
 from rest_tools.utils import Auth

 from requests.exceptions import HTTPError

-from lta.rest_server import boolify, main, start, unique_id
+from lta.rest_server import boolify, main_sync, start, unique_id

 LtaCollection = Database[Dict[str, Any]]
 RestClientFactory = Callable[[str, float], RestClient]
@@ -38,6 +38,7 @@
     'LTA_MONGODB_HOST': 'localhost',
     'LTA_MONGODB_PORT': '27017',
     'OTEL_EXPORTER_OTLP_ENDPOINT': 'localhost:4317',
+    'PROMETHEUS_METRICS_PORT': '8090',
     'WIPACTEL_EXPORT_STDOUT': 'TRUE',
 }
 for k in CONFIG:
@@ -85,6 +86,7 @@ async def rest(monkeypatch: MonkeyPatch, port: int) -> AsyncGenerator[RestClient
     monkeypatch.setenv("LTA_REST_PORT", str(port))
     monkeypatch.setenv("LTA_SITE_CONFIG", "examples/site.json")
     monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")
+    monkeypatch.setenv("PROMETHEUS_METRICS_PORT", "8090")
     monkeypatch.setenv("WIPACTEL_EXPORT_STDOUT", "TRUE")

     s = start(debug=True)
@@ -276,15 +278,20 @@ async def test_transfer_request_pop(rest: RestClientFactory) -> None:


 @pytest.mark.asyncio
-async def test_script_main(mocker: MockerFixture) -> None:
+async def test_script_main_sync(mocker: MockerFixture) -> None:
     """Ensure that main sets up logging, starts a server, and runs the event loop."""
     mock_root_logger = mocker.patch("logging.basicConfig")
     mock_rest_server = mocker.patch("lta.rest_server.start")
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    main()
-    mock_root_logger.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.rest_server.main")
+    mock_shs = mocker.patch("lta.rest_server.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
     mock_rest_server.assert_called()
-    mock_event_loop.assert_called()
+    mock_root_logger.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
diff --git a/tests/test_site_move_verifier.py b/tests/test_site_move_verifier.py
index f2d7f53..af25bd2 100644
--- a/tests/test_site_move_verifier.py
+++ b/tests/test_site_move_verifier.py
@@ -1,6 +1,18 @@
 # test_site_move_verifier.py
 """Unit tests for lta/site_move_verifier.py."""

+# -----------------------------------------------------------------------------
+# reset prometheus registry for unit tests
+from prometheus_client import REGISTRY
+collectors = list(REGISTRY._collector_to_names.keys())
+for collector in collectors:
+    REGISTRY.unregister(collector)
+from prometheus_client import gc_collector, platform_collector, process_collector
+process_collector.ProcessCollector()
+platform_collector.PlatformCollector()
+gc_collector.GCCollector()
+# -----------------------------------------------------------------------------
+
 from typing import Dict
 from unittest.mock import AsyncMock, call, MagicMock
@@ -10,7 +22,7 @@
 from tornado.web import HTTPError

 from lta.site_move_verifier import as_nonempty_columns, discard_empty, MYQUOTA_ARGS, parse_myquota
-from lta.site_move_verifier import main, SiteMoveVerifier
+from lta.site_move_verifier import main_sync, SiteMoveVerifier
 from .test_util import ObjectLiteral

 TestConfig = Dict[str, str]
@@ -30,6 +42,7 @@ def config() -> TestConfig:
         "LTA_AUTH_OPENID_URL": "localhost:12345",
         "LTA_REST_URL": "localhost:12347",
         "OUTPUT_STATUS": "taping",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -161,6 +174,7 @@ async def test_site_move_verifier_logs_configuration(mocker: MockerFixture) -> N
         "LTA_AUTH_OPENID_URL": "localhost:12345",
         "LTA_REST_URL": "logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/",
         "OUTPUT_STATUS": "taping",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "WIPAC",
@@ -182,6 +196,7 @@ async def test_site_move_verifier_logs_configuration(mocker: MockerFixture) -> N
         call('LTA_AUTH_OPENID_URL = localhost:12345'),
         call('LTA_REST_URL = logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/'),
         call('OUTPUT_STATUS = taping'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
         call('SOURCE_SITE = WIPAC'),
@@ -194,7 +209,7 @@ async def test_site_move_verifier_logs_configuration(mocker: MockerFixture) -> N


 @pytest.mark.asyncio
-async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
+async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
     """
     Verify SiteMoveVerifier component behavior when run as a script.
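
Note that the REST server tests earlier set PROMETHEUS_METRICS_PORT to 8090 rather than the 8080 used by the worker components, presumably to keep the metrics listener clear of the API's own port. A sketch of the assumed two-port wiring (start and main are stand-ins for the patched lta.rest_server symbols; the argument plumbing is a guess):

    import asyncio

    from prometheus_client import start_http_server

    def start(debug: bool = False) -> None:
        """Stand-in for lta.rest_server.start (the tornado REST API)."""

    async def main() -> None:
        """Stand-in for the server's async runner."""

    def main_sync() -> None:
        start_http_server(8090)  # metrics port, distinct from the API port
        start(debug=False)
        asyncio.run(main())
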
@@ -203,11 +218,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc
     """
     for key in config.keys():
        monkeypatch.setenv(key, config[key])
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    mock_work_loop = mocker.patch("lta.site_move_verifier.work_loop")
-    main()
-    mock_event_loop.assert_called()
-    mock_work_loop.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.site_move_verifier.main")
+    mock_shs = mocker.patch("lta.site_move_verifier.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
diff --git a/tests/test_transfer_request_finisher.py b/tests/test_transfer_request_finisher.py
index 42e998b..d0dbf22 100644
--- a/tests/test_transfer_request_finisher.py
+++ b/tests/test_transfer_request_finisher.py
@@ -1,6 +1,18 @@
 # test_transfer_request_finisher.py
 """Unit tests for lta/transfer_request_finisher.py."""

+# -----------------------------------------------------------------------------
+# reset prometheus registry for unit tests
+from prometheus_client import REGISTRY
+collectors = list(REGISTRY._collector_to_names.keys())
+for collector in collectors:
+    REGISTRY.unregister(collector)
+from prometheus_client import gc_collector, platform_collector, process_collector
+process_collector.ProcessCollector()
+platform_collector.PlatformCollector()
+gc_collector.GCCollector()
+# -----------------------------------------------------------------------------
+
 from typing import Dict
 from unittest.mock import AsyncMock, call
@@ -9,7 +21,7 @@
 from pytest_mock import MockerFixture
 from tornado.web import HTTPError

-from lta.transfer_request_finisher import main, TransferRequestFinisher
+from lta.transfer_request_finisher import main_sync, TransferRequestFinisher

 TestConfig = Dict[str, str]
@@ -27,6 +39,7 @@ def config() -> TestConfig:
         "LTA_AUTH_OPENID_URL": "localhost:12345",
         "LTA_REST_URL": "localhost:12347",
         "OUTPUT_STATUS": "finished",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUCIO_PASSWORD": "hunter2",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
@@ -73,6 +86,7 @@ async def test_transfer_request_finisher_logs_configuration(mocker: MockerFixtur
         "LTA_AUTH_OPENID_URL": "localhost:12345",
         "LTA_REST_URL": "logme-http://zjwdm5ggeEgS1tZDZy9l1DOZU53uiSO4Urmyb8xL0.com/",
         "OUTPUT_STATUS": "finished",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUCIO_PASSWORD": "hunter3-electric-boogaloo",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
@@ -94,6 +108,7 @@ async def test_transfer_request_finisher_logs_configuration(mocker: MockerFixtur
         call('LTA_AUTH_OPENID_URL = localhost:12345'),
         call('LTA_REST_URL = logme-http://zjwdm5ggeEgS1tZDZy9l1DOZU53uiSO4Urmyb8xL0.com/'),
         call('OUTPUT_STATUS = finished'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RUCIO_PASSWORD = hunter3-electric-boogaloo'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
@@ -107,7 +122,7 @@ async def test_transfer_request_finisher_logs_configuration(mocker: MockerFixtur


 @pytest.mark.asyncio
-async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
+async def test_script_main_sync(config: TestConfig, mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
     """
     Verify TransferRequestFinisher component behavior when run as a script.
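
The hunk below also drops the old new_callable=AsyncMock argument; it is no longer needed because patch() selects AsyncMock on its own whenever the patched attribute is an async function (Python 3.8+):

    from unittest.mock import AsyncMock, patch

    async def work_loop() -> None:
        """Illustrative async target, not LTA's."""

    with patch(f"{__name__}.work_loop") as m:
        assert isinstance(m, AsyncMock)  # chosen automatically
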
@@ -116,11 +131,14 @@ async def test_script_main(config: TestConfig, mocker: MockerFixture, monkeypatc
     """
     for key in config.keys():
        monkeypatch.setenv(key, config[key])
-    mock_event_loop = mocker.patch("asyncio.get_event_loop")
-    mock_work_loop = mocker.patch("lta.transfer_request_finisher.work_loop", new_callable=AsyncMock)
-    main()
-    mock_event_loop.assert_called()
-    mock_work_loop.assert_called()
+    mock_run = mocker.patch("asyncio.run")
+    mock_main = mocker.patch("lta.transfer_request_finisher.main")
+    mock_shs = mocker.patch("lta.transfer_request_finisher.start_http_server")
+    main_sync()
+    mock_shs.assert_called()
+    mock_main.assert_called()
+    mock_run.assert_called()
+    await mock_run.call_args.args[0]


 @pytest.mark.asyncio
diff --git a/tests/test_unpacker.py b/tests/test_unpacker.py
index 0b70712..247fbe0 100644
--- a/tests/test_unpacker.py
+++ b/tests/test_unpacker.py
@@ -1,6 +1,18 @@
 # test_unpacker.py
 """Unit tests for lta/unpacker.py."""

+# -----------------------------------------------------------------------------
+# reset prometheus registry for unit tests
+from prometheus_client import REGISTRY
+collectors = list(REGISTRY._collector_to_names.keys())
+for collector in collectors:
+    REGISTRY.unregister(collector)
+from prometheus_client import gc_collector, platform_collector, process_collector
+process_collector.ProcessCollector()
+platform_collector.PlatformCollector()
+gc_collector.GCCollector()
+# -----------------------------------------------------------------------------
+
 from typing import Any, Dict
 from unittest.mock import AsyncMock, call, MagicMock, mock_open, patch
@@ -33,6 +45,7 @@ def config() -> TestConfig:
         "LTA_REST_URL": "localhost:12347",
         "OUTPUT_STATUS": "completed",
         "PATH_MAP_JSON": "/tmp/lta/testing/path_map.json",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "NERSC",
@@ -126,7 +139,9 @@ async def test_script_main_sync(config: TestConfig,
        monkeypatch.setenv(key, config[key])
     mock_run = mocker.patch("asyncio.run")
     mock_main = mocker.patch("lta.unpacker.main")
+    mock_prometheus = mocker.patch("lta.unpacker.start_http_server")
     main_sync()
+    mock_prometheus.assert_called_with(8080)
     mock_main.assert_called()
     mock_run.assert_called()
     await mock_run.call_args.args[0]
@@ -169,6 +184,7 @@ async def test_unpacker_logs_configuration(mocker: MockerFixture, path_map_mock:
         "LTA_REST_URL": "logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/",
         "OUTPUT_STATUS": "completed",
         "PATH_MAP_JSON": "logme/tmp/lta/testing/path_map.json",
+        "PROMETHEUS_METRICS_PORT": "8080",
         "RUN_ONCE_AND_DIE": "False",
         "RUN_UNTIL_NO_WORK": "False",
         "SOURCE_SITE": "NERSC",
@@ -195,6 +211,7 @@ async def test_unpacker_logs_configuration(mocker: MockerFixture, path_map_mock:
         call('LTA_REST_URL = logme-http://RmMNHdPhHpH2ZxfaFAC9d2jiIbf5pZiHDqy43rFLQiM.com/'),
         call('OUTPUT_STATUS = completed'),
         call('PATH_MAP_JSON = logme/tmp/lta/testing/path_map.json'),
+        call('PROMETHEUS_METRICS_PORT = 8080'),
         call('RUN_ONCE_AND_DIE = False'),
         call('RUN_UNTIL_NO_WORK = False'),
         call('SOURCE_SITE = NERSC'),