From 6b800b4c0320f554eee2226b818c13652894cf23 Mon Sep 17 00:00:00 2001 From: xxchan Date: Fri, 13 Dec 2024 17:51:39 +0800 Subject: [PATCH] feat: add actor info to diagnose & dump diagnose on CI failure (#19787) Signed-off-by: xxchan --- Makefile.toml | 15 +++++- ci/Dockerfile | 2 +- ci/build-ci-image.sh | 2 +- ci/docker-compose.yml | 12 ++--- ci/scripts/common.sh | 8 ++- ci/scripts/e2e-source-test.sh | 4 +- ci/scripts/e2e-test-parallel-for-opendal.sh | 4 +- ci/scripts/e2e-test-parallel-in-memory.sh | 4 +- ci/scripts/e2e-test-parallel.sh | 6 +-- e2e_test/source_inline/README.md | 2 +- .../kafka/protobuf/alter_source_shared.slt | 4 +- src/meta/src/controller/fragment.rs | 22 +++++++- src/meta/src/manager/diagnose.rs | 53 ++++++++++++++++++- src/risedevtool/src/risedev_env.rs | 8 +++ 14 files changed, 122 insertions(+), 24 deletions(-) diff --git a/Makefile.toml b/Makefile.toml index 400db75196108..89814dbe36ae5 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -196,6 +196,19 @@ script = ''' watch -n 1 "${TMUX} list-windows -t risedev | grep -v active | cut -d'(' -f1" ''' +[tasks.diagnose] +category = "Misc" +description = "Dump diagnose info" +dependencies = ["check-and-load-risedev-env-file"] +script = ''' +#!/usr/bin/env bash +set -e +file_name=${PREFIX_LOG}/diagnose-$(date -u +%Y-%m-%dT%H:%M:%SZ).txt +curl -s ${RISEDEV_RW_META_DASHBOARD_ADDR}/api/monitor/diagnose/ > ${file_name} +echo "Diagnose info has been dumped to ${file_name}" +''' + + [tasks.del] alias = "delete" @@ -1327,7 +1340,7 @@ echo "All processes has exited." [tasks.slt] category = "RiseDev - Test - SQLLogicTest" -install_crate = { min_version = "0.21.0", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [ +install_crate = { min_version = "0.23.1", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [ "--help", ], install_command = "binstall" } dependencies = ["check-and-load-risedev-env-file"] diff --git a/ci/Dockerfile b/ci/Dockerfile index 88fe2519252e9..21b9c30c678b0 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -70,7 +70,7 @@ ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse RUN curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash RUN cargo binstall -y --locked --no-symlinks cargo-llvm-cov cargo-nextest cargo-hakari cargo-sort cargo-cache cargo-audit \ cargo-make@0.37.9 \ - sqllogictest-bin@0.21.0 \ + sqllogictest-bin@0.23.1 \ sccache@0.7.4 \ && cargo cache -a \ && rm -rf "/root/.cargo/registry/index" \ diff --git a/ci/build-ci-image.sh b/ci/build-ci-image.sh index e08fc699df88f..86392d2ecd7d8 100755 --- a/ci/build-ci-image.sh +++ b/ci/build-ci-image.sh @@ -10,7 +10,7 @@ cat ../rust-toolchain # shellcheck disable=SC2155 # REMEMBER TO ALSO UPDATE ci/docker-compose.yml -export BUILD_ENV_VERSION=v20241030 +export BUILD_ENV_VERSION=v20241213 export BUILD_TAG="public.ecr.aws/w1p7b4n3/rw-build-env:${BUILD_ENV_VERSION}" diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index 636a18860af35..a9cb770e9d024 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -69,7 +69,7 @@ services: retries: 5 source-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213 depends_on: - mysql - sqlserver-server @@ -84,7 +84,7 @@ services: - ..:/risingwave sink-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213 depends_on: - mysql - db @@ -106,7 +106,7 @@ services: - ..:/risingwave rw-build-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213 volumes: - ..:/risingwave @@ -119,14 +119,14 @@ services: - ..:/risingwave iceberg-engine-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213 depends_on: - db volumes: - ..:/risingwave ci-flamegraph-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213 # NOTE(kwannoel): This is used in order to permit # syscalls for `nperf` (perf_event_open), # so it can do CPU profiling. @@ -137,7 +137,7 @@ services: - ..:/risingwave regress-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213 depends_on: db: condition: service_healthy diff --git a/ci/scripts/common.sh b/ci/scripts/common.sh index 4ced7b94b7674..b3e09bd607b28 100755 --- a/ci/scripts/common.sh +++ b/ci/scripts/common.sh @@ -19,6 +19,12 @@ export RW_SECRET_STORE_PRIVATE_KEY_HEX="0123456789abcdef0123456789abcdef" export RUST_MIN_STACK=4194304 unset LANG + +function dump_diagnose_info() { + ./risedev diagnose || true +} +trap dump_diagnose_info EXIT + if [ -n "${BUILDKITE_COMMIT:-}" ]; then export GIT_SHA=$BUILDKITE_COMMIT fi @@ -148,4 +154,4 @@ check_link_info() { echo "libssl should not be dynamically linked" exit 1 fi -} \ No newline at end of file +} diff --git a/ci/scripts/e2e-source-test.sh b/ci/scripts/e2e-source-test.sh index 88090c50d9aef..a52e7be793911 100755 --- a/ci/scripts/e2e-source-test.sh +++ b/ci/scripts/e2e-source-test.sh @@ -38,7 +38,7 @@ apt-get -y install jq echo "--- e2e, inline test" RUST_LOG="debug,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info,risingwave_meta=info" \ risedev ci-start ci-inline-source-test -risedev slt './e2e_test/source_inline/**/*.slt' -j16 +risedev slt './e2e_test/source_inline/**/*.slt' --keep-db-on-failure -j16 risedev slt './e2e_test/source_inline/**/*.slt.serial' echo "--- Kill cluster" risedev ci-kill @@ -172,4 +172,4 @@ sleep 20 risedev slt "e2e_test/webhook/webhook_source_recovery.slt" risedev ci-kill -echo "--- cluster killed " \ No newline at end of file +echo "--- cluster killed " diff --git a/ci/scripts/e2e-test-parallel-for-opendal.sh b/ci/scripts/e2e-test-parallel-for-opendal.sh index a6a6c89e41164..b880ecbb106d0 100755 --- a/ci/scripts/e2e-test-parallel-for-opendal.sh +++ b/ci/scripts/e2e-test-parallel-for-opendal.sh @@ -31,7 +31,7 @@ host_args=(-h localhost -p 4565 -h localhost -p 4566 -h localhost -p 4567) echo "--- e2e, ci-3cn-3fe-opendal-fs-backend, streaming" RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \ risedev ci-start ci-3cn-3fe-opendal-fs-backend -sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-opendal-fs-backend-${profile}" --label "parallel" +sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-opendal-fs-backend-${profile}" --label "parallel" echo "--- Kill cluster Streaming" risedev ci-kill @@ -42,7 +42,7 @@ echo "--- e2e, ci-3cn-3fe-opendal-fs-backend, batch" RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \ risedev ci-start ci-3cn-3fe-opendal-fs-backend sqllogictest "${host_args[@]}" -d dev './e2e_test/ddl/**/*.slt' --junit "parallel-opendal-fs-backend-ddl-${profile}" --label "parallel" -sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' -j 16 --junit "parallel-opendal-fs-backend-batch-${profile}" --label "parallel" +sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' --keep-db-on-failure -j 16 --junit "parallel-opendal-fs-backend-batch-${profile}" --label "parallel" echo "--- Kill cluster Batch" risedev ci-kill diff --git a/ci/scripts/e2e-test-parallel-in-memory.sh b/ci/scripts/e2e-test-parallel-in-memory.sh index f7e6292bed54e..407fd3f61feca 100755 --- a/ci/scripts/e2e-test-parallel-in-memory.sh +++ b/ci/scripts/e2e-test-parallel-in-memory.sh @@ -28,7 +28,7 @@ host_args=(-h localhost -p 4565 -h localhost -p 4566 -h localhost -p 4567) echo "--- e2e, ci-3cn-3fe-in-memory, streaming" risedev ci-start ci-3cn-3fe-in-memory sqllogictest --version -sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-in-memory-streaming-${profile}" --label "in-memory" --label "parallel" +sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-in-memory-streaming-${profile}" --label "in-memory" --label "parallel" echo "--- Kill cluster" risedev ci-kill @@ -36,7 +36,7 @@ risedev ci-kill echo "--- e2e, ci-3cn-3fe-in-memory, batch" risedev ci-start ci-3cn-3fe-in-memory sqllogictest "${host_args[@]}" -d dev './e2e_test/ddl/**/*.slt' --junit "parallel-in-memory-batch-ddl-${profile}" --label "in-memory" --label "parallel" -sqllogictest "${host_args[@]}" -d dev './e2e_test/batch/**/*.slt' -j 16 --junit "parallel-in-memory-batch-${profile}" --label "in-memory" --label "parallel" +sqllogictest "${host_args[@]}" -d dev './e2e_test/batch/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-in-memory-batch-${profile}" --label "in-memory" --label "parallel" echo "--- Kill cluster" risedev ci-kill diff --git a/ci/scripts/e2e-test-parallel.sh b/ci/scripts/e2e-test-parallel.sh index effe8e364dd24..c455764deb671 100755 --- a/ci/scripts/e2e-test-parallel.sh +++ b/ci/scripts/e2e-test-parallel.sh @@ -38,7 +38,7 @@ RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=i echo "--- e2e, ci-3streaming-2serving-3fe, streaming" RUST_LOG=$RUST_LOG \ risedev ci-start ci-3streaming-2serving-3fe -sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-streaming-${profile}" --label "parallel" +sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-streaming-${profile}" --label "parallel" kill_cluster @@ -47,13 +47,13 @@ RUST_LOG=$RUST_LOG \ risedev ci-start ci-3streaming-2serving-3fe # Exclude files that contain ALTER SYSTEM commands find ./e2e_test/ddl -name "*.slt" -type f -exec grep -L "ALTER SYSTEM" {} \; | xargs -r sqllogictest "${host_args[@]}" -d dev --junit "parallel-batch-ddl-${profile}" --label "parallel" -sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' -j 16 --junit "parallel-batch-${profile}" --label "parallel" +sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' --keep-db-on-failure -j 16 --junit "parallel-batch-${profile}" --label "parallel" kill_cluster echo "--- e2e, ci-3streaming-2serving-3fe, generated" RUST_LOG=$RUST_LOG \ risedev ci-start ci-3streaming-2serving-3fe -sqllogictest "${host_args[@]}" -d dev './e2e_test/generated/**/*.slt' -j 16 --junit "parallel-generated-${profile}" --label "parallel" +sqllogictest "${host_args[@]}" -d dev './e2e_test/generated/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-generated-${profile}" --label "parallel" kill_cluster diff --git a/e2e_test/source_inline/README.md b/e2e_test/source_inline/README.md index fa8cf25c56b0f..c5b938b45caca 100644 --- a/e2e_test/source_inline/README.md +++ b/e2e_test/source_inline/README.md @@ -13,7 +13,7 @@ Other tests can be run in parallel. ```bash # run all parallel tests -risedev slt './e2e_test/source_inline/**/*.slt' -j16 +risedev slt './e2e_test/source_inline/**/*.slt' --keep-db-on-failure -j16 # run all serial tests risedev slt './e2e_test/source_inline/**/*.slt.serial' ``` diff --git a/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt b/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt index 46a7c1aad3d06..e4edec6d535dc 100644 --- a/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt +++ b/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt @@ -99,5 +99,5 @@ SELECT COUNT(*), MAX(age), MIN(age), SUM(age) FROM mv_user_more; 30 104 0 1020 -# statement ok -# DROP SOURCE src_user CASCADE; +statement ok +DROP SOURCE src_user CASCADE; diff --git a/src/meta/src/controller/fragment.rs b/src/meta/src/controller/fragment.rs index a35de428130cd..a9a536f0b394d 100644 --- a/src/meta/src/controller/fragment.rs +++ b/src/meta/src/controller/fragment.rs @@ -28,7 +28,8 @@ use risingwave_meta_model::prelude::{Actor, Fragment, Sink, StreamingJob}; use risingwave_meta_model::{ actor, actor_dispatcher, fragment, object, sink, source, streaming_job, table, ActorId, ActorUpstreamActors, ConnectorSplits, DatabaseId, ExprContext, FragmentId, I32Array, JobStatus, - ObjectId, SinkId, SourceId, StreamNode, StreamingParallelism, TableId, VnodeBitmap, WorkerId, + ObjectId, SchemaId, SinkId, SourceId, StreamNode, StreamingParallelism, TableId, VnodeBitmap, + WorkerId, }; use risingwave_meta_model_migration::{Alias, SelectStatement}; use risingwave_pb::common::PbActorLocation; @@ -882,6 +883,25 @@ impl CatalogController { Ok(actor_locations) } + pub async fn list_actor_info( + &self, + ) -> MetaResult> { + let inner = self.inner.read().await; + let actor_locations: Vec<(ActorId, FragmentId, ObjectId, SchemaId, ObjectType)> = + Actor::find() + .join(JoinType::LeftJoin, actor::Relation::Fragment.def()) + .join(JoinType::LeftJoin, fragment::Relation::Object.def()) + .select_only() + .columns([actor::Column::ActorId, actor::Column::FragmentId]) + .column_as(object::Column::Oid, "job_id") + .column_as(object::Column::SchemaId, "schema_id") + .column_as(object::Column::ObjType, "type") + .into_tuple() + .all(&inner.db) + .await?; + Ok(actor_locations) + } + pub async fn list_source_actors(&self) -> MetaResult> { let inner = self.inner.read().await; diff --git a/src/meta/src/manager/diagnose.rs b/src/meta/src/manager/diagnose.rs index 33a97537ab467..062483374aa31 100644 --- a/src/meta/src/manager/diagnose.rs +++ b/src/meta/src/manager/diagnose.rs @@ -13,7 +13,7 @@ // limitations under the License. use std::cmp::{Ordering, Reverse}; -use std::collections::{BTreeMap, BinaryHeap}; +use std::collections::{BTreeMap, BinaryHeap, HashMap}; use std::fmt::Write; use std::sync::Arc; @@ -677,6 +677,7 @@ impl DiagnoseCommand { ("INDEX", indexes), ("SINK", sinks), ]; + let mut obj_id_to_name = HashMap::new(); for (title, items) in catalogs { use comfy_table::{Row, Table}; let mut table = Table::new(); @@ -689,6 +690,7 @@ impl DiagnoseCommand { row }); for (id, (name, schema_id, definition)) in items { + obj_id_to_name.insert(id, name.clone()); let mut row = Row::new(); let may_redact = redact_all_sql_options(&definition).unwrap_or_else(|| "[REDACTED]".into()); @@ -702,6 +704,55 @@ impl DiagnoseCommand { let _ = writeln!(s, "{title}"); let _ = writeln!(s, "{table}"); } + + let actors = self + .metadata_manager + .catalog_controller + .list_actor_info() + .await? + .into_iter() + .map(|(actor_id, fragment_id, job_id, schema_id, obj_type)| { + ( + actor_id, + ( + fragment_id, + job_id, + schema_id, + obj_type, + obj_id_to_name + .get(&(job_id as _)) + .cloned() + .unwrap_or_default(), + ), + ) + }) + .collect::>(); + + use comfy_table::{Row, Table}; + let mut table = Table::new(); + table.set_header({ + let mut row = Row::new(); + row.add_cell("id".into()); + row.add_cell("fragment_id".into()); + row.add_cell("job_id".into()); + row.add_cell("schema_id".into()); + row.add_cell("type".into()); + row.add_cell("name".into()); + row + }); + for (actor_id, (fragment_id, job_id, schema_id, ddl_type, name)) in actors { + let mut row = Row::new(); + row.add_cell(actor_id.into()); + row.add_cell(fragment_id.into()); + row.add_cell(job_id.into()); + row.add_cell(schema_id.into()); + row.add_cell(ddl_type.as_str().into()); + row.add_cell(name.into()); + table.add_row(row); + } + let _ = writeln!(s); + let _ = writeln!(s, "ACTOR"); + let _ = writeln!(s, "{table}"); Ok(()) } } diff --git a/src/risedevtool/src/risedev_env.rs b/src/risedevtool/src/risedev_env.rs index 5d7c113b6bc30..c955cb9a3fbc8 100644 --- a/src/risedevtool/src/risedev_env.rs +++ b/src/risedevtool/src/risedev_env.rs @@ -136,6 +136,14 @@ pub fn generate_risedev_env(services: &Vec) -> String { ) .unwrap(); } + ServiceConfig::MetaNode(meta_node_config) => { + writeln!( + env, + r#"RISEDEV_RW_META_DASHBOARD_ADDR="http://{}:{}""#, + meta_node_config.address, meta_node_config.dashboard_port + ) + .unwrap(); + } _ => {} } }