Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add actor info to diagnose & dump diagnose on CI failure #19787

Merged
merged 9 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,19 @@ script = '''
watch -n 1 "${TMUX} list-windows -t risedev | grep -v active | cut -d'(' -f1"
'''

[tasks.diagnose]
category = "Misc"
description = "Dump diagnose info"
dependencies = ["check-and-load-risedev-env-file"]
script = '''
#!/usr/bin/env bash
set -e
file_name=${PREFIX_LOG}/diagnose-$(date -u +%Y-%m-%dT%H:%M:%SZ).txt
curl -s ${RISEDEV_RW_META_DASHBOARD_ADDR}/api/monitor/diagnose/ > ${file_name}
echo "Diagnose info has been dumped to ${file_name}"
'''


[tasks.del]
alias = "delete"

Expand Down Expand Up @@ -1327,7 +1340,7 @@ echo "All processes has exited."

[tasks.slt]
category = "RiseDev - Test - SQLLogicTest"
install_crate = { min_version = "0.21.0", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [
install_crate = { min_version = "0.23.1", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [
"--help",
], install_command = "binstall" }
dependencies = ["check-and-load-risedev-env-file"]
Expand Down
2 changes: 1 addition & 1 deletion ci/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
RUN curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash
RUN cargo binstall -y --locked --no-symlinks cargo-llvm-cov cargo-nextest cargo-hakari cargo-sort cargo-cache cargo-audit \
[email protected] \
sqllogictest-bin@0.21.0 \
sqllogictest-bin@0.23.1 \
[email protected] \
&& cargo cache -a \
&& rm -rf "/root/.cargo/registry/index" \
Expand Down
2 changes: 1 addition & 1 deletion ci/build-ci-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ cat ../rust-toolchain
# shellcheck disable=SC2155

# REMEMBER TO ALSO UPDATE ci/docker-compose.yml
export BUILD_ENV_VERSION=v20241030
export BUILD_ENV_VERSION=v20241213

export BUILD_TAG="public.ecr.aws/w1p7b4n3/rw-build-env:${BUILD_ENV_VERSION}"

Expand Down
12 changes: 6 additions & 6 deletions ci/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ services:
retries: 5

source-test-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
- mysql
- sqlserver-server
Expand All @@ -84,7 +84,7 @@ services:
- ..:/risingwave

sink-test-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
- mysql
- db
Expand All @@ -106,7 +106,7 @@ services:
- ..:/risingwave

rw-build-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
volumes:
- ..:/risingwave

Expand All @@ -119,14 +119,14 @@ services:
- ..:/risingwave

iceberg-engine-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
- db
volumes:
- ..:/risingwave

ci-flamegraph-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
# NOTE(kwannoel): This is used in order to permit
# syscalls for `nperf` (perf_event_open),
# so it can do CPU profiling.
Expand All @@ -137,7 +137,7 @@ services:
- ..:/risingwave

regress-test-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
db:
condition: service_healthy
Expand Down
8 changes: 7 additions & 1 deletion ci/scripts/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ export RW_SECRET_STORE_PRIVATE_KEY_HEX="0123456789abcdef0123456789abcdef"
export RUST_MIN_STACK=4194304

unset LANG

function dump_diagnose_info() {
./risedev diagnose || true
}
trap dump_diagnose_info EXIT

if [ -n "${BUILDKITE_COMMIT:-}" ]; then
export GIT_SHA=$BUILDKITE_COMMIT
fi
Expand Down Expand Up @@ -148,4 +154,4 @@ check_link_info() {
echo "libssl should not be dynamically linked"
exit 1
fi
}
}
4 changes: 2 additions & 2 deletions ci/scripts/e2e-source-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ apt-get -y install jq
echo "--- e2e, inline test"
RUST_LOG="debug,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info,risingwave_meta=info" \
risedev ci-start ci-inline-source-test
risedev slt './e2e_test/source_inline/**/*.slt' -j16
risedev slt './e2e_test/source_inline/**/*.slt' --keep-db-on-failure -j16
risedev slt './e2e_test/source_inline/**/*.slt.serial'
echo "--- Kill cluster"
risedev ci-kill
Expand Down Expand Up @@ -172,4 +172,4 @@ sleep 20
risedev slt "e2e_test/webhook/webhook_source_recovery.slt"

risedev ci-kill
echo "--- cluster killed "
echo "--- cluster killed "
4 changes: 2 additions & 2 deletions ci/scripts/e2e-test-parallel-for-opendal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ host_args=(-h localhost -p 4565 -h localhost -p 4566 -h localhost -p 4567)
echo "--- e2e, ci-3cn-3fe-opendal-fs-backend, streaming"
RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \
risedev ci-start ci-3cn-3fe-opendal-fs-backend
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-opendal-fs-backend-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-opendal-fs-backend-${profile}" --label "parallel"

echo "--- Kill cluster Streaming"
risedev ci-kill
Expand All @@ -42,7 +42,7 @@ echo "--- e2e, ci-3cn-3fe-opendal-fs-backend, batch"
RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \
risedev ci-start ci-3cn-3fe-opendal-fs-backend
sqllogictest "${host_args[@]}" -d dev './e2e_test/ddl/**/*.slt' --junit "parallel-opendal-fs-backend-ddl-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' -j 16 --junit "parallel-opendal-fs-backend-batch-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' --keep-db-on-failure -j 16 --junit "parallel-opendal-fs-backend-batch-${profile}" --label "parallel"

echo "--- Kill cluster Batch"
risedev ci-kill
Expand Down
4 changes: 2 additions & 2 deletions ci/scripts/e2e-test-parallel-in-memory.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ host_args=(-h localhost -p 4565 -h localhost -p 4566 -h localhost -p 4567)
echo "--- e2e, ci-3cn-3fe-in-memory, streaming"
risedev ci-start ci-3cn-3fe-in-memory
sqllogictest --version
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-in-memory-streaming-${profile}" --label "in-memory" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-in-memory-streaming-${profile}" --label "in-memory" --label "parallel"

echo "--- Kill cluster"
risedev ci-kill

echo "--- e2e, ci-3cn-3fe-in-memory, batch"
risedev ci-start ci-3cn-3fe-in-memory
sqllogictest "${host_args[@]}" -d dev './e2e_test/ddl/**/*.slt' --junit "parallel-in-memory-batch-ddl-${profile}" --label "in-memory" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/batch/**/*.slt' -j 16 --junit "parallel-in-memory-batch-${profile}" --label "in-memory" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/batch/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-in-memory-batch-${profile}" --label "in-memory" --label "parallel"

echo "--- Kill cluster"
risedev ci-kill
6 changes: 3 additions & 3 deletions ci/scripts/e2e-test-parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=i
echo "--- e2e, ci-3streaming-2serving-3fe, streaming"
RUST_LOG=$RUST_LOG \
risedev ci-start ci-3streaming-2serving-3fe
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-streaming-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-streaming-${profile}" --label "parallel"

kill_cluster

Expand All @@ -47,13 +47,13 @@ RUST_LOG=$RUST_LOG \
risedev ci-start ci-3streaming-2serving-3fe
# Exclude files that contain ALTER SYSTEM commands
find ./e2e_test/ddl -name "*.slt" -type f -exec grep -L "ALTER SYSTEM" {} \; | xargs -r sqllogictest "${host_args[@]}" -d dev --junit "parallel-batch-ddl-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' -j 16 --junit "parallel-batch-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' --keep-db-on-failure -j 16 --junit "parallel-batch-${profile}" --label "parallel"

kill_cluster

echo "--- e2e, ci-3streaming-2serving-3fe, generated"
RUST_LOG=$RUST_LOG \
risedev ci-start ci-3streaming-2serving-3fe
sqllogictest "${host_args[@]}" -d dev './e2e_test/generated/**/*.slt' -j 16 --junit "parallel-generated-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/generated/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-generated-${profile}" --label "parallel"

kill_cluster
2 changes: 1 addition & 1 deletion e2e_test/source_inline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Other tests can be run in parallel.

```bash
# run all parallel tests
risedev slt './e2e_test/source_inline/**/*.slt' -j16
risedev slt './e2e_test/source_inline/**/*.slt' --keep-db-on-failure -j16
# run all serial tests
risedev slt './e2e_test/source_inline/**/*.slt.serial'
```
Expand Down
4 changes: 2 additions & 2 deletions e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,5 @@ SELECT COUNT(*), MAX(age), MIN(age), SUM(age) FROM mv_user_more;
30 104 0 1020


# statement ok
# DROP SOURCE src_user CASCADE;
statement ok
DROP SOURCE src_user CASCADE;
22 changes: 21 additions & 1 deletion src/meta/src/controller/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ use risingwave_meta_model::prelude::{Actor, Fragment, Sink, StreamingJob};
use risingwave_meta_model::{
actor, actor_dispatcher, fragment, object, sink, source, streaming_job, table, ActorId,
ActorUpstreamActors, ConnectorSplits, DatabaseId, ExprContext, FragmentId, I32Array, JobStatus,
ObjectId, SinkId, SourceId, StreamNode, StreamingParallelism, TableId, VnodeBitmap, WorkerId,
ObjectId, SchemaId, SinkId, SourceId, StreamNode, StreamingParallelism, TableId, VnodeBitmap,
WorkerId,
};
use risingwave_meta_model_migration::{Alias, SelectStatement};
use risingwave_pb::common::PbActorLocation;
Expand Down Expand Up @@ -882,6 +883,25 @@ impl CatalogController {
Ok(actor_locations)
}

pub async fn list_actor_info(
&self,
) -> MetaResult<Vec<(ActorId, FragmentId, ObjectId, SchemaId, ObjectType)>> {
let inner = self.inner.read().await;
let actor_locations: Vec<(ActorId, FragmentId, ObjectId, SchemaId, ObjectType)> =
Actor::find()
.join(JoinType::LeftJoin, actor::Relation::Fragment.def())
.join(JoinType::LeftJoin, fragment::Relation::Object.def())
.select_only()
.columns([actor::Column::ActorId, actor::Column::FragmentId])
.column_as(object::Column::Oid, "job_id")
.column_as(object::Column::SchemaId, "schema_id")
.column_as(object::Column::ObjType, "type")
.into_tuple()
.all(&inner.db)
.await?;
Ok(actor_locations)
}

pub async fn list_source_actors(&self) -> MetaResult<Vec<(ActorId, FragmentId)>> {
let inner = self.inner.read().await;

Expand Down
53 changes: 52 additions & 1 deletion src/meta/src/manager/diagnose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
// limitations under the License.

use std::cmp::{Ordering, Reverse};
use std::collections::{BTreeMap, BinaryHeap};
use std::collections::{BTreeMap, BinaryHeap, HashMap};
use std::fmt::Write;
use std::sync::Arc;

Expand Down Expand Up @@ -677,6 +677,7 @@ impl DiagnoseCommand {
("INDEX", indexes),
("SINK", sinks),
];
let mut obj_id_to_name = HashMap::new();
for (title, items) in catalogs {
use comfy_table::{Row, Table};
let mut table = Table::new();
Expand All @@ -689,6 +690,7 @@ impl DiagnoseCommand {
row
});
for (id, (name, schema_id, definition)) in items {
obj_id_to_name.insert(id, name.clone());
let mut row = Row::new();
let may_redact =
redact_all_sql_options(&definition).unwrap_or_else(|| "[REDACTED]".into());
Expand All @@ -702,6 +704,55 @@ impl DiagnoseCommand {
let _ = writeln!(s, "{title}");
let _ = writeln!(s, "{table}");
}

let actors = self
.metadata_manager
.catalog_controller
.list_actor_info()
.await?
.into_iter()
.map(|(actor_id, fragment_id, job_id, schema_id, obj_type)| {
(
actor_id,
(
fragment_id,
job_id,
schema_id,
obj_type,
obj_id_to_name
.get(&(job_id as _))
.cloned()
.unwrap_or_default(),
),
)
})
.collect::<BTreeMap<_, _>>();

use comfy_table::{Row, Table};
let mut table = Table::new();
table.set_header({
let mut row = Row::new();
row.add_cell("id".into());
row.add_cell("fragment_id".into());
row.add_cell("job_id".into());
row.add_cell("schema_id".into());
row.add_cell("type".into());
row.add_cell("name".into());
row
});
for (actor_id, (fragment_id, job_id, schema_id, ddl_type, name)) in actors {
let mut row = Row::new();
row.add_cell(actor_id.into());
row.add_cell(fragment_id.into());
row.add_cell(job_id.into());
row.add_cell(schema_id.into());
row.add_cell(ddl_type.as_str().into());
row.add_cell(name.into());
table.add_row(row);
}
let _ = writeln!(s);
let _ = writeln!(s, "ACTOR");
let _ = writeln!(s, "{table}");
Ok(())
}
}
Expand Down
8 changes: 8 additions & 0 deletions src/risedevtool/src/risedev_env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,14 @@ pub fn generate_risedev_env(services: &Vec<ServiceConfig>) -> String {
)
.unwrap();
}
ServiceConfig::MetaNode(meta_node_config) => {
writeln!(
env,
r#"RISEDEV_RW_META_DASHBOARD_ADDR="http://{}:{}""#,
meta_node_config.address, meta_node_config.dashboard_port
)
.unwrap();
}
_ => {}
}
}
Expand Down
Loading