From b264719462d8e2038f503b6891f0fc78c064767c Mon Sep 17 00:00:00 2001
From: Shaharuk Shaikh <56402576+shaharuk-yb@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:14:09 +0530
Subject: [PATCH] perf-data-loader: db load order and fixed utility function mapping for text/string PK data types (#142)

* using PrimaryStringGen for text/string datatype with primary key

* added subcommand to generate load order of the entire table
---
 PERF-DATALOADER.md                            |  10 +-
 perf-data-loader                              |  37 ++-
 .../dataloader/DataGeneratorLoader.java       | 224 ++++++++++++------
 .../perf-dataloader/pk-mapping.properties     |   6 +-
 4 files changed, 196 insertions(+), 81 deletions(-)

diff --git a/PERF-DATALOADER.md b/PERF-DATALOADER.md
index 4f5126084..549f31a5a 100644
--- a/PERF-DATALOADER.md
+++ b/PERF-DATALOADER.md
@@ -18,7 +18,7 @@ cd benchbase
 ### How to use:
 ```
 #$./perf-data-loader --help
-Usage: ./perf-data-loader --config <config-file> --table-name <table-name> --rows <rows> [--gen-config-only] [--load-only]
+Usage: ./perf-data-loader --config <config-file> --table-name <table-name> --rows <rows> [--gen-config-only] [--load-only] [--gen-load-order]
 Short forms: -c <config-file> -t <table-name> -r <rows>
 Options:
   -c, --config        Configuration file
@@ -26,6 +26,7 @@ Options:
   -r, --rows          Number of rows
   --gen-config-only   Only generate the loader/config file
   --load-only         Only load data into the database
+  --gen-load-order    Generate table load order in the provided database
   -h, --help          Display this help message
 ```
 - to only generate the loader file (skip the actual load). This will generate the yaml file <table-name>_loader.yaml which can be used in loading the data.
@@ -42,6 +43,13 @@ Options:
 ```
 ./perf-data-loader --config <config-file> --table-name <table-name> --rows <rows>
 ```
+- to generate the load order of all the tables in the database. It is useful to know the load order when there are
+foreign key references in the db schema. When used, this command will print out the load order/levels in which you can
+populate the tables. Start from the tables at level 0, then level 1, and so on. It also generates a load_order.json file
+containing the load order in json format.
+```shell
+./perf-data-loader -c <config-file> --gen-load-order
+```
 
 the input yaml file should have the following content
 ```
diff --git a/perf-data-loader b/perf-data-loader
index 3816fda51..5d1970c47 100755
--- a/perf-data-loader
+++ b/perf-data-loader
@@ -6,11 +6,12 @@ TABLE_NAME=""
 ROWS=""
 GENERATE_ONLY=false
 LOAD_ONLY=false
+GEN_LOAD_ORDER=false
 JAR_PATH=""
 
 # Function to display help
 function display_help {
-  echo "Usage: $0 --config <config-file> --table-name <table-name> --rows <rows> [--gen-config-only] [--load-only]"
+  echo "Usage: $0 --config <config-file> --table-name <table-name> --rows <rows> [--gen-config-only] [--load-only] [--gen-load-order]"
   echo "Short forms: -c <config-file> -t <table-name> -r <rows>"
   echo "Options:"
   echo "  -c, --config        Configuration file"
@@ -18,6 +19,7 @@ function display_help {
   echo "  -r, --rows          Number of rows"
   echo "  --gen-config-only   Only generate the loader/config file"
   echo "  --load-only         Only load data into the database"
+  echo "  --gen-load-order    Generate table load order in the provided database"
   echo "  -h, --help          Display this help message"
   exit 0
 }
@@ -72,6 +74,10 @@ while [[ $# -gt 0 ]]; do
       LOAD_ONLY=true
       shift # past argument
       ;;
+    --gen-load-order)
+      GEN_LOAD_ORDER=true
+      shift # past argument
+      ;;
     -h|--help)
       display_help
       ;;
@@ -83,8 +89,10 @@ while [[ $# -gt 0 ]]; do
 done
 
 # Ensure that both --gen-config-only and --load-only are not provided simultaneously
-if [ "$GENERATE_ONLY" = true ] && [ "$LOAD_ONLY" = true ]; then
-  echo "Error: Cannot use --gen-config-only and --load-only simultaneously."
+if { [ "$GENERATE_ONLY" = true ] && [ "$LOAD_ONLY" = true ]; } || \
+   { [ "$GENERATE_ONLY" = true ] && [ "$GEN_LOAD_ORDER" = true ]; } || \
+   { [ "$LOAD_ONLY" = true ] && [ "$GEN_LOAD_ORDER" = true ]; }; then
+  echo "Error: Only one of --gen-config-only, --load-only, or --gen-load-order can be used at a time."
   exit 1
 fi
 
@@ -111,6 +119,29 @@ if [ "$GENERATE_ONLY" = true ]; then
   exit 0
 fi
 
+if [ "$GEN_LOAD_ORDER" = true ]; then
+  if [ -z "$CONFIG" ]; then
+    echo "Error: --config parameter is required with --gen-load-order."
+    display_help
+  fi
+  # Create a temporary file
+  TEMP_CONFIG=$(mktemp)
+  # Copy the content of the original CONFIG file to the temporary file
+  cp "$CONFIG" "$TEMP_CONFIG"
+
+  # Append the gen-db-load-order flag to the temporary config file
+  echo >> "$TEMP_CONFIG"
+  echo "gen-db-load-order: true" >> "$TEMP_CONFIG"
+  echo "Generating load order based on the provided config file"
+  java -jar "$JAR_PATH" -b perf-dataloader -c "$TEMP_CONFIG" -p tableName="dummy" -p rows="1" --load=True
+
+  # Clean up: delete the temporary config file
+  rm -f "$TEMP_CONFIG"
+
+  exit 0
+fi
+
+
 # Check if required parameters are provided for the default operation
 if [ -z "$CONFIG" ] || [ -z "$TABLE_NAME" ] || [ -z "$ROWS" ]; then
   echo "Error: Missing required parameters."
diff --git a/src/main/java/com/oltpbenchmark/benchmarks/dataloader/DataGeneratorLoader.java b/src/main/java/com/oltpbenchmark/benchmarks/dataloader/DataGeneratorLoader.java
index fbff23d74..fea1a78c8 100644
--- a/src/main/java/com/oltpbenchmark/benchmarks/dataloader/DataGeneratorLoader.java
+++ b/src/main/java/com/oltpbenchmark/benchmarks/dataloader/DataGeneratorLoader.java
@@ -3,6 +3,7 @@
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.oltpbenchmark.api.Loader;
 import com.oltpbenchmark.api.LoaderThread;
+import com.oltpbenchmark.util.JSONUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.yaml.snakeyaml.DumperOptions;
@@ -275,12 +276,6 @@ public static List<ForeignKey> getForeignKeys(String tableName, Connection conn)
         return foreignKeyList;
     }
 
-    public static List<String> getParentTableHierarchy(String tableName, Connection conn) {
-        List<String> tableHierarchy = new ArrayList<>();
-        findParentTables(tableName, conn, tableHierarchy);
-        return tableHierarchy;
-    }
-
     public static void buildDependencyDAGForTable(Map<String, List<Dependency>> graph, List<ForeignKey> foreignKeyList, Connection conn) {
         // Build the adjacency list
         for (ForeignKey fk : foreignKeyList) {
@@ -292,83 +287,116 @@ public static void buildDependencyDAGForTable(Map<String, List<Dependency>> graph,
         }
     }
 
-    private static void findParentTables(String tableName, Connection conn, List<String> tableHierarchy) {
-        List<ForeignKey> foreignKeys = getForeignKeys(tableName, conn);
-
-        if (foreignKeys.isEmpty()) {
-            tableHierarchy.add(tableName);
-        } else {
-            for (ForeignKey foreignKey : foreignKeys) {
-                findParentTables(foreignKey.getForeignTableName(), conn, tableHierarchy);
-            }
-            tableHierarchy.add(tableName);
-        }
-    }
-
     @Override
     public List<LoaderThread> createLoaderThreads() throws SQLException {
         Connection conn = benchmark.makeConnection();
-        String tableName = workConf.getXmlConfig().getString("tablename");
-        int rows = workConf.getXmlConfig().getInt("rows");
+        boolean genLoadOrderOnly = workConf.getXmlConfig().getBoolean("gen-db-load-order", false);
+        if (genLoadOrderOnly) {
+            Set<String> processedTables = new HashSet<>();
+            Map<String, List<Dependency>> graph = new HashMap<>();
+            Set<String> visited = new HashSet<>();
+            StringBuilder loadOrder = new StringBuilder();
+            Map<Integer, List<String>> depth = new TreeMap<>();
+            Map<Integer, List<String>> levelAndTables = new LinkedHashMap<>();
+
buildGraph(conn, processedTables, graph); + + List independentTables = new ArrayList<>(); + List allDbTables = getAllTables(conn); + for(String table: allDbTables) { + if (!processedTables.contains(table.toLowerCase())) { + independentTables.add(table); + } + } - // check if the table exists in the database - checkIfTableExists(tableName, conn); - // get the table schema - List tableSchema = getTableSchema(tableName, conn); + if (!graph.isEmpty()) { + String startTable = graph.keySet().iterator().next(); + levelAndTables.putAll(getOrderOfImport(startTable, loadOrder, graph, depth, visited)); + levelAndTables.get(0).addAll(independentTables); + } else { + levelAndTables.put(0, independentTables); + } - // key primary key details - List primaryKeys = getPrimaryKeys(tableName, conn); + int totalTables = 0; + for (Map.Entry> entry : levelAndTables.entrySet()) { + int level = entry.getKey(); + List tablesAtLevel = entry.getValue(); + totalTables += tablesAtLevel.size(); + System.out.println("Level " + level + ": " + String.join(", ", tablesAtLevel)); + } - // get all unique constraints from the indexes - List uniqueConstraintColumns = getUniqueConstrains(tableName, conn); + System.out.println("Total number of tables: " + totalTables); + try { + FileWriter writer = new FileWriter("load_order.json"); + writer.write(JSONUtil.format(JSONUtil.toJSONString(levelAndTables))); + writer.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + } else { + String tableName = workConf.getXmlConfig().getString("tablename"); + int rows = workConf.getXmlConfig().getInt("rows"); + + // check if the table exists in the database + checkIfTableExists(tableName, conn); + // get the table schema + List tableSchema = getTableSchema(tableName, conn); + + // key primary key details + List primaryKeys = getPrimaryKeys(tableName, conn); + + // get all unique constraints from the indexes + List uniqueConstraintColumns = getUniqueConstrains(tableName, conn); - // get all columns with respective user defined ENUM data type - Map> udColumns = getUserDefinedEnumDataTypes(tableName, "public", conn); + // get all columns with respective user defined ENUM data type + Map> udColumns = getUserDefinedEnumDataTypes(tableName, "public", conn); - // get all foreign keys of the table - List foreignKeys = getForeignKeys(tableName, conn); + // get all foreign keys of the table + List foreignKeys = getForeignKeys(tableName, conn); // System.out.println(foreignKeys); - int limit = Math.min(10000, rows); - List fkColNames = new ArrayList<>(); + int limit = Math.min(10000, rows); + List fkColNames = new ArrayList<>(); - // if in foreign key, parent table is same as current table, don't treat it as foreign key. treat is as normal column - List fkToRemove = new ArrayList<>(); - foreignKeys.forEach(fk -> { - if (fk.getForeignTableName().equalsIgnoreCase(tableName)) { - fkToRemove.add(fk); - } else { - fkColNames.add(fk.getColumnName()); - } - }); - foreignKeys.removeAll(fkToRemove); + // if in foreign key, parent table is same as current table, don't treat it as foreign key. 
treat is as normal column + List fkToRemove = new ArrayList<>(); + foreignKeys.forEach(fk -> { + if (fk.getForeignTableName().equalsIgnoreCase(tableName)) { + fkToRemove.add(fk); + } else { + fkColNames.add(fk.getColumnName()); + } + }); + foreignKeys.removeAll(fkToRemove); - // remove all fks from unique constraints - uniqueConstraintColumns.removeAll(fkColNames); + // remove all fks from unique constraints + uniqueConstraintColumns.removeAll(fkColNames); - // remove all fks from primary keys - List pkToRemove = new ArrayList<>(); - primaryKeys.forEach(pk -> { - if (fkColNames.contains(pk.getColumnName())) - pkToRemove.add(pk); - }); - primaryKeys.removeAll(pkToRemove); + // remove all fks from primary keys + List pkToRemove = new ArrayList<>(); + primaryKeys.forEach(pk -> { + if (fkColNames.contains(pk.getColumnName())) + pkToRemove.add(pk); + }); + primaryKeys.removeAll(pkToRemove); - if (!foreignKeys.isEmpty()) { - // fetch the distinct values from parent table. This could take some time - getDistinctValuesFromParentTable(conn, foreignKeys, limit); - } - // create mapping of utility function to the columns in the table - Map columnToUtilsMapping = - utilsMapping(tableSchema, primaryKeys, foreignKeys, limit, rows, uniqueConstraintColumns, udColumns); + if (!foreignKeys.isEmpty()) { + // fetch the distinct values from parent table. This could take some time + getDistinctValuesFromParentTable(conn, foreignKeys, limit); + } + // create mapping of utility function to the columns in the table + Map columnToUtilsMapping = + utilsMapping(tableSchema, primaryKeys, foreignKeys, limit, rows, uniqueConstraintColumns, udColumns); - // generate the mapping object which can be used to create the output yaml file - Root root = generateMappingObject(tableName, rows, columnToUtilsMapping, fkColNames, udColumns); + // generate the mapping object which can be used to create the output yaml file + Root root = generateMappingObject(tableName, rows, columnToUtilsMapping, fkColNames, udColumns); + + // create output yaml file + writeToFile(tableName, rows, root); + LOG.info("Generated loader file: {}_loader.yaml", tableName); + } - // create output yaml file - writeToFile(tableName, rows, root); - LOG.info("Generated loader file: {}_loader.yaml", tableName); return new ArrayList<>(); } @@ -522,12 +550,9 @@ public void getDistinctValuesFromParentTable(Connection conn, List f "following order/Levels(tables from `Level 0` first, then `Level 1` and so on: ", foreignKey.getForeignTableName(), foreignKey.getForeignColumnName())); - for (String table : graph.keySet()) { - if (!visited.contains(table)) { - getOrderOfImport(table, loadOrder, graph, depth, visited); - } - } - throw new RuntimeException(loadOrder.toString()); + String startTable = graph.keySet().iterator().next(); + Map> levelsAndTables = getOrderOfImport(startTable, loadOrder, graph, depth, visited); + throw new RuntimeException(loadOrder.append(generateLoadOrder(levelsAndTables)).toString()); } foreignKey.setDistinctValues(distinctValues); } catch (SQLException e) { @@ -630,9 +655,10 @@ public static class Dependency { } // Function to print nodes by levels - public void getOrderOfImport(String startTable, StringBuilder loadOrder, + public Map> getOrderOfImport(String startTable, StringBuilder loadOrder, Map> graph , Map> depth, Set visited) { + Map> levelsAndTables = new LinkedHashMap<>(); dfs(startTable, 0, graph, depth, visited); // Adjust levels to start from 0 @@ -646,9 +672,20 @@ public void getOrderOfImport(String startTable, StringBuilder 
loadOrder, for (Map.Entry> entry : adjustedDepth.entrySet()) { int level = entry.getKey(); List tablesAtLevel = entry.getValue(); - loadOrder.append("\n").append("Level ").append(level).append(": ").append(String.join(", ", tablesAtLevel)); + levelsAndTables.put(level, tablesAtLevel); + } + + return levelsAndTables; + } + + public StringBuilder generateLoadOrder(Map> levelAndTables) { + StringBuilder loadOrder = new StringBuilder(); + for (Map.Entry> entry : levelAndTables.entrySet()) { + int level = entry.getKey(); + loadOrder.append("\n").append("Level ").append(level).append(": ").append(String.join(", ", levelAndTables.get(level))); } + return loadOrder; } // DFS function to populate the levels map @@ -670,4 +707,43 @@ public void dfs(String table, int level, Map> graph , M } } } + + public void buildGraph(Connection conn, Set processedTables, Map> graph) { + String query = "SELECT tc.table_name AS child_table, ccu.table_name AS parent_table " + + "FROM information_schema.table_constraints AS tc " + + "JOIN information_schema.key_column_usage AS kcu " + + "ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema " + + "JOIN information_schema.constraint_column_usage AS ccu " + + "ON ccu.constraint_name = tc.constraint_name AND ccu.table_schema = tc.table_schema " + + "WHERE tc.constraint_type = 'FOREIGN KEY'"; + + try (Statement stmt = conn.createStatement(); + ResultSet rs = stmt.executeQuery(query)) { + while (rs.next()) { + String childTable = rs.getString("child_table"); + String parentTable = rs.getString("parent_table"); + + processedTables.add(parentTable.toLowerCase()); + processedTables.add(childTable.toLowerCase()); + // Build the adjacency list + graph.computeIfAbsent(parentTable, k -> new ArrayList<>()).add(new Dependency(childTable, 1)); + graph.computeIfAbsent(childTable, k -> new ArrayList<>()).add(new Dependency(parentTable, -1)); + } + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + public static List getAllTables(Connection connection) throws SQLException { + List tables = new ArrayList<>(); + DatabaseMetaData metaData = connection.getMetaData(); + String[] types = {"TABLE"}; + try (ResultSet rs = metaData.getTables(null, null, "%", types)) { + while (rs.next()) { + String tableName = rs.getString("TABLE_NAME"); + tables.add(tableName); + } + } + return tables; + } } diff --git a/src/main/resources/benchmarks/perf-dataloader/pk-mapping.properties b/src/main/resources/benchmarks/perf-dataloader/pk-mapping.properties index 749109736..8333c6c26 100644 --- a/src/main/resources/benchmarks/perf-dataloader/pk-mapping.properties +++ b/src/main/resources/benchmarks/perf-dataloader/pk-mapping.properties @@ -1,9 +1,9 @@ # lowerRange,upperRange integer=PrimaryIntGen:2:1,rows # startNumber,endNumber,desiredLength -string=RandomPKString:3:1,rows,1000 -charactervarying=RandomPKString:3:1,rows,1000 -text=RandomPKString:3:1,rows,1000 +string=PrimaryStringGen:2:1,1000 +charactervarying=PrimaryStringGen:2:1,1000 +text=PrimaryStringGen:2:1,1000 # startNumber,length string1=HashedPrimaryStringGen:2:1,5
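
A note on what the new `--gen-load-order` path computes: `buildGraph()` reads the `FOREIGN KEY` constraints from `information_schema` and records parent -> child edges, and `getOrderOfImport()`/`dfs()` then assign every table a level so that each parent table ends up on a lower level than the tables that reference it; tables that take part in no foreign key at all are placed at level 0. The sketch below is a minimal, self-contained illustration of that levelling idea, written as a Kahn-style in-degree pass rather than the DFS used in this patch; the class name and the schema in `main()` are hypothetical and only show the expected shape of the output.

```java
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

/** Minimal sketch: derive table load levels from FK parent -> child edges (assumes an acyclic FK graph). */
public class LoadOrderSketch {

    /** Every table gets the smallest level that is still greater than all of its parents' levels. */
    static Map<Integer, List<String>> levels(Map<String, List<String>> parentToChildren, Set<String> allTables) {
        // Number of parents (incoming FK edges) per table; tables with none start at level 0,
        // which also covers tables that appear in no foreign key at all.
        Map<String, Integer> parentsLeft = new HashMap<>();
        for (String t : allTables) parentsLeft.put(t, 0);
        for (List<String> children : parentToChildren.values()) {
            for (String child : children) parentsLeft.merge(child, 1, Integer::sum);
        }

        Map<String, Integer> level = new HashMap<>();
        Deque<String> ready = new ArrayDeque<>();
        for (String t : allTables) {
            if (parentsLeft.get(t) == 0) { level.put(t, 0); ready.add(t); }
        }

        // Kahn-style pass: a child's level is max(parent level) + 1, and it becomes
        // "ready" only once every one of its parents has been assigned a level.
        while (!ready.isEmpty()) {
            String parent = ready.remove();
            for (String child : parentToChildren.getOrDefault(parent, List.of())) {
                level.merge(child, level.get(parent) + 1, Math::max);
                if (parentsLeft.merge(child, -1, Integer::sum) == 0) ready.add(child);
            }
        }

        // Group tables by level, smallest level first.
        Map<Integer, List<String>> byLevel = new TreeMap<>();
        level.forEach((table, lvl) -> byLevel.computeIfAbsent(lvl, k -> new ArrayList<>()).add(table));
        return byLevel;
    }

    public static void main(String[] args) {
        // Hypothetical schema: orders references customers and products; order_items references orders.
        Map<String, List<String>> fkEdges = Map.of(
                "customers", List.of("orders"),
                "products", List.of("orders"),
                "orders", List.of("order_items"));
        Set<String> tables = Set.of("customers", "products", "orders", "order_items");

        levels(fkEdges, tables).forEach((lvl, tbls) ->
                System.out.println("Level " + lvl + ": " + String.join(", ", tbls)));
        // Prints (order within a level may vary):
        //   Level 0: customers, products
        //   Level 1: orders
        //   Level 2: order_items
    }
}
```

This is the same ordering the patch prints and dumps to load_order.json: level 0 can be loaded immediately, and every later level only references tables from the levels below it.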
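The level map is also what lands in `load_order.json` via `JSONUtil.format(JSONUtil.toJSONString(levelAndTables))`. Assuming `JSONUtil` renders the `Map<Integer, List<String>>` as a plain JSON object keyed by level, e.g. `{"0": ["customers", "products"], "1": ["orders"], "2": ["order_items"]}`, a small consumer can walk the levels in ascending order and issue one per-table load for each entry. The sketch below uses Jackson's `ObjectMapper` (jackson-databind assumed on the classpath; `DataGeneratorLoader` already imports Jackson annotations); the file name matches what the patch writes, while the per-table command in the comment is just the documented CLI form.

```java
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/** Minimal sketch: read load_order.json and walk the levels in ascending order. */
public class LoadOrderReader {
    public static void main(String[] args) throws IOException {
        ObjectMapper mapper = new ObjectMapper();

        // load_order.json is written by --gen-load-order as a map of level -> table names
        // (shape assumed; see lead-in). TreeMap keeps the levels sorted ascending.
        Map<Integer, List<String>> levels = mapper.readValue(
                new File("load_order.json"),
                new TypeReference<TreeMap<Integer, List<String>>>() {});

        for (Map.Entry<Integer, List<String>> entry : levels.entrySet()) {
            for (String table : entry.getValue()) {
                // Tables on one level only depend on lower levels, so each one can now be
                // populated, e.g. ./perf-data-loader -c <config-file> -t <table> -r <rows>
                System.out.println("Level " + entry.getKey() + " -> load table: " + table);
            }
        }
    }
}
```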