Skip to content

Commit

Permalink
🔨 rework md5 hashing to happen in the db since mysql
Browse files Browse the repository at this point in the history
rewrites json field order an whitespace and we need
to hash in a consistent way
  • Loading branch information
danyx23 committed Aug 24, 2024
1 parent 41e423b commit 10112c8
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 39 deletions.
73 changes: 50 additions & 23 deletions adminSiteServer/apiRouter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ import {
DATA_API_URL,
} from "../settings/serverSettings.js"
import {
Base64String,
expectInt,
getMd5HashBase64,
isValidSlug,
} from "../serverUtils/serverUtil.js"
import {
Expand Down Expand Up @@ -326,23 +326,15 @@ const saveNewChart = async (
const fullConfig = mergeGrapherConfigs(fullParentConfig, patchConfig)
const fullConfigStringified = JSON.stringify(fullConfig)

// compute an MD5 hash of the full config
const fullConfigMd5 = await getMd5HashBase64(fullConfigStringified)

// insert patch & full configs into the chart_configs table
const chartConfigId = uuidv7()
await db.knexRaw(
knex,
`-- sql
INSERT INTO chart_configs (id, patch, full, fullMd5)
VALUES (?, ?, ?, ?)
INSERT INTO chart_configs (id, patch, full)
VALUES (?, ?, ?)
`,
[
chartConfigId,
JSON.stringify(patchConfig),
fullConfigStringified,
fullConfigMd5,
]
[chartConfigId, JSON.stringify(patchConfig), fullConfigStringified]
)

// add a new chart to the charts table
Expand Down Expand Up @@ -372,7 +364,23 @@ const saveNewChart = async (
[chartId, chartId, chartId]
)

await saveGrapherConfigToR2ByUUID(chartConfigId, fullConfigStringified)
// We need to get the full config and the md5 hash from the database instead of
// computing our own md5 hash because MySQL normalizes JSON and our
// client computed md5 would be different from the ones computed by and stored in R2
const fullConfigMd5 = await db.knexRawFirst<
Pick<DbRawChartConfig, "full" | "fullMd5">
>(
knex,
`-- sql
select full, fullMd5 from chart_configs where id = ?`,
[chartConfigId]
)

await saveGrapherConfigToR2ByUUID(
chartConfigId,
fullConfigMd5!.full,
fullConfigMd5!.fullMd5 as Base64String
)

return { patchConfig, fullConfig }
}
Expand Down Expand Up @@ -410,8 +418,6 @@ const updateExistingChart = async (
const fullConfig = mergeGrapherConfigs(fullParentConfig, patchConfig)
const fullConfigStringified = JSON.stringify(fullConfig)

const fullConfigMd5 = await getMd5HashBase64(fullConfigStringified)

const chartConfigId = await db.knexRawFirst<Pick<DbPlainChart, "configId">>(
knex,
`SELECT configId FROM charts WHERE id = ?`,
Expand All @@ -428,14 +434,12 @@ const updateExistingChart = async (
UPDATE chart_configs
SET
patch=?,
full=?,
fullMd5=?
full=?
WHERE id = ?
`,
[
JSON.stringify(patchConfig),
fullConfigStringified,
fullConfigMd5,
chartConfigId.configId,
]
)
Expand All @@ -451,9 +455,22 @@ const updateExistingChart = async (
[shouldInherit, new Date(), user.id, chartId]
)

// We need to get the full config and the md5 hash from the database instead of
// computing our own md5 hash because MySQL normalizes JSON and our
// client computed md5 would be different from the ones computed by and stored in R2
const fullConfigMd5 = await db.knexRawFirst<
Pick<DbRawChartConfig, "full" | "fullMd5">
>(
knex,
`-- sql
select full, fullMd5 from chart_configs where id = ?`,
[chartConfigId]
)

await saveGrapherConfigToR2ByUUID(
chartConfigId.configId,
fullConfigStringified
fullConfigMd5!.full,
fullConfigMd5!.fullMd5 as Base64String
)

return { patchConfig, fullConfig }
Expand Down Expand Up @@ -634,13 +651,23 @@ const saveGrapher = async (
)

if (fullConfig.isPublished) {
const configStringified = JSON.stringify(fullConfig)
const configMd5 = await getMd5HashBase64(configStringified)
// We need to get the full config and the md5 hash from the database instead of
// computing our own md5 hash because MySQL normalizes JSON and our
// client computed md5 would be different from the ones computed by and stored in R2
const fullConfigMd5 = await db.knexRawFirst<
Pick<DbRawChartConfig, "full" | "fullMd5">
>(
knex,
`-- sql
select full, fullMd5 from chart_configs where id = ?`,
[]
)

await saveGrapherConfigToR2(
configStringified,
fullConfigMd5!.full,
R2GrapherConfigDirectory.publishedGrapherBySlug,
`${fullConfig.slug}.json`,
configMd5
fullConfigMd5!.fullMd5 as Base64String
)
}

Expand Down
13 changes: 6 additions & 7 deletions adminSiteServer/chartConfigR2Helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import {
} from "@aws-sdk/client-s3"
import { JsonError, lazy } from "@ourworldindata/utils"
import { logErrorAndMaybeSendToBugsnag } from "../serverUtils/errorLog.js"
import { getMd5HashBase64, Base64String } from "../serverUtils/serverUtil.js"
import { Base64String } from "../serverUtils/serverUtil.js"

export enum R2GrapherConfigDirectory {
byUUID = "config/by-uuid",
Expand All @@ -37,15 +37,14 @@ const getS3Client: () => S3Client = lazy(

export async function saveGrapherConfigToR2ByUUID(
id: string,
chartConfigStringified: string
chartConfigStringified: string,
configMd5FromDb: Base64String
) {
const configMd5 = await getMd5HashBase64(chartConfigStringified)

await saveGrapherConfigToR2(
chartConfigStringified,
R2GrapherConfigDirectory.byUUID,
`${id}.json`,
configMd5
configMd5FromDb
)
}

Expand All @@ -60,7 +59,7 @@ export async function saveGrapherConfigToR2(
config_stringified: string,
directory: R2GrapherConfigDirectory,
filename: string,
configMd5: Base64String
configMd5FromDb: Base64String
) {
if (process.env.NODE_ENV === "test") {
console.log("Skipping saving grapher config to R2 in test environment")
Expand Down Expand Up @@ -94,7 +93,7 @@ export async function saveGrapherConfigToR2(
Key: path,
Body: config_stringified,
ContentType: MIMEType,
ContentMD5: configMd5,
ContentMD5: configMd5FromDb,
}

await s3Client.send(new PutObjectCommand(params))
Expand Down
2 changes: 1 addition & 1 deletion db/migration/1722415645057-AddChartConfigHash.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ export class AddChartConfigHash1722415645057 implements MigrationInterface {
public async up(queryRunner: QueryRunner): Promise<void> {
await queryRunner.query(`
ALTER TABLE chart_configs
ADD COLUMN fullMd5 CHAR(24) DEFAULT to_base64(unhex(md5(full)));
ADD COLUMN fullMd5 CHAR(24) GENERATED ALWAYS as (to_base64(unhex(md5(full)))) STORED NOT NULL;
`)
}

Expand Down
3 changes: 2 additions & 1 deletion devTools/syncGraphersToR2/syncGraphersToR2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ async function syncWithR2(
// Usage:
await listS3ObjectsAndPerformAction(s3Client, pathPrefix, (object) => {
if (object && object.Key && object.ETag) {
const md5 = object.ETag.replace(/"/g, "") as HexString
const md5 = object.ETag.replace(/(W\/)?"/g, "") as HexString
const md5Base64 = bytesToBase64(hexToBytes(md5))

if (hashesOfFilesToToUpsert.has(object.Key)) {
Expand Down Expand Up @@ -290,6 +290,7 @@ async function main(parsedArgs: parseArgs.ParsedArgs, dryRun: boolean) {
dryRun
)
})
process.exit(0)
}

const parsedArgs = parseArgs(process.argv.slice(2))
Expand Down
7 changes: 0 additions & 7 deletions serverUtils/serverUtil.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import ReactDOMServer from "react-dom/server.js"
import * as lodash from "lodash"
import { JsonError, Nominal } from "@ourworldindata/utils"
import { createHash } from "crypto"

// Fail-fast integer conversion, for e.g. ids in url params
export const expectInt = (value: any): number => {
Expand Down Expand Up @@ -37,9 +36,3 @@ export function hexToBytes(hex: string): Uint8Array {
export function bytesToHex(bytes: Uint8Array): HexString {
return Buffer.from(bytes).toString("hex") as HexString
}

export function getMd5HashBase64(data: string): Base64String {
return createHash("md5")
.update(data, "utf-8")
.digest("base64") as Base64String
}

0 comments on commit 10112c8

Please sign in to comment.