diff --git a/pdq/cpp/Makefile b/pdq/cpp/Makefile index e0918bcc7..9f313aa52 100644 --- a/pdq/cpp/Makefile +++ b/pdq/cpp/Makefile @@ -48,7 +48,6 @@ LIBSRCS=\ MAINS=\ pdq-photo-hasher \ test-mih \ - benchmark-query \ clusterize256 \ snowball-clusterize256 \ clusterize256x \ @@ -57,7 +56,9 @@ MAINS=\ pdq-downsample-demo \ test-bits \ hamming-tabulate \ - test-hamming + test-hamming \ + benchmark-query \ + benchmark-photo-hasher WASMMAINS=pdq-photo-hasher @@ -114,8 +115,6 @@ pdq-downsample-demo: bin/pdq-downsample-demo.cpp $(LIBSRCS) $(LIBHDRS) test-mih: bin/test-mih.cpp $(LIBSRCS) $(LIBHDRS) $(CCOPT) bin/test-mih.cpp $(LIBSRCS) -o test-mih $(LFLAGS) -benchmark-query: bin/benchmark-query.cpp $(LIBSRCS) $(LIBHDRS) - $(CCOPT) bin/benchmark-query.cpp $(LIBSRCS) -o benchmark-query $(LFLAGS) test-mihg: bin/test-mih.cpp $(LIBSRCS) $(LIBHDRS) $(CCDBG) bin/test-mih.cpp $(LIBSRCS) -o test-mihg $(LFLAGS) clusterize256: bin/clusterize256.cpp $(LIBSRCS) $(LIBHDRS) @@ -131,6 +130,11 @@ clusterize256x-prof: bin/clusterize256x.cpp $(LIBSRCS) $(LIBHDRS) mih-query: bin/mih-query.cpp $(LIBSRCS) $(LIBHDRS) $(CCOPT) bin/mih-query.cpp $(LIBSRCS) -o mih-query $(LFLAGS) +benchmark-query: bin/benchmark-query.cpp $(LIBSRCS) $(LIBHDRS) + $(CCOPT) bin/benchmark-query.cpp $(LIBSRCS) -o benchmark-query $(LFLAGS) +benchmark-photo-hasher: bin/benchmark-photo-hasher.cpp $(LIBSRCS) $(LIBHDRS) + $(CCOPT) bin/benchmark-photo-hasher.cpp $(LIBSRCS) -o benchmark-photo-hasher $(LFLAGS) + # Profile version. 
Usage: # * make pdq-photo-hasher-prof # * pdq-photo-hasher-prof {arguments>
diff --git a/pdq/cpp/bin/benchmark-photo-hasher.cpp b/pdq/cpp/bin/benchmark-photo-hasher.cpp
new file mode 100644
index 000000000..de12e6279
--- /dev/null
+++ b/pdq/cpp/bin/benchmark-photo-hasher.cpp
@@ -0,0 +1,228 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+// NOTE(review): the include targets were missing from the reviewed text; this
+// list is reconstructed from the symbols used below. Confirm the project
+// header path before merging.
+#include <dirent.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include <pdq/cpp/io/pdqio.h>
+
+using namespace facebook::pdq::hashing;
+
+// ================================================================
+// Static function declarations
+static void usage(char* argv0, int rc);
+static bool parseHashCount(const char* text, int& count);
+static void hash(char* argv0, int argc, char** argv);
+
+// ----------------------------------------------------------------
+// Prints the usage text for -h/--help; otherwise runs the benchmark on the
+// arguments that follow the program name.
+int main(int argc, char** argv) {
+  if (argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) {
+    usage(argv[0], 0);
+  } else {
+    hash(argv[0], argc - 1, argv + 1);
+  }
+  return 0;
+}
+
+// ----------------------------------------------------------------
+// Prints the usage text and exits with status rc. The text goes to stdout
+// when rc is 0 and to stderr otherwise.
+static void usage(char* argv0, int rc) {
+  FILE* fp = (rc == 0) ? stdout : stderr;
+  fprintf(fp, "Usage: %s [options] folder_path\n", argv0);
+  fprintf(fp, "Options:\n");
+  fprintf(fp, " -v Verbose output\n");
+  fprintf(
+      fp,
+      " -n N Total number of hashes to generate, can be more or less than the number of images in the folder\n");
+  fprintf(
+      fp,
+      " (default: 0, meaning generate one hash for each image in the folder)\n");
+  fprintf(
+      fp,
+      " --dihedral Compute dihedral versions of the hashes (default: false)\n");
+  exit(rc);
+}
+
+// ----------------------------------------------------------------
+// Parses the value given to -n. Returns true and stores the value in count
+// only when text is a complete base-10 integer in [0, INT_MAX]. Negative
+// counts are rejected because the hashing loop could never reach them.
+static bool parseHashCount(const char* text, int& count) {
+  char* end = nullptr;
+  errno = 0;
+  const long parsed = strtol(text, &end, 10);
+  if (errno != 0 || end == text || *end != '\0') {
+    return false;
+  }
+  if (parsed < 0 || parsed > INT_MAX) {
+    return false;
+  }
+  count = static_cast<int>(parsed);
+  return true;
+}
+
+// ----------------------------------------------------------------
+// Parses the options, hashes the regular files found directly inside
+// folder_path, and prints the totals. With -n N the folder is scanned
+// repeatedly until N hashes exist; with the default of 0 it is scanned once.
+static void hash(char* argv0, int argc, char** argv) {
+  std::string folderPath;
+  int numHashes = 0;
+  bool verbose = false;
+  bool dihedral = false;
+
+  // Parse command line arguments
+  for (int i = 0; i < argc; i++) {
+    std::string arg = argv[i];
+    if (arg == "-v") {
+      verbose = true;
+    } else if (arg == "-n") {
+      if (i + 1 >= argc) {
+        fprintf(stderr, "Error: Missing argument for -n\n");
+        usage(argv0, 1);
+        return;
+      }
+      if (!parseHashCount(argv[++i], numHashes)) {
+        fprintf(stderr, "Error: Invalid argument for -n: %s\n", argv[i]);
+        usage(argv0, 1);
+        return;
+      }
+    } else if (arg == "--dihedral") {
+      dihedral = true;
+    } else if (arg == "-h" || arg == "--help") {
+      usage(argv0, 0);
+      return;
+    } else if (i == argc - 1) {
+      folderPath = arg;
+    } else {
+      fprintf(stderr, "Unknown argument: %s\n", arg.c_str());
+      usage(argv0, 1);
+      return;
+    }
+  }
+  if (folderPath.empty()) {
+    fprintf(stderr, "Error: Missing folder_path\n");
+    usage(argv0, 1);
+    return;
+  }
+
+  // Generate hashes
+  std::vector<Hash256> hashes;
+  // Accumulate in double so many small per-file timings do not lose precision.
+  double totalReadSeconds = 0, totalHashSeconds = 0;
+  int numErrors = 0, numSuccesses = 0;
+  DIR* dir = opendir(folderPath.c_str());
+  if (dir == nullptr) {
+    perror("opendir");
+    return;
+  }
+  while (true) {
+    int passSuccesses = 0;
+    struct dirent* ent;
+    while ((ent = readdir(dir)) != nullptr) {
+      // NOTE(review): readdir(3) allows d_type to be DT_UNKNOWN on some
+      // filesystems; such entries are skipped here.
+      if (ent->d_type != DT_REG) {
+        continue;
+      }
+      const std::string filePath = folderPath + "/" + ent->d_name;
+      const char* filename = filePath.c_str();
+      Hash256 pdqHash;
+      int quality;
+      int imageHeightTimesWidth;
+      float readSeconds;
+      float hashSeconds;
+      bool success;
+      if (dihedral) {
+        Hash256 hashRotate90, hashRotate180, hashRotate270, hashFlipX,
+            hashFlipY, hashFlipPlus1, hashFlipMinus1;
+        success = pdqDihedralHash256esFromFile(
+            filename,
+            &pdqHash,
+            &hashRotate90,
+            &hashRotate180,
+            &hashRotate270,
+            &hashFlipX,
+            &hashFlipY,
+            &hashFlipPlus1,
+            &hashFlipMinus1,
+            quality,
+            imageHeightTimesWidth,
+            readSeconds,
+            hashSeconds);
+      } else {
+        success = pdqHash256FromFile(
+            filename,
+            pdqHash,
+            quality,
+            imageHeightTimesWidth,
+            readSeconds,
+            hashSeconds);
+      }
+      if (!success) {
+        numErrors++;
+        fprintf(stderr, "Error reading file: %s\n", filename);
+        continue;
+      }
+      if (verbose) {
+        printf("File: %s\n", filename);
+        printf("Hash: %s\n", pdqHash.format().c_str());
+        printf("Quality: %d\n", quality);
+        printf("Image height * width: %d\n", imageHeightTimesWidth);
+        printf("Read seconds: %.6lf\n", readSeconds);
+        printf("Hash seconds: %.6lf\n", hashSeconds);
+        printf("\n");
+      }
+      hashes.push_back(pdqHash);
+      totalReadSeconds += readSeconds;
+      totalHashSeconds += hashSeconds;
+      numSuccesses++;
+      passSuccesses++;
+      if (numSuccesses == numHashes) {
+        break;
+      }
+    }
+    if (numSuccesses == 0) {
+      fprintf(stderr, "No images found in folder: %s\n", folderPath.c_str());
+      closedir(dir);
+      return;
+    }
+    // Stop when one pass suffices, the target is met, or a re-scan produced
+    // nothing (otherwise the loop would never terminate).
+    if (numHashes == 0 || numSuccesses == numHashes || passSuccesses == 0) {
+      break;
+    }
+    rewinddir(dir);
+  }
+  closedir(dir);
+
+  printf("PHOTO COUNT: %d\n", (int)hashes.size());
+  if (dihedral) {
+    printf("TOTAL DIHEDRAL HASHES (8/PHOTO): %d\n", (int)hashes.size() * 8);
+  }
+  printf("ERROR COUNT: %d\n", numErrors);
+  printf("TIME SPENT HASHING PHOTOS (SECONDS): %.6lf\n", totalHashSeconds);
+  double photosHashedPerSecond =
+      totalHashSeconds > 0 ? numSuccesses / totalHashSeconds : 0;
+  printf("PHOTOS HASHED PER SECOND: %.6lf\n", photosHashedPerSecond);
+
+  printf(
+      "TIME SPENT READING PHOTOS (SECONDS): %.6lf\n", totalReadSeconds);
+  double photosReadPerSecond =
+      totalReadSeconds > 0 ? numSuccesses / totalReadSeconds : 0;
+  printf("PHOTOS READ PER SECOND: %.6lf\n", photosReadPerSecond);
+  printf("\n");
+}
diff --git a/pdq/cpp/hashing/README.md b/pdq/cpp/hashing/README.md
new file mode 100644
index 000000000..6b9dfec26
--- /dev/null
+++ b/pdq/cpp/hashing/README.md
@@ -0,0 +1,59 @@
+# Benchmark for hashing
+Benchmark for PDQ hashing on images
+
+## Instructions
+```
+cd pdq/cpp
+make benchmark-photo-hasher
+./benchmark-photo-hasher [options] folder_path
+```
+
+Help command:
+```
+$ ./benchmark-photo-hasher -h
+Usage: ./benchmark-photo-hasher [options] folder_path
+Options:
+ -v Verbose output
+ -n N Total number of hashes to generate, can be more or less than the number of images in the folder
+ (default: 0, meaning generate one hash for each image in the folder)
+ --dihedral Compute dihedral versions of the hashes (default: false)
+```
+
+Note: if `-n N` is larger than the number of images in the folder, the benchmark loops over the images in the folder repeatedly until `N` hashes have been generated.
+
+## Results
+Ran on Ubuntu 24.04.1 LTS, Intel Core i7-14700KF with 20 cores, 28 threads, 64GB RAM
+
+```
+$ ./benchmark-photo-hasher ../data/reg-test-input/dih/
+PHOTO COUNT: 8
+ERROR COUNT: 0
+TIME SPENT HASHING PHOTOS (SECONDS): 0.015171
+PHOTOS HASHED PER SECOND: 527.324158
+TIME SPENT READING PHOTOS (SECONDS): 0.299410
+PHOTOS READ PER SECOND: 26.719229
+
+$ ./benchmark-photo-hasher -n 10 ../data/reg-test-input/dih/
+PHOTO COUNT: 10
+ERROR COUNT: 0
+TIME SPENT HASHING PHOTOS (SECONDS): 0.018181
+PHOTOS HASHED PER SECOND: 550.028442
+TIME SPENT READING PHOTOS (SECONDS): 0.334852
+PHOTOS READ PER SECOND: 29.863914
+
+$ ./benchmark-photo-hasher -n 100 ../data/reg-test-input/dih/
+PHOTO COUNT: 100
+ERROR COUNT: 0
+TIME SPENT HASHING PHOTOS (SECONDS): 0.182545
+PHOTOS HASHED PER SECOND: 547.810364
+TIME SPENT READING PHOTOS (SECONDS): 2.841218
+PHOTOS READ PER SECOND: 35.196167
+
+$ ./benchmark-photo-hasher -n 1000 ../data/reg-test-input/dih/
+PHOTO
COUNT: 1000 +ERROR COUNT: 0 +TIME SPENT HASHING PHOTOS (SECONDS): 1.767847 +PHOTOS HASHED PER SECOND: 565.659729 +TIME SPENT READING PHOTOS (SECONDS): 27.135609 +PHOTOS READ PER SECOND: 36.851948 +``` \ No newline at end of file