Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmark for C++ PDQ Hasher #1726

Merged
merged 25 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions pdq/cpp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ LIBSRCS=\
MAINS=\
pdq-photo-hasher \
test-mih \
benchmark-query \
clusterize256 \
snowball-clusterize256 \
clusterize256x \
Expand All @@ -57,7 +56,9 @@ MAINS=\
pdq-downsample-demo \
test-bits \
hamming-tabulate \
test-hamming
test-hamming \
benchmark-query \
benchmark-photo-hasher

WASMMAINS=pdq-photo-hasher

Expand Down Expand Up @@ -114,8 +115,6 @@ pdq-downsample-demo: bin/pdq-downsample-demo.cpp $(LIBSRCS) $(LIBHDRS)

test-mih: bin/test-mih.cpp $(LIBSRCS) $(LIBHDRS)
$(CCOPT) bin/test-mih.cpp $(LIBSRCS) -o test-mih $(LFLAGS)
benchmark-query: bin/benchmark-query.cpp $(LIBSRCS) $(LIBHDRS)
$(CCOPT) bin/benchmark-query.cpp $(LIBSRCS) -o benchmark-query $(LFLAGS)
test-mihg: bin/test-mih.cpp $(LIBSRCS) $(LIBHDRS)
$(CCDBG) bin/test-mih.cpp $(LIBSRCS) -o test-mihg $(LFLAGS)
clusterize256: bin/clusterize256.cpp $(LIBSRCS) $(LIBHDRS)
Expand All @@ -131,6 +130,11 @@ clusterize256x-prof: bin/clusterize256x.cpp $(LIBSRCS) $(LIBHDRS)
mih-query: bin/mih-query.cpp $(LIBSRCS) $(LIBHDRS)
$(CCOPT) bin/mih-query.cpp $(LIBSRCS) -o mih-query $(LFLAGS)

benchmark-query: bin/benchmark-query.cpp $(LIBSRCS) $(LIBHDRS)
$(CCOPT) bin/benchmark-query.cpp $(LIBSRCS) -o benchmark-query $(LFLAGS)
benchmark-photo-hasher: bin/benchmark-photo-hasher.cpp $(LIBSRCS) $(LIBHDRS)
$(CCOPT) bin/benchmark-photo-hasher.cpp $(LIBSRCS) -o benchmark-photo-hasher $(LFLAGS)

# Profile version. Usage:
# * make pdq-photo-hasher-prof
# * pdq-photo-hasher-prof {arguments}
Expand Down
177 changes: 177 additions & 0 deletions pdq/cpp/bin/benchmark-photo-hasher.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <chrono>
#include <stdexcept>
#include <string>
#include <vector>

#include <pdq/cpp/io/hashio.h>
#include <pdq/cpp/io/pdqio.h>

using namespace facebook::pdq::hashing;

// ================================================================
// Static function declarations
static void usage(char* argv0, int rc);
static void hash(char* argv0, int argc, char** argv);

// ----------------------------------------------------------------
// Program entry point. Shows the help text when the first argument is
// -h/--help; otherwise hands every argument after the program name to hash().
// Always returns 0 when control comes back here.
int main(int argc, char** argv) {
  const bool helpRequested = argc > 1 &&
      (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0);

  if (helpRequested) {
    usage(argv[0], 0);
    return 0;
  }

  hash(argv[0], argc - 1, argv + 1);
  return 0;
}

// ----------------------------------------------------------------
// Prints the command-line help text, then terminates the process.
//
// argv0: program name substituted into the "Usage:" line.
// rc:    exit status passed to exit(); when it is 0 the text goes to stdout,
//        otherwise to stderr.
static void usage(char* argv0, int rc) {
  // Fixed help lines that follow the "Usage:" line, in output order.
  static const char* const kHelpLines[] = {
      "Options:\n",
      " -v Verbose output\n",
      " -n N Total number of hashes to generate, can be more or less than the number of images in the folder\n",
      " (default: 0, meaning generate one hash for each image in the folder)\n",
      " --dihedral Compute dihedral versions of the hashes (default: false)\n",
  };

  FILE* out = stderr;
  if (rc == 0) {
    out = stdout;
  }

  fprintf(out, "Usage: %s [options] folder_path\n", argv0);
  for (const char* line : kHelpLines) {
    fputs(line, out);
  }
  exit(rc);
}

static void hash(char* argv0, int argc, char** argv) {
std::string folderPath;
int numHashes = 0;
bool verbose = false;
bool dihedral = false;

// Parse command line arguments
for (int i = 0; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-v") {
verbose = true;
} else if (arg == "-n") {
if (i + 1 < argc) {
numHashes = std::stoi(argv[++i]);
} else {
fprintf(stderr, "Error: Missing argument for -n\n");
usage(argv0, 1);
return;
}
} else if (arg == "--dihedral") {
dihedral = true;
} else if (arg == "-h" || arg == "--help") {
usage(argv0, 0);
return;
} else if (i == argc - 1) {
folderPath = arg;
} else {
fprintf(stderr, "Unknown argument: %s\n", arg.c_str());
usage(argv0, 1);
return;
}
}

// Generate hashes
std::vector<facebook::pdq::hashing::Hash256> hashes;
float totalReadSeconds = 0, totalHashSeconds = 0;
int numErrors = 0, numSuccesses = 0;
DIR* dir = opendir(folderPath.c_str());
if (dir == NULL) {
perror("opendir");
return;
}
struct dirent* ent;
while (true) {
while ((ent = readdir(dir)) != NULL) {
std::string filePath = folderPath + "/" + ent->d_name;
if (ent->d_type == DT_REG) {
facebook::pdq::hashing::Hash256 hash;
int quality;
int imageHeightTimesWidth;
float readSeconds;
float hashSeconds;
const char* filename = filePath.c_str();
bool success;
if (dihedral) {
facebook::pdq::hashing::Hash256 hashRotate90, hashRotate180,
hashRotate270, hashFlipX, hashFlipY, hashFlipPlus1,
hashFlipMinus1;
success = facebook::pdq::hashing::pdqDihedralHash256esFromFile(
filename,
&hash,
&hashRotate90,
&hashRotate180,
&hashRotate270,
&hashFlipX,
&hashFlipY,
&hashFlipPlus1,
&hashFlipMinus1,
quality,
imageHeightTimesWidth,
readSeconds,
hashSeconds);
} else {
success = facebook::pdq::hashing::pdqHash256FromFile(
filename,
hash,
quality,
imageHeightTimesWidth,
readSeconds,
hashSeconds);
}
if (!success) {
numErrors++;
fprintf(stderr, "Error reading file: %s\n", filename);
continue;
}
if (verbose) {
printf("File: %s\n", filename);
printf("Hash: %s\n", hash.format().c_str());
printf("Quality: %d\n", quality);
printf("Image height * width: %d\n", imageHeightTimesWidth);
printf("Read seconds: %.6lf\n", readSeconds);
printf("Hash seconds: %.6lf\n", hashSeconds);
printf("\n");
}
hashes.push_back(hash);
totalReadSeconds += readSeconds;
totalHashSeconds += hashSeconds;
numSuccesses++;
if (numSuccesses == numHashes)
break;
}
}
if (numSuccesses == 0) {
fprintf(stderr, "No images found in folder: %s\n", folderPath.c_str());
return;
}
if (numHashes == 0 || numSuccesses == numHashes)
break;
closedir(dir);
dir = opendir(folderPath.c_str());
}

printf("PHOTO COUNT: %d\n", (int)hashes.size());
if (dihedral) {
printf("TOTAL DIHEDRAL HASHES (8/PHOTO): %d\n", (int)hashes.size() * 8);
}
printf("ERROR COUNT: %d\n", numErrors);
printf("TIME SPENT HASHING PHOTOS (SECONDS): %.6lf\n", totalHashSeconds);
double photosHashedPerSecond =
totalHashSeconds > 0 ? numSuccesses / totalHashSeconds : 0;
printf("PHOTOS HASHED PER SECOND: %.6lf\n", photosHashedPerSecond);

printf(
"TIME SPENT READING PHOTOS (SECONDS): %.6lf\n", totalReadSeconds);
double photosReadPerSecond =
totalReadSeconds > 0 ? numSuccesses / totalReadSeconds : 0;
printf("PHOTOS READ PER SECOND: %.6lf\n", photosReadPerSecond);
printf("\n");
}
59 changes: 59 additions & 0 deletions pdq/cpp/hashing/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Benchmark for hashing
Benchmark for PDQ hashing on images

## Instructions
```
cd pdq/cpp
make benchmark-photo-hasher
./benchmark-photo-hasher folder_path
```

Help command:
```
$ ./benchmark-photo-hasher -h
Usage: ./benchmark-photo-hasher [options] folder_path
Options:
-v Verbose output
-n N Total number of hashes to generate, can be more or less than the number of images in the folder
(default: 0, meaning generate one hash for each image in the folder)
--dihedral Compute dihedral versions of the hashes (default: false)
```

Note: if `-n N` is larger than the number of images in the folder, the benchmark loops over the images in the folder repeatedly until `N` hashes have been generated.

## Results
Ran on Ubuntu 24.04.1 LTS, Intel Core i7-14700KF with 20 cores, 28 threads, 64GB RAM

```
$ ./benchmark-photo-hasher ../data/reg-test-input/dih/
PHOTO COUNT: 8
ERROR COUNT: 0
TIME SPENT HASHING PHOTOS (SECONDS): 0.015171
PHOTOS HASHED PER SECOND: 527.324158
TIME SPENT READING PHOTOS (SECONDS): 0.299410
PHOTOS READ PER SECOND: 26.719229

$ ./benchmark-photo-hasher -n 10 ../data/reg-test-input/dih/
PHOTO COUNT: 10
ERROR COUNT: 0
TIME SPENT HASHING PHOTOS (SECONDS): 0.018181
PHOTOS HASHED PER SECOND: 550.028442
TIME SPENT READING PHOTOS (SECONDS): 0.334852
PHOTOS READ PER SECOND: 29.863914

$ ./benchmark-photo-hasher -n 100 ../data/reg-test-input/dih/
PHOTO COUNT: 100
ERROR COUNT: 0
TIME SPENT HASHING PHOTOS (SECONDS): 0.182545
PHOTOS HASHED PER SECOND: 547.810364
TIME SPENT READING PHOTOS (SECONDS): 2.841218
PHOTOS READ PER SECOND: 35.196167

$ ./benchmark-photo-hasher -n 1000 ../data/reg-test-input/dih/
PHOTO COUNT: 1000
ERROR COUNT: 0
TIME SPENT HASHING PHOTOS (SECONDS): 1.767847
PHOTOS HASHED PER SECOND: 565.659729
TIME SPENT READING PHOTOS (SECONDS): 27.135609
PHOTOS READ PER SECOND: 36.851948
```
Loading