diff --git a/dkpro-c4corpus-deduplication/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/deduplication/impl/SimHashUtils.java b/dkpro-c4corpus-deduplication/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/deduplication/impl/SimHashUtils.java index c6a1751..59aacfa 100644 --- a/dkpro-c4corpus-deduplication/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/deduplication/impl/SimHashUtils.java +++ b/dkpro-c4corpus-deduplication/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/deduplication/impl/SimHashUtils.java @@ -114,8 +114,6 @@ public static int diffOfBits(long simHash1, long simHash2) */ public static Set computeHashIndex(long docHash) { - - //band index int bandIndex = 0; //a band (window) used to store a part of the hash (represented in bits) BitSet bitRange = new BitSet(BAND_WIDTH); @@ -124,19 +122,34 @@ public static Set computeHashIndex(long docHash) Set bandBitset = new HashSet<>(); //divide our HASH_LENGTH-bit hash into bit ranges of BandWidth bits for (int i = 0; i < HASH_LENGTH; ++i) { - bitRange.set(bitsWidthCounter, ((docHash >> i) & 1) == 1); - if (bitsWidthCounter++ == BAND_WIDTH) { - + bitRange.set(bitsWidthCounter++, ((docHash >> i) & 1) == 1); + if (bitsWidthCounter == BAND_WIDTH) { bandBitset.add(bandIndex + "_" + bitRange.toString()); - bitsWidthCounter = 0; - bitRange = new BitSet(BAND_WIDTH); // reset bitRange holder. + bitRange.clear(); bandIndex++; } } return bandBitset; } + /** + * Slice a 64-bit hash into four hashes, each with all but a 16-bit + * range masked out. It will be used to get similar candidates. + * + * @param docHash + * @return + */ + public static long[] sliceHash(long docHash) + { + long[] result = new long[4]; + for (int i = 0; i < 4; i++) { + result[i] = docHash & (0xffffL << (i*16)); + } + return result; + } + + /** * Compress the hashes of all the shingles of one document to a single * fingerprint (SimHash) This implementation is based on the algorithm diff --git a/dkpro-c4corpus-deduplication/src/test/java/de/tudarmstadt/ukp/dkpro/c4corpus/deduplication/impl/SimHashUtilsTest.java b/dkpro-c4corpus-deduplication/src/test/java/de/tudarmstadt/ukp/dkpro/c4corpus/deduplication/impl/SimHashUtilsTest.java new file mode 100644 index 0000000..c703d7f --- /dev/null +++ b/dkpro-c4corpus-deduplication/src/test/java/de/tudarmstadt/ukp/dkpro/c4corpus/deduplication/impl/SimHashUtilsTest.java @@ -0,0 +1,79 @@ +package de.tudarmstadt.ukp.dkpro.c4corpus.deduplication.impl; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.junit.Test; + +/** + * Test for the static methods in SimHashUtils. + * + * NOTE: {@link #testHash()} and {@link #testSimHash()} have had their + * test values just copied from the results. They have *not* been verified + * as correct. + * + * @author Tom Morris + * + */ +public class SimHashUtilsTest { + + @Test + public void testCreateCharGramsShingles() { + String testString = "abcdefghi"; + String[] refShingles = { "abcdefg", "bcdefgh", "cdefghi", }; + Set refSet = new HashSet(Arrays.asList(refShingles)); + assertEquals(refSet, SimHashUtils.createCharGramsShingles(testString)); + } + + @Test + public void testHash() { + String testString = "abcdefghi"; + // FIXME: Verify these hashes are correct + Integer[] refHashes = {-289204219, 627882918, -1206291356}; + Set refSet = new HashSet(Arrays.asList(refHashes)); + Set shingles = SimHashUtils.createCharGramsShingles(testString); + Set hashes = SimHashUtils.hash(shingles); + assertEquals(refSet, hashes); + } + + @Test + public void testDiffOfBits() { + assertEquals(1, SimHashUtils.diffOfBits(0x1L, 0x0L)); + } + + @Test + public void testComputeHashIndex() { + long hash = 0X0800040002000100L; + String[] refSlices = { "0_{8}", "1_{9}", "2_{10}", "3_{11}" }; + Set referenceSet = new HashSet(Arrays.asList(refSlices)); + + Set slices = SimHashUtils.computeHashIndex(hash); + assertEquals(referenceSet, slices); + } + + @Test + public void testSliceHash() { + long hash = 0X0800040002000100L; + long[] refSlices = { + 0X0000000000000100L, + 0X0000000002000000L, + 0X0000040000000000L, + 0X0800000000000000L, + }; + long[] slices = SimHashUtils.sliceHash(hash); + assertArrayEquals(refSlices, slices); + } + + @Test + public void testSimHash() { + Set hashValues = SimHashUtils.hash( + SimHashUtils.createCharGramsShingles("abcdefghi")); + // FIXME: Verify that this simhash is correct + assertEquals(-6032228495725610972L, SimHashUtils.simHash(hashValues)); + } + +}