Skip to content

Commit

Permalink
Fix simhash slicing. Add tests. Fixes dkpro#19.
Browse files Browse the repository at this point in the history
Also includes a more efficient slicing algorithm that
could be used, but requires changes elsewhere in the system
  • Loading branch information
tfmorris committed Mar 25, 2016
1 parent 095760b commit bc739da
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,6 @@ public static int diffOfBits(long simHash1, long simHash2)
*/
public static Set<String> computeHashIndex(long docHash)
{

//band index
int bandIndex = 0;
//a band (window) used to store a part of the hash (represented in bits)
BitSet bitRange = new BitSet(BAND_WIDTH);
Expand All @@ -124,19 +122,34 @@ public static Set<String> computeHashIndex(long docHash)
Set<String> bandBitset = new HashSet<>();
//divide our HASH_LENGTH-bit hash into bit ranges of BandWidth bits
for (int i = 0; i < HASH_LENGTH; ++i) {
bitRange.set(bitsWidthCounter, ((docHash >> i) & 1) == 1);
if (bitsWidthCounter++ == BAND_WIDTH) {

bitRange.set(bitsWidthCounter++, ((docHash >> i) & 1) == 1);
if (bitsWidthCounter == BAND_WIDTH) {
bandBitset.add(bandIndex + "_" + bitRange.toString());

bitsWidthCounter = 0;
bitRange = new BitSet(BAND_WIDTH); // reset bitRange holder.
bitRange.clear();
bandIndex++;
}
}
return bandBitset;
}

/**
 * Splits a 64-bit hash into four 64-bit values, one per 16-bit band.
 * Element {@code k} of the result keeps bits {@code 16*k} through
 * {@code 16*k + 15} of the input in their original positions; every
 * other bit of that element is cleared.
 *
 * @param docHash the 64-bit hash to slice
 * @return a new four-element array; index 0 holds the lowest 16-bit band
 *         and index 3 the highest
 */
public static long[] sliceHash(long docHash)
{
    long[] slices = new long[4];
    // Start with the lowest 16 bits selected and slide the window up by
    // one band per iteration.
    long bandMask = 0xffffL;
    for (int band = 0; band < slices.length; band++) {
        slices[band] = docHash & bandMask;
        bandMask <<= 16;
    }
    return slices;
}


/**
* Compress the hashes of all the shingles of one document to a single
* fingerprint (SimHash) This implementation is based on the algorithm
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package de.tudarmstadt.ukp.dkpro.c4corpus.deduplication.impl;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.junit.Test;

/**
 * Unit tests for the static helper methods of SimHashUtils.
 *
 * NOTE: the expected values in {@link #testHash()} and {@link #testSimHash()}
 * were copied from the implementation's own output. They have *not* been
 * independently verified as correct.
 *
 * @author Tom Morris
 */
public class SimHashUtilsTest {

    /** Input text shared by the shingle, hash and simhash tests. */
    private static final String SAMPLE_TEXT = "abcdefghi";

    /** 64-bit hash with exactly one bit set in each 16-bit band. */
    private static final long SAMPLE_HASH = 0x0800040002000100L;

    @Test
    public void testCreateCharGramsShingles() {
        Set<String> expected = new HashSet<String>(
                Arrays.asList("abcdefg", "bcdefgh", "cdefghi"));
        Set<String> actual = SimHashUtils.createCharGramsShingles(SAMPLE_TEXT);
        assertEquals(expected, actual);
    }

    @Test
    public void testHash() {
        // FIXME: Verify these hashes are correct
        Set<Integer> expected = new HashSet<Integer>(
                Arrays.asList(-289204219, 627882918, -1206291356));
        Set<Integer> actual = SimHashUtils.hash(
                SimHashUtils.createCharGramsShingles(SAMPLE_TEXT));
        assertEquals(expected, actual);
    }

    @Test
    public void testDiffOfBits() {
        // The two inputs differ only in their lowest bit.
        int differingBits = SimHashUtils.diffOfBits(0x1L, 0x0L);
        assertEquals(1, differingBits);
    }

    @Test
    public void testComputeHashIndex() {
        // One entry per band: "<band index>_<BitSet.toString() of the band>".
        Set<String> expected = new HashSet<String>(
                Arrays.asList("0_{8}", "1_{9}", "2_{10}", "3_{11}"));
        Set<String> actual = SimHashUtils.computeHashIndex(SAMPLE_HASH);
        assertEquals(expected, actual);
    }

    @Test
    public void testSliceHash() {
        // Each slice keeps a single 16-bit band of SAMPLE_HASH in place.
        long[] expected = {
            0x0000000000000100L,
            0x0000000002000000L,
            0x0000040000000000L,
            0x0800000000000000L,
        };
        assertArrayEquals(expected, SimHashUtils.sliceHash(SAMPLE_HASH));
    }

    @Test
    public void testSimHash() {
        Set<String> shingles = SimHashUtils.createCharGramsShingles(SAMPLE_TEXT);
        Set<Integer> shingleHashes = SimHashUtils.hash(shingles);
        // FIXME: Verify that this simhash is correct
        assertEquals(-6032228495725610972L, SimHashUtils.simHash(shingleHashes));
    }

}

0 comments on commit bc739da

Please sign in to comment.