Skip to content

Commit

Permalink
Adds bindings for arctic topics (castorini#2666)
Browse files: browse the repository at this point in the history
  • Loading branch information
UShivani3 authored Dec 21, 2024
1 parent 65b6c6e commit 6a9cacf
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 2 deletions.
8 changes: 8 additions & 0 deletions src/main/java/io/anserini/search/topicreader/Topics.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,20 +80,26 @@ public enum Topics {
TREC2021_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl21.unicoil-noexp.0shot.tsv.gz"),
TREC2021_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl21.splade-pp-ed.tsv.gz"),
TREC2021_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl21.splade-pp-sd.tsv.gz"),
TREC2021_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl21.snowflake-arctic-embed-l.jsonl.gz"),
TREC2022_DL(TsvIntTopicReader.class,"topics.dl22.txt"),
TREC2022_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl22.unicoil.0shot.tsv.gz"),
TREC2022_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl22.unicoil-noexp.0shot.tsv.gz"),
TREC2022_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl22.splade-pp-ed.tsv.gz"),
TREC2022_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl22.splade-pp-sd.tsv.gz"),
TREC2022_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl22.snowflake-arctic-embed-l.jsonl.gz"),
TREC2023_DL(TsvIntTopicReader.class, "topics.dl23.txt"),
TREC2023_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl23.unicoil.0shot.tsv.gz"),
TREC2023_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl23.unicoil-noexp.0shot.tsv.gz"),
TREC2023_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl23.splade-pp-ed.tsv.gz"),
TREC2023_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl23.splade-pp-sd.tsv.gz"),
TREC2023_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl23.snowflake-arctic-embed-l.jsonl.gz"),

TREC2024_RAG_RAGGY_DEV(TsvIntTopicReader.class, "topics.rag24.raggy-dev.txt"),
TREC2024_RAG_RAGGY_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.rag24.raggy-dev.snowflake-arctic-embed-l.jsonl.gz"),
TREC2024_RAG_RESEARCHY_DEV(TsvIntTopicReader.class, "topics.rag24.researchy-dev.txt"),
TREC2024_RAG_RESEARCHY_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.rag24.researchy-dev.snowflake-arctic-embed-l.jsonl.gz"),
TREC2024_RAG_TEST(TsvStringTopicReader.class, "topics.rag24.test.txt"),
TREC2024_RAG_TEST_SNOWFLAKE_ARCTIC_EMBED_L(JsonStringVectorTopicReader.class, "topics.rag24.test.snowflake-arctic-embed-l.jsonl.gz"),

// MS MARCO V1 topics
MSMARCO_DOC_DEV(TsvIntTopicReader.class,"topics.msmarco-doc.dev.txt"),
Expand All @@ -120,9 +126,11 @@ public enum Topics {
MSMARCO_V2_DOC_DEV(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.txt"),
MSMARCO_V2_DOC_DEV_UNICOIL(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.unicoil.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.msmarco-v2-doc.dev.snowflake-arctic-embed-l.jsonl.gz"),
MSMARCO_V2_DOC_DEV2(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.txt"),
MSMARCO_V2_DOC_DEV2_UNICOIL(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.unicoil.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.unicoil-noexp.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV2_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.msmarco-v2-doc.dev2.snowflake-arctic-embed-l.jsonl.gz"),
MSMARCO_V2_PASSAGE_DEV(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.txt"),
MSMARCO_V2_PASSAGE_DEV_UNICOIL(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.unicoil.0shot.tsv.gz"),
MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.tsv.gz"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void testIterateThroughAllEnums() {
String path = topic.path;
assertEquals(topic.readerClass, TopicReader.getTopicReaderClassByFile(path));
}
- assertEquals(477, cnt);
+ assertEquals(485, cnt);
}

@Test
Expand Down Expand Up @@ -887,6 +887,14 @@ public void testTREC21DL() throws IOException {
assertEquals(26369, topics.get(topics.firstKey()).get("title").split(" ").length);
assertEquals(1136769, (int) topics.lastKey());
assertEquals(27149, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.TREC2021_DL_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(477, topics.size());
assertEquals(2082, (int) topics.firstKey());
assertEquals("[-0.0054801227524876595", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1136769, (int) topics.lastKey());
assertEquals("[0.0038787610828876495", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand Down Expand Up @@ -933,6 +941,14 @@ public void testTREC22DL() throws IOException {
assertEquals(31052, topics.get(topics.firstKey()).get("title").split(" ").length);
assertEquals(2056473, (int) topics.lastKey());
assertEquals(33891, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.TREC2022_DL_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(500, topics.size());
assertEquals(588, (int) topics.firstKey());
assertEquals("[0.020797204226255417", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(2056473, (int) topics.lastKey());
assertEquals("[0.005524440202862024", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand Down Expand Up @@ -979,6 +995,14 @@ public void testTREC23DL() throws IOException {
assertEquals(163500, topics.get(topics.firstKey()).get("title").split(" ").length);
assertEquals(3100949, (int) topics.lastKey());
assertEquals(181700, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.TREC2023_DL_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(700, topics.size());
assertEquals(2000138, (int) topics.firstKey());
assertEquals("[0.001558756805025041", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(3100949, (int) topics.lastKey());
assertEquals("[0.014963677152991295", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand All @@ -993,6 +1017,14 @@ public void testTREC24_RAG_RAGGY_DEV() throws IOException {
assertEquals(3100918, (int) topics.lastKey());
assertEquals("Can older adults gain strength by training once per week?", topics.get(topics.lastKey()).get("title"));
assertEquals("Can older adults gain strength by training once per week?", topics.get(3100918).get("title"));

topics = TopicReader.getTopics(Topics.TREC2024_RAG_RAGGY_DEV_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(120, topics.size());
assertEquals(23287, (int) topics.firstKey());
assertEquals("[0.008992074057459831", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(3100918, (int) topics.lastKey());
assertEquals("[0.010409535840153694", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand All @@ -1007,6 +1039,14 @@ public void testTREC24_RAG_RESEARCHY_DEV() throws IOException {
assertEquals(1009569, (int) topics.lastKey());
assertEquals("how do video games improve problem solving", topics.get(topics.lastKey()).get("title"));
assertEquals("how do video games improve problem solving", topics.get(1009569).get("title"));

topics = TopicReader.getTopics(Topics.TREC2024_RAG_RESEARCHY_DEV_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(600, topics.size());
assertEquals(429, (int) topics.firstKey());
assertEquals("[0.03783365339040756", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1009569, (int) topics.lastKey());
assertEquals("[0.029290692880749702", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand All @@ -1021,6 +1061,14 @@ public void testTREC24_RAG_TEST() throws IOException {
assertEquals("2024-96485", topics.lastKey());
assertEquals("how would advance electronics course impact students", topics.get(topics.lastKey()).get("title"));
assertEquals("how the solar eclipse can affect mental health", topics.get("2024-79154").get("title"));

topics = TopicReader.getTopics(Topics.TREC2024_RAG_TEST_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(301, topics.size());
assertEquals("2024-105741", topics.firstKey());
assertEquals("[-0.009175633080303669", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals("2024-96485", topics.lastKey());
assertEquals("[0.017953362315893173", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand Down Expand Up @@ -1201,6 +1249,14 @@ public void testMSMARCO_V2() throws IOException {
assertEquals(1102390, (int) topics.lastKey());
assertEquals(533, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(4552, topics.size());
assertEquals(2, (int) topics.firstKey());
assertEquals("[0.02950862981379032", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1102390, (int) topics.lastKey());
assertEquals("[-0.04409797489643097", topics.get(topics.lastKey()).get("vector").split(",")[0]);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2);
assertNotNull(topics);
assertEquals(5000, topics.size());
Expand All @@ -1225,6 +1281,14 @@ public void testMSMARCO_V2() throws IOException {
assertEquals(1102413, (int) topics.lastKey());
assertEquals(537, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(5000, topics.size());
assertEquals(361, (int) topics.firstKey());
assertEquals("[0.002593959914520383", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1102413, (int) topics.lastKey());
assertEquals("[0.006848456338047981", topics.get(topics.lastKey()).get("vector").split(",")[0]);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV);
assertNotNull(topics);
assertEquals(3903, topics.size());
Expand Down

0 comments on commit 6a9cacf

Please sign in to comment.