Skip to content

Commit

Permalink
Merge branch 'castorini:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
b8zhong authored Dec 22, 2024
2 parents 388cac5 + 6a9cacf commit 2db2549
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 4 deletions.
3 changes: 2 additions & 1 deletion docs/experiments-msmarco-passage.md
Original file line number Diff line number Diff line change
Expand Up @@ -542,4 +542,5 @@ The BM25 run with default parameters `k1=0.9`, `b=0.4` roughly corresponds to th
+ Results reproduced by [@sherloc512](https://github.com/sherloc512) on 2024-12-04 (commit [`9e55b1c`](https://github.com/castorini/anserini/commit/9e55b1c97fced46530dac1f78975d19635ffaf7a))
+ Results reproduced by [@zdann15](https://github.com/zdann15) on 2024-12-04 (commit [`9d311b4`](https://github.com/castorini/anserini/commit/9d311b4409a9ff3d79b01910178eaec3931f0abe))
+ Results reproduced by [@Alireza-Zwolf](https://github.com/Alireza-Zwolf) on 2024-12-15 (commit [`c7dff5f`](https://github.com/castorini/anserini/commit/c7dff5f8417905612ad9f97e85012440e9e16087))
+ Results reproduced by [@Linsen-gao-457](https://github.com/Linsen-gao-457) on 2024-12-17 (commit [`a86484a6`](https://github.com/castorini/anserini/commit/a86484a6e99a7a97966c423d230ad05279b24508))
+ Results reproduced by [@Linsen-gao-457](https://github.com/Linsen-gao-457) on 2024-12-17 (commit [`a86484a6`](https://github.com/castorini/anserini/commit/a86484a6e99a7a97966c423d230ad05279b24508))
+ Results reproduced by [@vincent-4](https://github.com/vincent-4) on 2024-12-20 (commit [`c619dc8`](https://github.com/castorini/anserini/commit/c619dc8d9ab28298251964053a927906e9957f51))
3 changes: 2 additions & 1 deletion docs/start-here.md
Original file line number Diff line number Diff line change
Expand Up @@ -426,4 +426,5 @@ If you think this guide can be improved in any way (e.g., you caught a typo or t
+ Results reproduced by [@sherloc512](https://github.com/sherloc512) on 2024-12-04 (commit [`9e55b1c`](https://github.com/castorini/anserini/commit/9e55b1c97fced46530dac1f78975d19635ffaf7a))
+ Results reproduced by [@zdann15](https://github.com/zdann15) on 2024-12-04 (commit [`9d311b4`](https://github.com/castorini/anserini/commit/9d311b4409a9ff3d79b01910178eaec3931f0abe))
+ Results reproduced by [@Alireza-Zwolf](https://github.com/Alireza-Zwolf) on 2024-12-15 (commit [`c7dff5f`](https://github.com/castorini/anserini/commit/c7dff5f8417905612ad9f97e85012440e9e16087))
+ Results reproduced by [@Linsen-gao-457](https://github.com/Linsen-gao-457) on 2024-12-16 (commit [`a86484a6`](https://github.com/castorini/anserini/commit/a86484a6e99a7a97966c423d230ad05279b24508))
+ Results reproduced by [@Linsen-gao-457](https://github.com/Linsen-gao-457) on 2024-12-16 (commit [`a86484a6`](https://github.com/castorini/anserini/commit/a86484a6e99a7a97966c423d230ad05279b24508))
+ Results reproduced by [@vincent-4](https://github.com/vincent-4) on 2024-12-20 (commit [`c619dc8`](https://github.com/castorini/anserini/commit/c619dc8d9ab28298251964053a927906e9957f51))
8 changes: 8 additions & 0 deletions src/main/java/io/anserini/search/topicreader/Topics.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,20 +80,26 @@ public enum Topics {
TREC2021_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl21.unicoil-noexp.0shot.tsv.gz"),
TREC2021_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl21.splade-pp-ed.tsv.gz"),
TREC2021_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl21.splade-pp-sd.tsv.gz"),
TREC2021_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl21.snowflake-arctic-embed-l.jsonl.gz"),
TREC2022_DL(TsvIntTopicReader.class,"topics.dl22.txt"),
TREC2022_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl22.unicoil.0shot.tsv.gz"),
TREC2022_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl22.unicoil-noexp.0shot.tsv.gz"),
TREC2022_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl22.splade-pp-ed.tsv.gz"),
TREC2022_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl22.splade-pp-sd.tsv.gz"),
TREC2022_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl22.snowflake-arctic-embed-l.jsonl.gz"),
TREC2023_DL(TsvIntTopicReader.class, "topics.dl23.txt"),
TREC2023_DL_UNICOIL(TsvIntTopicReader.class,"topics.dl23.unicoil.0shot.tsv.gz"),
TREC2023_DL_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.dl23.unicoil-noexp.0shot.tsv.gz"),
TREC2023_DL_SPLADE_PP_ED(TsvIntTopicReader.class,"topics.dl23.splade-pp-ed.tsv.gz"),
TREC2023_DL_SPLADE_PP_SD(TsvIntTopicReader.class,"topics.dl23.splade-pp-sd.tsv.gz"),
TREC2023_DL_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.dl23.snowflake-arctic-embed-l.jsonl.gz"),

TREC2024_RAG_RAGGY_DEV(TsvIntTopicReader.class, "topics.rag24.raggy-dev.txt"),
TREC2024_RAG_RAGGY_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.rag24.raggy-dev.snowflake-arctic-embed-l.jsonl.gz"),
TREC2024_RAG_RESEARCHY_DEV(TsvIntTopicReader.class, "topics.rag24.researchy-dev.txt"),
TREC2024_RAG_RESEARCHY_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.rag24.researchy-dev.snowflake-arctic-embed-l.jsonl.gz"),
TREC2024_RAG_TEST(TsvStringTopicReader.class, "topics.rag24.test.txt"),
TREC2024_RAG_TEST_SNOWFLAKE_ARCTIC_EMBED_L(JsonStringVectorTopicReader.class, "topics.rag24.test.snowflake-arctic-embed-l.jsonl.gz"),

// MS MARCO V1 topics
MSMARCO_DOC_DEV(TsvIntTopicReader.class,"topics.msmarco-doc.dev.txt"),
Expand All @@ -120,9 +126,11 @@ public enum Topics {
MSMARCO_V2_DOC_DEV(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.txt"),
MSMARCO_V2_DOC_DEV_UNICOIL(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.unicoil.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.msmarco-v2-doc.dev.snowflake-arctic-embed-l.jsonl.gz"),
MSMARCO_V2_DOC_DEV2(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.txt"),
MSMARCO_V2_DOC_DEV2_UNICOIL(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.unicoil.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics.msmarco-v2-doc.dev2.unicoil-noexp.0shot.tsv.gz"),
MSMARCO_V2_DOC_DEV2_SNOWFLAKE_ARCTIC_EMBED_L(JsonIntVectorTopicReader.class, "topics.msmarco-v2-doc.dev2.snowflake-arctic-embed-l.jsonl.gz"),
MSMARCO_V2_PASSAGE_DEV(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.txt"),
MSMARCO_V2_PASSAGE_DEV_UNICOIL(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.unicoil.0shot.tsv.gz"),
MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class, "topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.tsv.gz"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void testIterateThroughAllEnums() {
String path = topic.path;
assertEquals(topic.readerClass, TopicReader.getTopicReaderClassByFile(path));
}
assertEquals(477, cnt);
assertEquals(485, cnt);
}

@Test
Expand Down Expand Up @@ -887,6 +887,14 @@ public void testTREC21DL() throws IOException {
assertEquals(26369, topics.get(topics.firstKey()).get("title").split(" ").length);
assertEquals(1136769, (int) topics.lastKey());
assertEquals(27149, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.TREC2021_DL_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(477, topics.size());
assertEquals(2082, (int) topics.firstKey());
assertEquals("[-0.0054801227524876595", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1136769, (int) topics.lastKey());
assertEquals("[0.0038787610828876495", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand Down Expand Up @@ -933,6 +941,14 @@ public void testTREC22DL() throws IOException {
assertEquals(31052, topics.get(topics.firstKey()).get("title").split(" ").length);
assertEquals(2056473, (int) topics.lastKey());
assertEquals(33891, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.TREC2022_DL_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(500, topics.size());
assertEquals(588, (int) topics.firstKey());
assertEquals("[0.020797204226255417", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(2056473, (int) topics.lastKey());
assertEquals("[0.005524440202862024", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand Down Expand Up @@ -979,6 +995,14 @@ public void testTREC23DL() throws IOException {
assertEquals(163500, topics.get(topics.firstKey()).get("title").split(" ").length);
assertEquals(3100949, (int) topics.lastKey());
assertEquals(181700, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.TREC2023_DL_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(700, topics.size());
assertEquals(2000138, (int) topics.firstKey());
assertEquals("[0.001558756805025041", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(3100949, (int) topics.lastKey());
assertEquals("[0.014963677152991295", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand All @@ -993,6 +1017,14 @@ public void testTREC24_RAG_RAGGY_DEV() throws IOException {
assertEquals(3100918, (int) topics.lastKey());
assertEquals("Can older adults gain strength by training once per week?", topics.get(topics.lastKey()).get("title"));
assertEquals("Can older adults gain strength by training once per week?", topics.get(3100918).get("title"));

topics = TopicReader.getTopics(Topics.TREC2024_RAG_RAGGY_DEV_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(120, topics.size());
assertEquals(23287, (int) topics.firstKey());
assertEquals("[0.008992074057459831", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(3100918, (int) topics.lastKey());
assertEquals("[0.010409535840153694", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand All @@ -1007,6 +1039,14 @@ public void testTREC24_RAG_RESEARCHY_DEV() throws IOException {
assertEquals(1009569, (int) topics.lastKey());
assertEquals("how do video games improve problem solving", topics.get(topics.lastKey()).get("title"));
assertEquals("how do video games improve problem solving", topics.get(1009569).get("title"));

topics = TopicReader.getTopics(Topics.TREC2024_RAG_RESEARCHY_DEV_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(600, topics.size());
assertEquals(429, (int) topics.firstKey());
assertEquals("[0.03783365339040756", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1009569, (int) topics.lastKey());
assertEquals("[0.029290692880749702", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand All @@ -1021,6 +1061,14 @@ public void testTREC24_RAG_TEST() throws IOException {
assertEquals("2024-96485", topics.lastKey());
assertEquals("how would advance electronics course impact students", topics.get(topics.lastKey()).get("title"));
assertEquals("how the solar eclipse can affect mental health", topics.get("2024-79154").get("title"));

topics = TopicReader.getTopics(Topics.TREC2024_RAG_TEST_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(301, topics.size());
assertEquals("2024-105741", topics.firstKey());
assertEquals("[-0.009175633080303669", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals("2024-96485", topics.lastKey());
assertEquals("[0.017953362315893173", topics.get(topics.lastKey()).get("vector").split(",")[0]);
}

@Test
Expand Down Expand Up @@ -1201,6 +1249,14 @@ public void testMSMARCO_V2() throws IOException {
assertEquals(1102390, (int) topics.lastKey());
assertEquals(533, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(4552, topics.size());
assertEquals(2, (int) topics.firstKey());
assertEquals("[0.02950862981379032", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1102390, (int) topics.lastKey());
assertEquals("[-0.04409797489643097", topics.get(topics.lastKey()).get("vector").split(",")[0]);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2);
assertNotNull(topics);
assertEquals(5000, topics.size());
Expand All @@ -1225,6 +1281,14 @@ public void testMSMARCO_V2() throws IOException {
assertEquals(1102413, (int) topics.lastKey());
assertEquals(537, topics.get(topics.lastKey()).get("title").split(" ").length);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2_SNOWFLAKE_ARCTIC_EMBED_L);
assertNotNull(topics);
assertEquals(5000, topics.size());
assertEquals(361, (int) topics.firstKey());
assertEquals("[0.002593959914520383", topics.get(topics.firstKey()).get("vector").split(",")[0]);
assertEquals(1102413, (int) topics.lastKey());
assertEquals("[0.006848456338047981", topics.get(topics.lastKey()).get("vector").split(",")[0]);

topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV);
assertNotNull(topics);
assertEquals(3903, topics.size());
Expand Down

0 comments on commit 2db2549

Please sign in to comment.