stanford-futuredata · jenhsia · Mar 10, 2024 · Mar 10, 2024
diff --git a/colbert/data/collection.py b/colbert/data/collection.py
@@ -33,7 +33,9 @@ def _load_file(self, path):
         return self._load_tsv(path) if path.endswith('.tsv') else self._load_jsonl(path)
 
     def _load_tsv(self, path):
-        return load_collection(path)
+        """Load a TSV collection and keep the pid of every row on `self.pid_list`."""
+        passages, self.pid_list = load_collection(path)
+        return passages
 
     def _load_jsonl(self, path):
         raise NotImplementedError()

diff --git a/colbert/evaluation/loaders.py b/colbert/evaluation/loaders.py
@@ -156,13 +156,15 @@ def load_collection(collection_path):
     print_message("#> Loading collection...")
 
     collection = []
+    pid_list = []
 
     with open(collection_path) as f:
         for line_idx, line in enumerate(f):
             if line_idx % (1000*1000) == 0:
                 print(f'{line_idx // 1000 // 1000}M', end=' ', flush=True)
 
             pid, passage, *rest = line.strip('\n\r ').split('\t')
+            pid_list.append(pid)
             assert pid == 'id' or int(pid) == line_idx, f"pid={pid}, line_idx={line_idx}"
 
             if len(rest) >= 1:
@@ -173,7 +175,7 @@ def load_collection(collection_path):
 
     print()
 
-    return collection
+    return collection, pid_list
 
 
 def load_colbert(args, do_print=True):

diff --git a/colbert/indexing/collection_indexer.py b/colbert/indexing/collection_indexer.py
@@ -435,6 +435,30 @@ def _collect_embedding_id_offset(self):
         self.num_embeddings = embedding_offset
         assert len(self.embedding_offsets) == self.num_chunks
 
+    def _get_ivf(self, codes):
+        """
+        Group positions of `codes` by the partition id stored at each position.
+
+        Args:
+            codes: 1-D integer tensor; codes[i] is the partition id of position i and
+                must lie in [0, self.num_partitions).
+
+        Returns:
+            (ivf, ivf_lengths): `ivf` lists the positions belonging to partition 0, then
+            partition 1, and so on, in ascending position order within each partition;
+            `ivf_lengths[p]` is the number of positions stored for partition p. Both are
+            int64 tensors (also when `codes` is empty).
+        """
+        Run().print_main(f"Code size {codes.shape[0]}")
+
+        # A stable sort keeps positions ascending inside each partition, which is the
+        # order a per-element append loop would produce, but runs inside torch instead
+        # of calling .item() once per element and materialising Python lists.
+        ivf = torch.sort(codes, stable=True).indices
+        ivf_lengths = torch.bincount(codes, minlength=self.num_partitions)
+
+        return ivf, ivf_lengths
+
     def _build_ivf(self):
         # Maybe we should several small IVFs? Every 250M embeddings, so that's every 1 GB.
         # It would save *memory* here and *disk space* regarding the int64.
@@ -464,15 +488,12 @@ def _build_ivf(self):
 
             print_memory_stats(f'RANK:{self.rank}')
 
-        codes = codes.sort()
-        ivf, values = codes.indices, codes.values
-
         if self.verbose > 1:
             print_memory_stats(f'RANK:{self.rank}')
 
             Run().print_main(f"Getting unique codes...")
 
-        ivf_lengths = torch.bincount(values, minlength=self.num_partitions)
+        ivf, ivf_lengths = self._get_ivf(codes)
         assert ivf_lengths.size(0) == self.num_partitions
 
         if self.verbose > 1:

diff --git a/colbert/searcher.py b/colbert/searcher.py
@@ -37,6 +37,7 @@ def __init__(self, index, checkpoint=None, collection=None, config=None, index_r
         self.config = ColBERTConfig.from_existing(self.checkpoint_config, self.index_config, initial_config)
 
         self.collection = Collection.cast(collection or self.config.collection)
+        self.pid_list = self.idx2pid(self.config.collection)
         self.configure(checkpoint=self.checkpoint, collection=self.collection)
 
         self.checkpoint = Checkpoint(self.checkpoint, colbert_config=self.config, verbose=self.verbose)
@@ -49,6 +50,25 @@ def __init__(self, index, checkpoint=None, collection=None, config=None, index_r
         self.ranker = IndexScorer(self.index, use_gpu, load_index_with_mmap)
 
         print_memory_stats()
+
+    def idx2pid(self, collection_path):
+        """
+        Read the first tab-separated field (the pid) of every line of a TSV file.
+
+        Each line must contain at least two tab-separated fields; a line with fewer
+        raises ValueError from the unpacking below.
+
+        Returns:
+            list[str]: the pids in file order, so entry i belongs to line i.
+        """
+        pids = []
+        with open(collection_path) as tsv:
+            for row in tsv:
+                fields = row.strip('\n\r ').split('\t')
+                # Unpack rather than index so that short rows are still rejected.
+                pid, _passage, *_extra = fields
+                pids.append(pid)
+        return pids
 
     def configure(self, **kw_args):
         self.config.configure(**kw_args)