@@ -31,7 +31,7 @@ def __init__(self, minhash_dir: str, lsh_params: Dict):
3131 self .minhash_dir = minhash_dir
3232 self .lsh = MinHashLSHBloom (** lsh_params )
3333
34- def deduplicate_corpus (self ) -> List [Tuple [str ]]:
34+ def deduplicate_corpus (self , skip_insertion : bool = False ) -> List [Tuple [str ]]:
3535 """
3636 Deduplicates documents in the given corpus and adds them to the LSH index if appropriate.
3737 Documents without existing duplicates will be stored in the LSH index for future deduplication.
@@ -45,12 +45,12 @@ def deduplicate_corpus(self) -> List[Tuple[str]]:
4545 if f .endswith (".pkl" )
4646 ]
4747 for minhashfile in minhash_files :
48- dups = self .deduplicate_minhash_file (minhashfile )
48+ dups = self .deduplicate_minhash_file (minhashfile , skip_insertion = skip_insertion )
4949 duplicate_list .extend (dups )
5050
5151 return duplicate_list
5252
53- def deduplicate_and_insert (self , params : Tuple ) -> List [Tuple [str ]]:
53+ def deduplicate_and_insert (self , params : Tuple , skip_insertion : bool = False ) -> List [Tuple [str ]]:
5454 """
5555 Deduplicates a MinHash signature corresponding to a document using the provided LSH index.
5656 If the document is not duplicated in the LSH index, it is added to the index.
@@ -67,13 +67,13 @@ def deduplicate_and_insert(self, params: Tuple) -> List[Tuple[str]]:
6767
6868 # insert if not duplicated in index
6969 if not result :
70- # WARNING YADU: Hack! We are skipping insertion
71- # self.lsh.insert(m_query)
70+ if not skip_insertion :
71+ self .lsh .insert (m_query )
7272 return None
7373
7474 return [(key ,)]
7575
76- def deduplicate_minhash_file (self , minhashfile : str ) -> List [Tuple [str ]]:
76+ def deduplicate_minhash_file (self , minhashfile : str , skip_insertion : bool = False ) -> List [Tuple [str ]]:
7777 """
7878 Deduplicate documents in the given minhash file and adds them to the LSH index if appropriate.
7979 Documents without existing duplicates will be stored in the LSH index for future deduplication.
@@ -92,7 +92,7 @@ def deduplicate_minhash_file(self, minhashfile: str) -> List[Tuple[str]]:
9292 # can't multiprocess here as insertion requires C++ dependencies that are not compatible with pickle
9393 with tqdm (total = len (minhash_list ), desc = fname ) as pbar :
9494 for i in range (len (minhash_list )):
95- result = self .deduplicate_and_insert (minhash_list [i ])
95+ result = self .deduplicate_and_insert (minhash_list [i ], skip_insertion = skip_insertion )
9696 if result :
9797 duplicate_list .extend (result )
9898 pbar .update ()
0 commit comments