Skip to content

Commit bfa2fb6

Browse files
committed
merge TPC-AI#8
2 parents 6dbcfbc + 0f263c1 commit bfa2fb6

4 files changed

Lines changed: 18 additions & 14 deletions

File tree

deduplication/__main__.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,12 +9,12 @@
99
if args.mode == "bloom":
1010
if args.single:
1111
assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
12-
dedup_single_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
12+
dedup_single_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing, skip_insertion=args.skip_insertion)
1313
elif args.multi:
14-
dedup_multi_bloom(args.input, args.minhash_dir, args.num, args.fp, args.output_file, args.name, args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
14+
dedup_multi_bloom(args.input, args.minhash_dir, args.num, args.fp, args.output_file, args.name, args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing, skip_insertion=args.skip_insertion)
1515
else:
1616
assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
17-
dedup_single_file_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
17+
dedup_single_file_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing, skip_insertion=args.skip_insertion)
1818
else:
1919
if args.single:
2020
assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"

deduplication/args.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ def parse_args():
101101
)
102102
parser.add_argument(
103103
"--skip-insertion",
104-
help="If set, will skip inserting entries to index. THis is a MOCK ARG",
104+
help="If set, will skip inserting unique documents into the index (works only with LSHBloom)",
105105
action="store_true"
106106
)
107107

deduplication/lshbloom.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ def __init__(self, minhash_dir: str, lsh_params: Dict):
3131
self.minhash_dir = minhash_dir
3232
self.lsh = MinHashLSHBloom(**lsh_params)
3333

34-
def deduplicate_corpus(self) -> List[Tuple[str]]:
34+
def deduplicate_corpus(self, skip_insertion: bool = False) -> List[Tuple[str]]:
3535
"""
3636
Deduplicates documents in the given corpus and adds them to the LSH index if appropriate.
3737
Documents without existing duplicates will be stored in the LSH index for future deduplication.
@@ -45,12 +45,12 @@ def deduplicate_corpus(self) -> List[Tuple[str]]:
4545
if f.endswith(".pkl")
4646
]
4747
for minhashfile in minhash_files:
48-
dups = self.deduplicate_minhash_file(minhashfile)
48+
dups = self.deduplicate_minhash_file(minhashfile, skip_insertion=skip_insertion)
4949
duplicate_list.extend(dups)
5050

5151
return duplicate_list
5252

53-
def deduplicate_and_insert(self, params: Tuple) -> List[Tuple[str]]:
53+
def deduplicate_and_insert(self, params: Tuple, skip_insertion: bool = False) -> List[Tuple[str]]:
5454
"""
5555
Deduplicates a MinHash signature corresponding to a document using the provided LSH index.
5656
If the document is not duplicated in the LSH index, it is added to the index.
@@ -67,13 +67,13 @@ def deduplicate_and_insert(self, params: Tuple) -> List[Tuple[str]]:
6767

6868
# insert if not duplicated in index
6969
if not result:
70-
# WARNING YADU: Hack! We are skipping insertion
71-
# self.lsh.insert(m_query)
70+
if not skip_insertion:
71+
self.lsh.insert(m_query)
7272
return None
7373

7474
return [(key,)]
7575

76-
def deduplicate_minhash_file(self, minhashfile: str) -> List[Tuple[str]]:
76+
def deduplicate_minhash_file(self, minhashfile: str, skip_insertion: bool = False) -> List[Tuple[str]]:
7777
"""
7878
Deduplicates documents in the given minhash file and adds them to the LSH index if appropriate.
7979
Documents without existing duplicates will be stored in the LSH index for future deduplication.
@@ -92,7 +92,7 @@ def deduplicate_minhash_file(self, minhashfile: str) -> List[Tuple[str]]:
9292
# can't multiprocess here as insertion requires C++ dependencies that are not compatible with pickle
9393
with tqdm(total=len(minhash_list), desc=fname) as pbar:
9494
for i in range(len(minhash_list)):
95-
result = self.deduplicate_and_insert(minhash_list[i])
95+
result = self.deduplicate_and_insert(minhash_list[i], skip_insertion=skip_insertion)
9696
if result:
9797
duplicate_list.extend(result)
9898
pbar.update()

deduplication/workflows.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@ def dedup_single_bloom(
121121
save_dir: str = "./",
122122
compute_minhashes: bool = True,
123123
clear: bool = False,
124+
skip_insertion: bool = False,
124125
):
125126
if clear:
126127
clear_dir(save_dir)
@@ -138,7 +139,7 @@ def dedup_single_bloom(
138139
m.process()
139140

140141
index = LSHBloom(minhash_dir, lsh_params)
141-
duplicates = index.deduplicate_corpus()
142+
duplicates = index.deduplicate_corpus(skip_insertion=skip_insertion)
142143
write_duplicates_to_csv(duplicates, csvfile, corpus_name, header=["dup_key"])
143144

144145

@@ -155,6 +156,7 @@ def dedup_multi_bloom(
155156
save_dir: str = "./",
156157
compute_minhashes: bool = True,
157158
clear: bool = False,
159+
skip_insertion: bool = False,
158160
):
159161
assert len(input_dirs) == len(minhash_dirs) == len(corpus_names), \
160162
f"Expected len(input_dirs) == len(minhash_dirs) == len(corpus_names), got {len(input_dirs)}, {len(minhash_dirs)}, {len(corpus_names)}"
@@ -174,7 +176,8 @@ def dedup_multi_bloom(
174176
n_hash_funcs,
175177
save_dir,
176178
compute_minhashes,
177-
clear=False
179+
clear=False,
180+
skip_insertion=skip_insertion
178181
)
179182

180183
def dedup_single_file_bloom(
@@ -189,6 +192,7 @@ def dedup_single_file_bloom(
189192
save_dir: str = "./",
190193
compute_minhashes: bool = True,
191194
clear: bool = False,
195+
skip_insertion: bool = False,
192196
):
193197
if clear:
194198
clear_dir(save_dir)
@@ -208,5 +212,5 @@ def dedup_single_file_bloom(
208212
fname = input_file.split("/")[-1]
209213
minhash_file = f"{minhash_dir}/{fname[:-6]}.pkl"
210214
index = LSHBloom(minhash_dir, lsh_params)
211-
duplicates = index.deduplicate_minhash_file(minhash_file)
215+
duplicates = index.deduplicate_minhash_file(minhash_file, skip_insertion=skip_insertion)
212216
write_duplicates_to_csv(duplicates, csvfile, corpus_name, header=["dup_key"])

0 commit comments

Comments
 (0)