-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_f_placer.py
More file actions
52 lines (41 loc) · 2.04 KB
/
_f_placer.py
File metadata and controls
52 lines (41 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import csv
import multiprocessing
from _f_indexer import forward_indexer
from os import listdir
from os.path import os
from os.path import isfile, join
from _helper import docs_path, indexed_docs_path, f_index_path, stopwords_path, docs_subdir_log, \
read_doc_sub_directories
from _helper import get_out_path_for_f_index
output_path = ""
sub_d = ""
while 1:
# get name of all documents in the directory which contain only those documents that are indexed or are to be
# indexed
docs = [fa for fa in listdir(docs_path) if isfile(join(docs_path, fa))]
# docs_indexed contain name of the documents in doc_path/docs that are already indexed
# indexed_docs_path contain indexed docs
docs_indexed = [f for f in listdir(indexed_docs_path) if isfile(join(indexed_docs_path, f))]
# print(docs_indexed)
# exit(0)
# extracting name of folder in f_index directory
# f_index directory contain forward indices
# if data set contain multiple files then f_index contain folder for each file and in it contains its forward_index
dir_dict = {}
for doc in docs:
f_index_folder = [dI for dI in os.listdir(f_index_path) if os.path.isdir(os.path.join(f_index_path, dI))]
# print(f_index_folder)
# exit(0)
# reading doc_subdirectories file to get name of all files that are indexed
dir_dict = read_doc_sub_directories()
# if doc not in indexed directory then it needs to be indexed
if doc not in dir_dict:
# if f_index list is empty it means no folder in directory
output_path, sub_d = get_out_path_for_f_index(doc, f_index_folder)
# docs_path_n is the path of nth doc in docs_path directory
docs_path_n = docs_path + "\\" + str(doc)
forward_indexer(stopwords_path, docs_path_n, output_path)
# against new forward index document write name of the folder it is stored in
with open(docs_subdir_log, 'a') as out:
writer = csv.writer(out)
writer.writerow([str(doc), sub_d])