-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathUSPTOLogger.py
More file actions
510 lines (436 loc) · 24.8 KB
/
USPTOLogger.py
File metadata and controls
510 lines (436 loc) · 24.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
# USPTOLogger.py
# USPTO Bulk Data Parser - Processes for Managing Logs
# Description: Processes handles log files.
# Author: Joseph Lee
# Email: joseph@ripplesoftware.ca
# Website: www.ripplesoftware.ca
# Github: www.github.com/rippledj/uspto
# Import Python Modules
import logging
import traceback
import time
import os
import sys
import pprint
# Import USPTO Parser Functions
import USPTOProcessLinks
# Setup logging
def setup_logger(log_level, log_file):
    """Configure the shared 'USPTO_Database_Construction' logger.

    Attaches a FileHandler writing to `log_file` with a timestamped format
    and maps the numeric verbosity (1=ERROR, 2=WARNING, 3=INFO) onto the
    logging module's levels.  Any other value leaves the level untouched.
    """
    logger = logging.getLogger('USPTO_Database_Construction')
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(file_handler)
    # Translate the numeric verbosity into a logging level
    verbosity_levels = {1: logging.ERROR, 2: logging.WARNING, 3: logging.INFO}
    if log_level in verbosity_levels:
        logger.setLevel(verbosity_levels[log_level])
# Check the args_array log_lock_file and switch and write file as 'Processed'
# TODO accept a passed arg to also write the log as processing, if needed by
# to balance loads using log file in main_process.
def write_process_log(args_array):
    """Mark args_array['url_link'] as 'Processed' in its document-type log.

    Spins on the shared lock file (args_array['log_lock_file']: '0' = free,
    '1' = held) so that only one process rewrites a log at a time, then
    rewrites the CSV log for args_array['document_type'], replacing the
    status field of the line whose first field matches the URL.
    """
    # Set the document type for processing
    document_type = args_array['document_type']
    # Import Logger
    logger = logging.getLogger("USPTO_Database_Construction")
    # Print message to stdout and log file
    print("Updating the log for processed file: " + args_array['url_link'])
    logger.info("Updating the log for processed file: " + args_array['url_link'])
    try:
        # Select the log file to rewrite based on the document_type passed
        log_file_keys = {
            "grant": 'grant_process_log_file',
            "application": 'application_process_log_file',
            "class": 'classification_process_log_file',
            "PAIR": 'pair_process_log_file',
            "legal": 'legal_process_log_file',
        }
        log_file_to_rewrite = args_array[log_file_keys[document_type]]
        while True:
            # Open log_lock_file to check status
            print("-- File locking log file for complete " + document_type + " file: " + args_array['url_link'])
            logger.info("-- File locking log file for complete " + document_type + " file: " + args_array['url_link'])
            with open(args_array["log_lock_file"], "r") as log_lock:
                locked_status = log_lock.read().strip()
            # If another process holds the lock, wait one second and retry
            if locked_status != "0":
                time.sleep(1)
                continue
            # Claim the lock by writing it closed
            with open(args_array["log_lock_file"], "w") as log_lock:
                log_lock.write("1")
            try:
                # Read the whole log as an array of CSV lines
                with open(log_file_to_rewrite, "r") as log_file:
                    log_file_data_array = log_file.readlines()
                # Rebuild every line, flipping the matching URL to 'Processed'
                log_rewrite_array = []
                for raw_line in log_file_data_array:
                    fields = raw_line.split(",")
                    if fields[0] == args_array["url_link"]:
                        print("-- Found the URL link in log file")
                        logger.info("-- Found the URL link: " + args_array['url_link'] + " in log file")
                        log_rewrite_array.append([fields[0], fields[1], "Processed\n"])
                    else:
                        log_rewrite_array.append(fields)
                # Rewrite the new array to the log file in csv
                with open(log_file_to_rewrite, "w") as log_file:
                    for fields in log_rewrite_array:
                        log_file.write(",".join(fields))
            finally:
                # Always release the lock -- the original left it held ('1')
                # when an exception fired mid-rewrite, deadlocking every
                # sibling process waiting on the lock file.
                with open(args_array["log_lock_file"], "w") as log_lock:
                    log_lock.write("0")
            # Print message to stdout and log file
            print("-- Log updated for processed file: " + args_array['url_link'])
            logger.info("-- Log updated for processed file: " + args_array['url_link'])
            break
    except Exception as e:
        # Print and log general fail comment
        print("-- Exception during the writing of " + document_type + " log file: " + args_array['url_link'])
        logger.error("-- Exception during the writing of " + document_type + " log file: " + args_array['url_link'])
        traceback.print_exc()
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())
# Check the args_array log_lock_file and switch and write file as 'Verified'
# TODO accept a passed arg to also write the log as processing, if needed by
# to balance loads using log file in main_process.
def write_verified_log(args_array):
    """Mark args_array['url_link'] as 'Processed,Verified' in its log.

    Same lock-file protocol as write_process_log: waits until the shared
    lock file (args_array['log_lock_file']) reads '0', claims it with '1',
    rewrites the CSV log for args_array['document_type'] with the matching
    URL's status replaced by 'Processed,Verified', then releases the lock.
    """
    # Set the document type for processing
    document_type = args_array['document_type']
    logger = logging.getLogger("USPTO_Database_Construction")
    # Print message to stdout and log file
    print("[Updating the log for verified file: " + args_array['url_link'] + "]")
    logger.info("[Updating the log for verified file: " + args_array['url_link'] + "]")
    # Select the log file to rewrite based on the document_type passed
    log_file_keys = {
        "grant": 'grant_process_log_file',
        "application": 'application_process_log_file',
        "class": 'classification_process_log_file',
        "PAIR": 'pair_process_log_file',
        "legal": 'legal_process_log_file',
    }
    log_file_to_rewrite = args_array[log_file_keys[document_type]]
    # Hold the loop until the log has been modified
    while True:
        # Open log_lock_file to check status
        with open(args_array["log_lock_file"], "r") as log_lock:
            locked_status = log_lock.read().strip()
        # If another process holds the lock, wait one second and retry
        if locked_status != "0":
            time.sleep(1)
            continue
        # Claim the lock by writing it closed
        with open(args_array["log_lock_file"], "w") as log_lock:
            log_lock.write("1")
        try:
            # Read the whole log as an array of CSV lines
            with open(log_file_to_rewrite, "r") as log_file:
                log_file_data_array = log_file.readlines()
            # Rebuild every line, flipping the matching URL to verified
            log_rewrite_array = []
            for raw_line in log_file_data_array:
                fields = raw_line.split(",")
                if fields[0] == args_array["url_link"]:
                    print("-- Found the URL link in log file")
                    log_rewrite_array.append([fields[0], fields[1], "Processed", "Verified\n"])
                else:
                    log_rewrite_array.append(fields)
            # Rewrite the new array to the log file in csv
            with open(log_file_to_rewrite, "w") as log_file:
                for fields in log_rewrite_array:
                    log_file.write(",".join(fields))
        finally:
            # Always release the lock -- the original had no exception path
            # here at all, so any failure mid-rewrite left the lock set to
            # '1' forever and deadlocked every other worker.
            with open(args_array["log_lock_file"], "w") as log_lock:
                log_lock.write("0")
        # Print message to stdout and log file
        print("[Log updated for verified file: " + args_array['url_link'] + "]")
        logger.info("[Log updated for verified file: " + args_array['url_link'] + "]")
        break
# Write all log links to files
def write_link_arrays_to_file(all_links_array, args_array):
    """Write each category of collected source-data links to its CSV log.

    Each log line is '<url>,<date>,Unprocessed'.  A log file that already
    exists is left untouched so previously recorded processing state is
    never clobbered.
    """
    # Import logger
    logger = logging.getLogger("USPTO_Database_Construction")
    # Log writing of link arrays to file
    logger.info('Writing all required links to file ' + time.strftime("%c"))
    # Map each all_links_array key to the args_array key of its log file
    link_targets = (
        ("grants", 'grant_process_log_file'),
        ("applications", 'application_process_log_file'),
        ("classifications", 'classification_process_log_file'),
        ("PAIR", 'pair_process_log_file'),
        ("legal", 'legal_process_log_file'),
    )
    # Write all required links into file, one log per category
    for links_key, file_key in link_targets:
        log_path = args_array[file_key]
        # Only create the log if it does not already exist
        if not os.path.isfile(log_path):
            with open(log_path, "w") as out_file:
                for item in all_links_array[links_key]:
                    out_file.write(item[0] + "," + item[1] + ",Unprocessed\n")
    # Write finished message to log (typo 'Finshed' fixed)
    logger.info('Finished writing all patent data links to files. Finished Time: ' + time.strftime("%c"))
    print("Finished writing all patent data links to files. Finished Time: " + time.strftime("%c"))
# Write all log links to files
def update_link_arrays_to_file(all_links_array, args_array):
    """Merge newly discovered grant/application links into the existing logs.

    Reads the grant and application process logs, appends any link from
    `all_links_array` whose URL is not already present (status
    'Unprocessed'), and rewrites both logs in place.
    """
    # Import logger
    logger = logging.getLogger("USPTO_Database_Construction")
    print('Updating all source data links to file ' + time.strftime("%c"))
    logger.info('Updating all source data links to file ' + time.strftime("%c"))
    # Open files and read in data to check lines for links that exist already
    with open(args_array['grant_process_log_file'], "r") as grant_process_file:
        grant_process_data_array = grant_process_file.readlines()
    print(str(len(grant_process_data_array)) + " existing grant links were found in the log file")
    with open(args_array['application_process_log_file'], "r") as application_process_file:
        application_process_data_array = application_process_file.readlines()
    print(str(len(application_process_data_array)) + " existing application links were found in the log file")

    def _merge_new_links(existing_lines, new_links, label):
        # Append any link not already present; O(1) set membership on the
        # first CSV field replaces the original O(n^2) nested scan.
        known_urls = {line.split(",")[0] for line in existing_lines}
        for new_item in new_links:
            if new_item[0] not in known_urls:
                print("- New patent " + label + " data file found..." + new_item[0])
                # BUGFIX: new links were previously appended with status
                # 'Processed', which made collect_all_required_links_from_file
                # skip them forever -- defeating the point of an update.
                existing_lines.append(new_item[0] + "," + new_item[1] + ",Unprocessed\n")
                known_urls.add(new_item[0])

    # Check if new found grant and application links exist already in file
    _merge_new_links(grant_process_data_array, all_links_array['grants'], "grant")
    _merge_new_links(application_process_data_array, all_links_array['applications'], "application")
    # Write the merged arrays back to the original log files
    with open(args_array['grant_process_log_file'], "w") as grant_process_file:
        grant_process_file.writelines(grant_process_data_array)
    print('Updated grant links written to log file ' + time.strftime("%c"))
    logger.info('Updated grant links written to log file ' + time.strftime("%c"))
    with open(args_array['application_process_log_file'], "w") as application_process_file:
        application_process_file.writelines(application_process_data_array)
    print('Updated application links written to log file ' + time.strftime("%c"))
    logger.info('Updated application links written to log file ' + time.strftime("%c"))
    # Typo 'Finshed' fixed in the completion message
    print("Finished updating all patent grant and application links to log files. Finished Time: " + time.strftime("%c"))
    logger.info('Finished updating all patent grant and application links to log files ' + time.strftime("%c"))
# Collect all links from file
def collect_all_required_links_from_file(args_array):
    """Read the five process logs and return the links still requiring work.

    In 'verify' mode (when 'verify' is in args_array['command_args']) this
    selects 3-field lines whose status is 'Processed' (i.e. processed but
    not yet verified); otherwise it selects every line whose status is not
    'Processed'.  Returns a dict keyed 'grants', 'applications',
    'classifications', 'PAIR', 'legal', or False on failure.
    """
    logger = logging.getLogger("USPTO_Database_Construction")

    def _collect_from_log(log_path, verify_mode):
        # Return the CSV-split lines of one log that still need attention.
        selected = []
        with open(log_path, "r") as log_file:
            for line in log_file:
                # Ignore empty lines.  The classification loop in the
                # original skipped this guard, so a blank line raised an
                # IndexError and the whole function returned False.
                if line.strip() == "":
                    continue
                fields = line.split(",")
                status = fields[2].replace("\n", "")
                if verify_mode:
                    # Collect processed files that are not verified already
                    if status == "Processed" and len(fields) == 3:
                        selected.append(fields)
                elif status != "Processed":
                    # Parsing bulk-data: collect all unprocessed files
                    selected.append(fields)
        return selected

    print('Reading all required links to download and parse ' + time.strftime("%c"))
    logger.info('Reading all required links to download and parse ' + time.strftime("%c"))
    try:
        verify_mode = "verify" in args_array['command_args']
        collected = {}
        # (label used in messages, args_array key, returned dict key)
        for label, file_key, out_key in (
            ("grant", 'grant_process_log_file', "grants"),
            ("application", 'application_process_log_file', "applications"),
            ("classification", 'classification_process_log_file', "classifications"),
            ("PAIR", 'pair_process_log_file', "PAIR"),
            ("legal", 'legal_process_log_file', "legal"),
        ):
            print('Reading all required ' + label + ' links ' + time.strftime("%c"))
            logger.info('Reading all required ' + label + ' links ' + time.strftime("%c"))
            collected[out_key] = _collect_from_log(args_array[file_key], verify_mode)
        print('Finished reading all required links to download and parse ' + time.strftime("%c"))
        logger.info('Finished reading all required links to download and parse ' + time.strftime("%c"))
        # Return the array to main function
        return collected
    except Exception as e:
        print("Failed to get all links from log files " + time.strftime("%c"))
        traceback.print_exc()
        logger.error('Failed to get all links from log files ' + time.strftime("%c"))
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error(str(e) + str(exc_type) + str(fname) + str(exc_tb.tb_lineno))
        return False
# Collect all links, or update with new links to log files
def build_or_update_link_files(args_array):
    """Create the link log files if any are missing, or refresh them.

    If any of the five process log files does not exist, fetches all links
    via USPTOProcessLinks.get_all_links and writes fresh logs (exits first
    when running in 'verify' mode, since verification requires the logs to
    exist).  Otherwise, when 'update' is in args_array['command_args'],
    re-fetches the links and merges new ones into the existing logs.
    """
    logger = logging.getLogger("USPTO_Database_Construction")
    # Check if link log files exists already
    # If not exists, then find and write all links to file
    #TODO: what if only one log file is missing because I deleted it
    if (not os.path.isfile(args_array['grant_process_log_file'])
            or not os.path.isfile(args_array['application_process_log_file'])
            or not os.path.isfile(args_array['classification_process_log_file'])
            or not os.path.isfile(args_array['legal_process_log_file'])
            or not os.path.isfile(args_array['pair_process_log_file'])):
        # If verification is command then files need to be there already.
        # If they are not, exit execution
        if "verify" in args_array['command_args']:
            print("No existing link file lists found. Exiting verification process. " + time.strftime("%c"))
            logger.info('No existing link file lists found. Exiting verification process. ' + time.strftime("%c"))
            # sys.exit() instead of the site-injected exit() builtin, which
            # is not guaranteed to be available outside interactive sessions
            sys.exit()
        print("No existing link file lists found. Creating them now. " + time.strftime("%c"))
        logger.info('No existing link file lists found. Creating them now. ' + time.strftime("%c"))
        try:
            # Get List of all links
            all_links_array = USPTOProcessLinks.get_all_links(args_array)
            if args_array['stdout_level'] == 3: print(all_links_array)
            write_link_arrays_to_file(all_links_array, args_array)
        except Exception as e:
            print("Failed to get all links from USPTO bulk data site " + time.strftime("%c"))
            traceback.print_exc()
            logger.error('Failed to get all links from USPTO bulk data site ' + time.strftime("%c"))
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error(str(e) + str(exc_type) + str(fname) + str(exc_tb.tb_lineno))
    # Else if the update arg has been passed then update all links files before starting main function
    elif "update" in args_array['command_args']:
        print("Updating process log files... " + time.strftime("%c"))
        logger.info('Updating process log files... ' + time.strftime("%c"))
        try:
            # Get List of all links and update the existing links based on found links
            all_links_array = USPTOProcessLinks.get_all_links(args_array)
            # NOTE(review): this branch dumps the array at stdout_level 1
            # while the create branch above uses level 3 -- looks like a
            # typo; confirm the intended verbosity before unifying.
            if args_array['stdout_level'] == 1: print(all_links_array)
            update_link_arrays_to_file(all_links_array, args_array)
        except Exception as e:
            print("Failed to get all links from USPTO bulk data site " + time.strftime("%c"))
            traceback.print_exc()
            logger.error('Failed to get all links from USPTO bulk data site ' + time.strftime("%c"))
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error(str(e) + str(exc_type) + str(fname) + str(exc_tb.tb_lineno))