-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathflora3.py
More file actions
103 lines (84 loc) · 2.67 KB
/
flora3.py
File metadata and controls
103 lines (84 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# テスト
# 新規作成 2024/3/3
# Create by 乃木坂好きのITエンジニア
# ライブラリのインポート
import re
import openpyxl as op
import shutil
import os
#項目を取得
pattern1 = '<td class="title">'
#内容を取得
patturn2 = '<td class="content">'
patturn3 = '</td>'
patturn4 = '<br/>'
# pタグを取得
patturn5 = '<p>'
# データ切り替えレコード変数の定義
record1 = '<td class="content">'
record2 = '<td class="title">事業内容</td>'
# 入力ファイル
input_file = "flora.txt"
# テンプレートエクセルファイル
template_excel_file = "会社概要.xlsx"
# 会社名、ホールディング名判定カウント
w_count = 0
# pタグ会社名、ホールディング名判定カウント
w_p_count = 0
w_p_company_content = ""
w_p_holding_content = ""
# タイトルのデータクレンジング関数
def title_cleansing(line):
w_array1 = line.split(">")
w_item = w_array1[1]
w_item = w_item.replace('</td',"")
return w_item
# 会社内容のデータクレンジング関数
def content_cleansing(line):
w_array2 = line.split(">")
w_content = w_array2[1]
w_content = w_content.replace('<a href="',"")
w_content = w_content.replace('tel:',"")
w_content = w_content.replace('mailto:',"")
w_content = w_content.replace('</td',"")
w_content = w_content.replace('<br/',"")
w_content = w_content.replace('"',"")
return w_content
# pタグ内容のデータクレンジング関数
def p_tag_cleansing(line):
w_array3 = line.split(">")
w_p_content = w_array3[1]
w_p_content = w_p_content.replace('</p',"")
w_p_content = w_p_content.replace('・'," ")
return w_p_content
# メイン処理
#タグ抽出ファイルを開く
file_data = open(input_file,"r",encoding="utf-8")
for line in file_data:
line = line.replace("\n","")
result1 = re.match(pattern1,line)
result2 = re.match(patturn2,line)
result3 = re.search(patturn3,line)
result4 = re.search(patturn4,line)
result5 = re.search(patturn5,line)
if line == record1:
w_p_count += 1
if line == record2:
w_p_count = 0
if result1:
w_count = 0
w_item = title_cleansing(line)
if (result2 and result3) or (result2 and result4):
w_content = content_cleansing(line)
if w_count == 0:
w_company_content = w_content
elif w_count == 1:
w_holding_content = w_content
w_count += 1
if result5:
w_p_content = p_tag_cleansing(line)
print(w_p_content)
if w_p_count == 1:
w_p_company_content += w_p_content
elif w_p_count == 2:
w_p_holding_content += w_p_content