Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion weibo_spider/downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ def download_one_file(self, url, file_path, weibo_id):
s = requests.Session()
s.mount(url,
HTTPAdapter(max_retries=self.file_download_timeout[0]))
from ..parser.util import get_proxies
downloaded = s.get(url,
timeout=(self.file_download_timeout[1],
self.file_download_timeout[2]))
self.file_download_timeout[2]),
proxies=get_proxies())
with open(file_path, 'wb') as f:
f.write(downloaded.content)
except Exception as e:
Expand Down
71 changes: 52 additions & 19 deletions weibo_spider/parser/info_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,64 @@ def extract_user_info(self):
sys.exit()
user.nickname = nickname

basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人']
en_list = [
'gender', 'location', 'birthday', 'description',
'verified_reason', 'talent'
]

# 先尝试标准格式(查看他人资料页)
basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
has_info = any(
':' in str(i) and str(i).split(':', 1)[0] in zh_list
for i in basic_info)

if not has_info:
# 自己查看自己的资料页:标签在<a>标签内,值在<a>的tail文本中
basic_info = []
for c_div in self.selector.xpath("//div[@class='c']"):
a_texts = c_div.xpath('a/text()')
if u'性别' in a_texts or u'昵称' in a_texts:
for a in c_div.xpath('a'):
label = (a.text or '').strip()
tail = (a.tail or '').strip()
if label in zh_list and tail.startswith(':'):
basic_info.append(label + tail)
break

for i in basic_info:
if i.split(':', 1)[0] in zh_list:
setattr(user, en_list[zh_list.index(i.split(':', 1)[0])],
i.split(':', 1)[1].replace('\u3000', ''))

experienced = self.selector.xpath("//div[@class='tip'][2]/text()")
if experienced and experienced[0] == u'学习经历':
user.education = self.selector.xpath(
"//div[@class='c'][4]/text()")[0][1:].replace(
u'\xa0', u' ')
if self.selector.xpath(
"//div[@class='tip'][3]/text()")[0] == u'工作经历':
user.work = self.selector.xpath(
"//div[@class='c'][5]/text()")[0][1:].replace(
u'\xa0', u' ')
elif experienced and experienced[0] == u'工作经历':
user.work = self.selector.xpath(
"//div[@class='c'][4]/text()")[0][1:].replace(
u'\xa0', u' ')
if ':' in str(i) and str(i).split(':', 1)[0] in zh_list:
setattr(user, en_list[zh_list.index(str(i).split(':', 1)[0])],
str(i).split(':', 1)[1].replace('\u3000', ''))

# 提取学习经历和工作经历,使用following-sibling定位,兼容自己和他人页面
tip_divs = self.selector.xpath("//div[@class='tip']")
for tip in tip_divs:
tip_text = tip.xpath('string(.)').strip()
if tip_text == u'学习经历':
edu_div = tip.xpath(
'following-sibling::div[@class="c"][1]')
if edu_div:
# 优先用text()(他人页面),fallback用string(.)(自己页面)
edu_text = edu_div[0].xpath('text()')
if edu_text and len(edu_text[0].strip()) > 1:
user.education = edu_text[0][1:].replace(
u'\xa0', u' ')
else:
user.education = ' '.join(
edu_div[0].xpath('string(.)').split())
elif tip_text == u'工作经历':
work_div = tip.xpath(
'following-sibling::div[@class="c"][1]')
if work_div:
work_text = work_div[0].xpath('text()')
if work_text and len(work_text[0].strip()) > 1:
user.work = work_text[0][1:].replace(
u'\xa0', u' ')
else:
user.work = ' '.join(
work_div[0].xpath('string(.)').split())

return user
except Exception as e:
logger.exception(e)
1 change: 1 addition & 0 deletions weibo_spider/parser/page_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def get_one_page(self, weibo_id_list):
return weibos, weibo_id_list, self.to_continue
except Exception as e:
logger.exception(e)
return [], weibo_id_list, self.to_continue

def is_original(self, info):
"""判断微博是否为原创微博"""
Expand Down
79 changes: 52 additions & 27 deletions weibo_spider/parser/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,37 +12,62 @@
URL_MAP_FILE = 'url_map.json'
logger = logging.getLogger('spider.util')

# Process-wide proxy configuration; spider.py initialises it at startup.
_proxies = None


def set_proxies(proxy_url):
    """Remember *proxy_url* as the proxy for all subsequent HTTP requests.

    A falsy value is ignored, so the default (direct connection) is kept.
    """
    global _proxies
    if not proxy_url:
        return
    _proxies = {'http': proxy_url, 'https': proxy_url}
    logger.info(u'已启用代理: %s', proxy_url)


def get_proxies():
    """Return the requests-style proxies dict, or None when no proxy is set."""
    return _proxies


def hash_url(url):
    """Return the hex SHA-224 digest of *url* (UTF-8 encoded)."""
    digest = hashlib.sha224()
    digest.update(url.encode('utf8'))
    return digest.hexdigest()


# Desktop Chrome User-Agent sent with every request. If Weibo starts rejecting
# it, requests come back with HTTP 432 and the UA must be updated here.
DEFAULT_UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/133.0.0.0 Safari/537.36')


def handle_html(cookie, url):
"""处理html"""
try:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
headers = {'User_Agent': user_agent, 'Cookie': cookie}
resp = requests.get(url, headers=headers)

if GENERATE_TEST_DATA:
import io
import os

resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url))
with io.open(resp_file, 'w', encoding='utf-8') as f:
f.write(resp.text)

with io.open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE), 'r+') as f:
url_map = json.loads(f.read())
url_map[url] = resp_file
f.seek(0)
f.write(json.dumps(url_map, indent=4, ensure_ascii=False))
f.truncate()

selector = etree.HTML(resp.content)
return selector
except Exception as e:
logger.exception(e)
from time import sleep
headers = {'User-Agent': DEFAULT_UA, 'Cookie': cookie}
for attempt in range(5):
try:
resp = requests.get(url, headers=headers, timeout=10,
proxies=_proxies)
if resp.status_code == 200 and len(resp.content) > 0:
selector = etree.HTML(resp.content)
return selector
elif resp.status_code == 403:
wait = 300 * (attempt + 1)
logger.warning(u'403 IP被限制,等待%d秒后重试(第%d次)',
wait, attempt + 1)
sleep(wait)
elif resp.status_code == 432:
logger.error(u'432 User-Agent被拒绝,请更新UA')
return None
else:
wait = 60 * (attempt + 1)
logger.warning(u'请求返回状态码%d,等待%d秒后重试(第%d次)',
resp.status_code, wait, attempt + 1)
sleep(wait)
except Exception as e:
wait = 60 * (attempt + 1)
logger.warning(u'请求异常,等待%d秒后重试(第%d次): %s',
wait, attempt + 1, str(e))
sleep(wait)
logger.error(u'请求%s失败,已重试5次', url)
return None


def handle_garbled(info):
Expand Down Expand Up @@ -95,9 +120,9 @@ def to_video_download_url(cookie, video_page_url):
video_object_url = video_page_url.replace('m.weibo.cn/s/video/show',
'm.weibo.cn/s/video/object')
try:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
headers = {'User_Agent': user_agent, 'Cookie': cookie}
wb_info = requests.get(video_object_url, headers=headers).json()
headers = {'User-Agent': DEFAULT_UA, 'Cookie': cookie}
wb_info = requests.get(video_object_url, headers=headers,
proxies=_proxies).json()
video_url = wb_info['data']['object']['stream'].get('hd_url')
if not video_url:
video_url = wb_info['data']['object']['stream']['url']
Expand Down
5 changes: 5 additions & 0 deletions weibo_spider/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,11 @@ def main(_):
try:
config = _get_config()
config_util.validate_config(config)
# 初始化代理
proxy = config.get('proxy')
if proxy:
from .parser.util import set_proxies
set_proxies(proxy)
wb = Spider(config)
wb.start() # 爬取微博信息
except Exception as e:
Expand Down