Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion weibo_spider/downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ def download_one_file(self, url, file_path, weibo_id):
s = requests.Session()
s.mount(url,
HTTPAdapter(max_retries=self.file_download_timeout[0]))
from ..parser.util import get_proxies
downloaded = s.get(url,
timeout=(self.file_download_timeout[1],
self.file_download_timeout[2]))
self.file_download_timeout[2]),
proxies=get_proxies())
with open(file_path, 'wb') as f:
f.write(downloaded.content)
except Exception as e:
Expand Down
71 changes: 52 additions & 19 deletions weibo_spider/parser/info_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,64 @@ def extract_user_info(self):
sys.exit()
user.nickname = nickname

basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人']
en_list = [
'gender', 'location', 'birthday', 'description',
'verified_reason', 'talent'
]

# 先尝试标准格式(查看他人资料页)
basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
has_info = any(
':' in str(i) and str(i).split(':', 1)[0] in zh_list
for i in basic_info)

if not has_info:
# 自己查看自己的资料页:标签在<a>标签内,值在<a>的tail文本中
basic_info = []
for c_div in self.selector.xpath("//div[@class='c']"):
a_texts = c_div.xpath('a/text()')
if u'性别' in a_texts or u'昵称' in a_texts:
for a in c_div.xpath('a'):
label = (a.text or '').strip()
tail = (a.tail or '').strip()
if label in zh_list and tail.startswith(':'):
basic_info.append(label + tail)
break

for i in basic_info:
if i.split(':', 1)[0] in zh_list:
setattr(user, en_list[zh_list.index(i.split(':', 1)[0])],
i.split(':', 1)[1].replace('\u3000', ''))

experienced = self.selector.xpath("//div[@class='tip'][2]/text()")
if experienced and experienced[0] == u'学习经历':
user.education = self.selector.xpath(
"//div[@class='c'][4]/text()")[0][1:].replace(
u'\xa0', u' ')
if self.selector.xpath(
"//div[@class='tip'][3]/text()")[0] == u'工作经历':
user.work = self.selector.xpath(
"//div[@class='c'][5]/text()")[0][1:].replace(
u'\xa0', u' ')
elif experienced and experienced[0] == u'工作经历':
user.work = self.selector.xpath(
"//div[@class='c'][4]/text()")[0][1:].replace(
u'\xa0', u' ')
if ':' in str(i) and str(i).split(':', 1)[0] in zh_list:
setattr(user, en_list[zh_list.index(str(i).split(':', 1)[0])],
str(i).split(':', 1)[1].replace('\u3000', ''))

# 提取学习经历和工作经历,使用following-sibling定位,兼容自己和他人页面
tip_divs = self.selector.xpath("//div[@class='tip']")
for tip in tip_divs:
tip_text = tip.xpath('string(.)').strip()
if tip_text == u'学习经历':
edu_div = tip.xpath(
'following-sibling::div[@class="c"][1]')
if edu_div:
# 优先用text()(他人页面),fallback用string(.)(自己页面)
edu_text = edu_div[0].xpath('text()')
if edu_text and len(edu_text[0].strip()) > 1:
user.education = edu_text[0][1:].replace(
u'\xa0', u' ')
else:
user.education = ' '.join(
edu_div[0].xpath('string(.)').split())
elif tip_text == u'工作经历':
work_div = tip.xpath(
'following-sibling::div[@class="c"][1]')
if work_div:
work_text = work_div[0].xpath('text()')
if work_text and len(work_text[0].strip()) > 1:
user.work = work_text[0][1:].replace(
u'\xa0', u' ')
else:
user.work = ' '.join(
work_div[0].xpath('string(.)').split())

return user
except Exception as e:
logger.exception(e)
1 change: 1 addition & 0 deletions weibo_spider/parser/page_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def get_one_page(self, weibo_id_list):
return weibos, weibo_id_list, self.to_continue
except Exception as e:
logger.exception(e)
return [], weibo_id_list, self.to_continue

def is_original(self, info):
"""判断微博是否为原创微博"""
Expand Down
79 changes: 52 additions & 27 deletions weibo_spider/parser/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,37 +12,62 @@
URL_MAP_FILE = 'url_map.json'
logger = logging.getLogger('spider.util')

# Process-wide proxy configuration; spider.py initialises it at startup.
_proxies = None


def set_proxies(proxy_url):
    """Remember *proxy_url* as the proxy for all subsequent HTTP requests.

    A falsy value is ignored, so the default (direct connection) is kept.
    """
    global _proxies
    if not proxy_url:
        return
    _proxies = {'http': proxy_url, 'https': proxy_url}
    logger.info(u'已启用代理: %s', proxy_url)


def get_proxies():
    """Return the requests-style proxies dict, or None when no proxy is set."""
    return _proxies


def hash_url(url):
    """Return the hex SHA-224 digest of *url* (UTF-8 encoded)."""
    digest = hashlib.sha224()
    digest.update(url.encode('utf8'))
    return digest.hexdigest()


# Desktop Chrome User-Agent sent with every request. If Weibo starts rejecting
# it, requests come back with HTTP 432 and the UA must be updated here.
DEFAULT_UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/133.0.0.0 Safari/537.36')


def handle_html(cookie, url):
"""处理html"""
try:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
headers = {'User_Agent': user_agent, 'Cookie': cookie}
resp = requests.get(url, headers=headers)

if GENERATE_TEST_DATA:
import io
import os

resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url))
with io.open(resp_file, 'w', encoding='utf-8') as f:
f.write(resp.text)

with io.open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE), 'r+') as f:
url_map = json.loads(f.read())
url_map[url] = resp_file
f.seek(0)
f.write(json.dumps(url_map, indent=4, ensure_ascii=False))
f.truncate()

selector = etree.HTML(resp.content)
return selector
except Exception as e:
logger.exception(e)
from time import sleep
headers = {'User-Agent': DEFAULT_UA, 'Cookie': cookie}
for attempt in range(5):
try:
resp = requests.get(url, headers=headers, timeout=10,
proxies=_proxies)
if resp.status_code == 200 and len(resp.content) > 0:
selector = etree.HTML(resp.content)
return selector
elif resp.status_code == 403:
wait = 300 * (attempt + 1)
logger.warning(u'403 IP被限制,等待%d秒后重试(第%d次)',
wait, attempt + 1)
sleep(wait)
elif resp.status_code == 432:
logger.error(u'432 User-Agent被拒绝,请更新UA')
return None
else:
wait = 60 * (attempt + 1)
logger.warning(u'请求返回状态码%d,等待%d秒后重试(第%d次)',
resp.status_code, wait, attempt + 1)
sleep(wait)
except Exception as e:
wait = 60 * (attempt + 1)
logger.warning(u'请求异常,等待%d秒后重试(第%d次): %s',
wait, attempt + 1, str(e))
sleep(wait)
logger.error(u'请求%s失败,已重试5次', url)
return None


def handle_garbled(info):
Expand Down Expand Up @@ -95,9 +120,9 @@ def to_video_download_url(cookie, video_page_url):
video_object_url = video_page_url.replace('m.weibo.cn/s/video/show',
'm.weibo.cn/s/video/object')
try:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
headers = {'User_Agent': user_agent, 'Cookie': cookie}
wb_info = requests.get(video_object_url, headers=headers).json()
headers = {'User-Agent': DEFAULT_UA, 'Cookie': cookie}
wb_info = requests.get(video_object_url, headers=headers,
proxies=_proxies).json()
video_url = wb_info['data']['object']['stream'].get('hd_url')
if not video_url:
video_url = wb_info['data']['object']['stream']['url']
Expand Down
5 changes: 5 additions & 0 deletions weibo_spider/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,11 @@ def main(_):
try:
config = _get_config()
config_util.validate_config(config)
# 初始化代理
proxy = config.get('proxy')
if proxy:
from .parser.util import set_proxies
set_proxies(proxy)
wb = Spider(config)
wb.start() # 爬取微博信息
except Exception as e:
Expand Down