Weibo Data Crawler: Fetching Basic User Information (Part 3)

First, obtain the user's page_id.

It can be pulled from the profile page with a regular expression:

add = urllib.request.Request(url="https://weibo.com/u/%s?is_all=1" % o_id, headers=headers)
r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
page_id = re.findall(r"\$CONFIG\['page_id'\]='(\d+)'", r)[0]
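The snippet assumes headers carries a logged-in Weibo Cookie; without one, weibo.com redirects to a login page and the regex finds nothing. The config module is only referenced, never shown, so here is a minimal hypothetical sketch of what config.get_headers() might return (the Cookie value is a placeholder you must copy from your own browser session):

# config.py -- hypothetical sketch, not the author's actual module.
def get_headers():
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        # Placeholder: paste the Cookie header from a logged-in session here.
        "Cookie": "SUB=...; SUBP=...",
    }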

Then match the basic information on the user's info page:

add = urllib.request.Request(url="https://weibo.com/p/%s/info" % page_id, headers=headers)
r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
nums = re.findall(r'<strong class="W_f.*?">(\d*)</strong>', r)
regist_time = re.findall(r'注册时间:.*?<span class="pt_detail">(.*?)</span>', r)[0]
regist_time = regist_time.replace(" ", "").replace("\r\n", "")
dic["follow_num"] = nums[0]   # following count
dic["fun_num"] = nums[1]      # follower (fan) count
dic["post_num"] = nums[2]     # post count
dic["regist_time"] = regist_time

The complete code:

import json
import re
import urllib.request

import config  # local module providing get_headers()


def get_user_action(o_id):
    """Fetch follow/fan/post counts and registration time for a Weibo user."""
    dic = {}
    headers = config.get_headers()
    # Step 1: load the user's home page and extract the page_id.
    add = urllib.request.Request(url="https://weibo.com/u/%s?is_all=1" % o_id, headers=headers)
    r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
    page_id = re.findall(r"\$CONFIG\['page_id'\]='(\d+)'", r)[0]
    # Step 2: load the info page and scrape the counters.
    add = urllib.request.Request(url="https://weibo.com/p/%s/info" % page_id, headers=headers)
    r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
    print("https://weibo.com/p/%s/info" % page_id)
    # print(r)  # debug: dump the raw page to inspect the markup
    nums = re.findall(r'<strong class="W_f.*?">(\d*)</strong>', r)
    regist_time = re.findall(r'注册时间:.*?<span class="pt_detail">(.*?)</span>', r)[0]
    regist_time = regist_time.replace(" ", "").replace("\r\n", "")
    dic["follow_num"] = nums[0]   # following count
    dic["fun_num"] = nums[1]      # follower (fan) count
    dic["post_num"] = nums[2]     # post count
    dic["regist_time"] = regist_time
    return dic

if __name__ == "__main__":
    dic = get_user_action("1906123125")
    with open("data/data_userinfo.json", "w") as json_f:
        json.dump(dic, json_f, indent=4)
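After a run, data/data_userinfo.json holds the four scraped fields. The values below are illustrative placeholders, not real data for that uid; note that the counters are stored as strings because they come straight out of the regex matches:

{
    "follow_num": "123",
    "fun_num": "456",
    "post_num": "789",
    "regist_time": "2011-01-01"
}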