微博数据爬虫——获取用户基本信息(三)
首先获取page_id
使用正则匹配获取
# Fetch the user's profile page and pull the numeric page_id out of the
# inline JS config.  NOTE(review): the regex below is reconstructed from a
# quote-stripped paste; the Weibo page embeds $CONFIG['page_id']='<digits>' —
# confirm the exact quoting against a live response.
add = urllib.request.Request(
    url="https://weibo.com/u/%s?is_all=1" % (o_id),
    headers=headers,
)
r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
page_id = re.findall(r"\$CONFIG\['page_id'\]='(\d+)'", r)[0]
而后通过匹配获取基本信息
# Fetch the /p/<page_id>/info page and scrape three <strong class="W_f...">
# counters (follows, fans, posts) plus the registration time.
# NOTE(review): regexes reconstructed from a quote-stripped paste — verify
# the HTML class names/markup against a live response.
add = urllib.request.Request(
    url="https://weibo.com/p/%s/info" % (page_id),
    headers=headers,
)
r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
nums = re.findall(r'<strong class="W_f.*?">(\d*)</strong>', r)
regist_time = re.findall(r'注册时间:.*?<span class="pt_detail">(.*?)</span>', r)[0]
# Strip layout whitespace that surrounds the date inside the span.
regist_time = regist_time.replace(" ", "").replace("\r\n", "")
dic["follow_num"] = nums[0]
dic["fun_num"] = nums[1]
dic["post_num"] = nums[2]
dic["regist_time"] = regist_time
最终代码如下:
import json
import re
from urllib import request
import urllib

import config


def get_user_action(o_id):
    """Scrape basic profile info for the Weibo user with id *o_id*.

    Two requests are made: the /u/<o_id> page to discover the internal
    page_id, then /p/<page_id>/info to scrape the counters.

    Returns a dict with keys "follow_num", "fun_num", "post_num" and
    "regist_time" (all strings scraped from the page HTML).
    Raises IndexError if the expected markup is not found (e.g. the
    request was redirected to a login page).
    """
    dic = {}
    headers = config.get_headers()

    # Step 1: the numeric page_id is embedded in the page's inline JS
    # config as $CONFIG['page_id']='<digits>'.
    # NOTE(review): regexes in this function are reconstructed from a
    # quote-stripped paste — confirm against a live response.
    add = urllib.request.Request(
        url="https://weibo.com/u/%s?is_all=1" % (o_id),
        headers=headers,
    )
    r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
    page_id = re.findall(r"\$CONFIG\['page_id'\]='(\d+)'", r)[0]

    # Step 2: scrape the info page — three <strong class="W_f..."> counters
    # (follows, fans, posts) and the registration time span.
    add = urllib.request.Request(
        url="https://weibo.com/p/%s/info" % (page_id),
        headers=headers,
    )
    r = urllib.request.urlopen(url=add, timeout=20).read().decode("utf-8")
    print("https://weibo.com/p/%s/info" % (page_id))
    print(r)
    nums = re.findall(r'<strong class="W_f.*?">(\d*)</strong>', r)
    regist_time = re.findall(
        r'注册时间:.*?<span class="pt_detail">(.*?)</span>', r)[0]
    # Strip layout whitespace surrounding the date inside the span.
    regist_time = regist_time.replace(" ", "").replace("\r\n", "")

    dic["follow_num"] = nums[0]
    dic["fun_num"] = nums[1]
    dic["post_num"] = nums[2]
    dic["regist_time"] = regist_time
    return dic


if __name__ == "__main__":
    dic = get_user_action("1906123125")
    # Use a context manager so the file is closed even on error;
    # ensure_ascii=False keeps any Chinese text readable in the output.
    with open("data/data_userinfo.json", "w", encoding="utf-8") as json_f:
        json.dump(dic, json_f, indent=4, ensure_ascii=False)