# 爬取94神马网的电影信息
# 1. 程序如下:
import requests
from lxml import etree
import json
# Site root; the relative links scraped from listing pages are appended to it
# to build each movie's detail-page URL.
Base_download = "http://www.9rmb.com"

# Browser-like request headers passed to the HTTP requests made by the scraper.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    ),
}
def spider():
    """Crawl listing pages 1-7 and append every parsed movie to ``11.txt``.

    For each listing page the detail-page URLs are collected with
    ``get_detail_urls`` and each one is parsed with ``parse_detail_page``.
    Every parsed movie is printed and appended to ``11.txt`` as one JSON
    document per line.

    Returns:
        The list of movie dicts gathered during the crawl.
    """
    # Listing-page URL template; ``{}`` is replaced by the page number.
    base_url = "http://www.9rmb.com/type/1/{}.html"
    movies = []
    for num in range(1, 8):
        join_url = base_url.format(num)
        detail_urls = get_detail_urls(join_url)
        print(detail_urls)
        for urls in detail_urls:
            movie = parse_detail_page(urls)
            movies.append(movie)
            print(movie)
            print(type(movie))
            # Append (and close) per movie so that everything scraped so far
            # is already on disk if a later request fails.
            with open("11.txt", "a", encoding="utf-8") as f:
                # ensure_ascii=False keeps non-ASCII text readable in the file
                # instead of emitting \uXXXX escapes.
                json.dump(movie, f, ensure_ascii=False)
                f.write("\n")
    return movies
def get_detail_urls(urls):
    """Collect the detail-page URLs of the movies linked from a listing page.

    Args:
        urls: URL of a single listing page.

    Returns:
        A lazy, single-pass iterator of URLs, each built by prefixing
        ``Base_download`` to an ``href`` taken from a
        ``<div class="movie-item">`` link on the page.
    """
    # A timeout keeps one unresponsive server from hanging the crawl forever.
    response = requests.get(url=urls, headers=headers, timeout=10)
    page = etree.HTML(response.text)
    hrefs = page.xpath('//div[@class="movie-item"]/a/@href')
    return (Base_download + href for href in hrefs)
def parse_detail_page(urls):
    """Download one movie detail page and extract its basic information.

    Args:
        urls: URL of a single movie detail page.

    Returns:
        A dict with the keys ``"title"``, ``"actors"``, ``"country"`` and
        ``"evaluate"``. Each value is the first text node matched by the
        corresponding XPath query, or ``None`` when the page has no match.
    """
    # A timeout keeps one unresponsive server from hanging the crawl forever;
    # undecodable bytes are dropped rather than raising.
    raw = requests.get(urls, headers=headers, timeout=10).content
    resp_element = etree.HTML(raw.decode("utf-8", "ignore"))

    def first_text(expression):
        # Return the first XPath match, or None so that one missing field
        # does not abort the whole crawl with an IndexError.
        matches = resp_element.xpath(expression)
        return matches[0] if matches else None

    return {
        "title": first_text('//div[@class="col-md-12"]/h1/text()'),
        "actors": first_text('//td[@id="casts"]/text()'),
        # Positional query (4th row, 2nd cell) - presumably the country row
        # of the info table; verify against the live page layout.
        "country": first_text("//tr[4]/td[2]/text()"),
        "evaluate": first_text('//a[@class="score"]/text()'),
    }
if __name__ == "__main__":
    # Empty the output file BEFORE crawling: spider() appends to 11.txt, so
    # opening it in "w" mode after the crawl would erase every record written.
    with open("11.txt", "w", encoding="utf-8"):
        pass
    spider()
