淘宝信息定向爬取-嵩天老师慕课学习

```python #引用两个库,requests库用于获取网页的源代码,re库用于利用正则表达式对网页源代码中的内容进行查找等功能 import requests import re #获取网页的源代码,在访问淘宝网时,需要对浏览器进行伪装,则要用到user-agent和cookies,最后返回网页源代码 def getHTMLText(url): kv={ User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0} kv1={ Cookies:cna=WePJE9dNsG8CAX1aMoWosyt/; tracknick=%5Cu4E91%5Cu98D8%5Cu67AB%5Cu98DE; tg=0; miid=9098228591430533745; thw=cn; UM_distinctid=16e11648b7530c-059a73ecbd15b9-7373e61-100200-16e11648b7644f; t=a0f8a792c27da51877f74e18adee0bf7; hng=CN%7Czh-CN%7CCNY%7C156; enc=aF%2F9o28pQJ90155ymsN5Jh3qtl9omT%2BxiElFY6rIY7%2Bq%2FX82JxOKmo6HSPik8iyJyIc1BjYkUpjXTRyY4DDX%2Bg%3D%3D; v=0; cookie2=1271eefa423fdc842daae96a46336aa7; _tb_token_=38e03f7379de3; alitrackid=www.taobao.com; _samesite_flag_=true; sgcookie=EJU6pLPr70e6XDekO6SZU; unb=3018381355; uc3=vt3=F8dBxd9jAg5D9bgMhOQ%3D&id2=UNDVegNV%2FUSevg%3D%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&nk2=txBcODbldeo%3D; csg=7882d66b; lgc=%5Cu4E91%5Cu98D8%5Cu67AB%5Cu98DE; cookie17=UNDVegNV%2FUSevg%3D%3D; dnk=%5Cu4E91%5Cu98D8%5Cu67AB%5Cu98DE; skt=330e918f3abd7238; existShop=MTU4NDk1MDkzOQ%3D%3D; uc4=id4=0%40UgclF6ybLCfOwjJcS3ysIYfj4czd&nk4=0%40tWMYrd68JfYbBDG4Mn76tkntdA%3D%3D; _cc_=WqG3DMC9EA%3D%3D; _l_g_=Ug%3D%3D; sg=%E9%A3%9E5e; _nk_=%5Cu4E91%5Cu98D8%5Cu67AB%5Cu98DE; cookie1=BvWwiwQx5MufwJQyDndhKBbnIbi1FupuoNHZefI%2BBkw%3D; lastalitrackid=login.taobao.com; tfstk=cjCRByxZhoqkTBgI7P2DOY7eizEGZOwJxoxZ9y1PwdRTC3cdiI5G_6vniFnJMrC..; mt=ci=9_1; JSESSIONID=F98166E2C7BA7996DE90BB8D35E2F379; uc1=cookie14=UoTUPvncF6SwdA%3D%3D&lng=zh_CN&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&existShop=false&cookie21=VT5L2FSpczFp&tag=8&cookie15=UtASsssmOIJ0bQ%3D%3D&pas=0; isg=BAEBdwzvOyh82lIpcXqd5-wNEE0bLnUgVVXOWWNWl4hnSiEcq36F8C9DKL4MxA1Y; l=dBIF5tdrvfcHBmw_BOCZCVz_CT_T-IRYoukaCBwDi_5KZ6L_Lq_OoPhiNFp6cjWf9zLB4aVvW6J9-etkwQHmndHgcGAN1xDc.} try: r=requests.get(url,headers=kv,cookies=kv1,timeout=30) r.raise_for_status() r.encoding=r.apparent_encoding return r.text except: print("1?") 
# Parse the price and the name of every product out of the page source and
# store them in a list.
def parsePage(ilt, html):
    """Append ``[price, title]`` pairs found in *html* to *ilt*.

    The data of interest sits in the ``"view_price":"<price>"`` and
    ``"raw_title":"<name>"`` key/value pairs embedded in the page source.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product;
            both values are strings.
        html: Page source to scan. ``None`` or an empty string adds nothing.

    Returns:
        None. Results are appended to *ilt* in place.
    """
    # Local import so this block does not depend on the file's import header.
    import json

    if not html:
        # Nothing was downloaded, so there is nothing to parse.
        return
    try:
        # Capture groups return just the value between the quotes, so no
        # splitting on ":" is needed (titles may themselves contain colons).
        prices = re.findall(r'"view_price":"([\d.]*)"', html)
        # Allow backslash escapes (e.g. \" or \uXXXX) inside the title.
        titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)
    except TypeError:
        # html was not text that the patterns can be applied to.
        print("2?")
        return
    # zip() pairs prices with titles and stops at the shorter list, so a
    # mismatch in counts cannot raise part-way through.
    for price, raw_title in zip(prices, titles):
        try:
            # Decode the quoted value as a JSON string. This replaces eval(),
            # which must not be run on text downloaded from the network.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as found.
            title = raw_title
        ilt.append([price, title])


# Print the collected product prices and names.
def printGoodList(ilt):
    """Print *ilt* as a numbered table with price and product-name columns.

    Args:
        ilt: Sequence of entries whose first item is the price and whose
            second item is the product name.
    """
    row_format = "{:4} {:8} {:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    # Rows are numbered starting from 1.
    for index, goods in enumerate(ilt, start=1):
        print(row_format.format(index, goods[0], goods[1]))


def main():
    """Search Taobao for a keyword, parse the result pages and print them."""
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import quote

    goods = "书包"
    depth = 2  # number of result pages to fetch
    # The "s" query parameter advances by 44 for each page; presumably that
    # is the number of products per result page.
    page_step = 44
    # Percent-encode the keyword so the non-ASCII text is a valid URL part.
    start_url = "https://s.taobao.com/search?q=" + quote(goods)
    infoList = []
    for i in range(depth):
        try:
            url = start_url + "&s=" + str(page_step * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Best effort: report the failed page and carry on with the next.
            print("3?", exc)
            continue
    printGoodList(infoList)


if __name__ == "__main__":
    main()