# 百度敏感词搜索脚本 (Baidu sensitive-keyword search script)
#
# Posted by bfpiaoran on August 12, 2017
import re

import requests
from bs4 import BeautifulSoup
 
 
# Pulls the destination out of a refresh snippet of the form URL='...'.
_REFRESH_URL_RE = re.compile(r"URL='(.*?)'", re.S)

# Redirect statuses whose Location header carries the destination URL.
_REDIRECT_STATUSES = frozenset({301, 302, 303, 307, 308})


def _resolve_link(link, headers, timeout):
    """Return the destination URL behind *link*, or None if it cannot be found."""
    try:
        # Redirects are not followed so the destination can be read from the
        # reply itself instead of downloading the target page.
        page = requests.get(
            link, headers=headers, allow_redirects=False, timeout=timeout
        )
    except requests.RequestException:
        return None

    if page.status_code == 200:
        # A 200 reply is expected to embed the destination as URL='...'.
        match = _REFRESH_URL_RE.search(page.text)
        return match.group(1) if match else None
    if page.status_code in _REDIRECT_STATUSES:
        return page.headers.get('location')
    return None


def get_url(key, pagenum, timeout=10):
    """Search Baidu for *key* and print the destination URL of each result.

    The result page is fetched from ``https://www.baidu.com/s``; every
    ``div`` with class ``f13`` is inspected for its first
    ``<a target="_blank">`` link, and each collected link is then requested
    without following redirects so that its real destination can be printed.

    Args:
        key: Search term, sent as the ``wd`` query parameter.
        pagenum: Value for the ``pn`` query parameter (the caller in this
            file passes multiples of 10, one per result page). A string or
            an integer is accepted.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Resolved URLs and error notices are written to stdout.

    Raises:
        requests.RequestException: If the search page itself cannot be
            fetched. Failures on individual result links are reported on
            stdout and skipped instead.
    """
    head = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}

    # Let requests build the query string so that characters such as '&',
    # '#' or spaces inside the search term are escaped correctly.
    search = requests.get(
        'https://www.baidu.com/s',
        params={'wd': key, 'pn': pagenum},
        headers=head,
        timeout=timeout,
    )
    soup = BeautifulSoup(search.text, "html.parser")

    link = []
    for result in soup.find_all('div', {'class': 'f13'}):
        anchor = result.find('a', {'target': '_blank'})
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            print('>>>有一条出错啦')
        # NOTE(review): only plain-http links are kept, as before; confirm
        # whether https links should be accepted as well.
        elif href.startswith("http://"):
            link.append(href)

    urllink = []
    for candidate in link:
        target = _resolve_link(candidate, head, timeout)
        if target:
            urllink.append(target)
        else:
            print('解析出错啦')

    for resolved in urllink:
        print(resolved)
 
 
# Ask for the search term and for how many result pages to walk through.
key = str(input('请输入要查询的内容'))
page_count = int(input('请输入要查询的页数'))

# Offset of the last requested page; offsets advance in steps of 10.
pagenum = (page_count - 1) * 10

# Query one page per offset: 0, 10, 20, ... up to and including pagenum.
offset = 0
while offset <= pagenum:
    get_url(key=key, pagenum=str(offset))
    offset += 10