import re

import requests
from bs4 import BeautifulSoup
def get_url(key,pagenum):
url = 'https://www.baidu.com/s?wd='+key+'&pn='+pagenum
head = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
req = requests.get(url,headers=head).text
#print(req)
response = BeautifulSoup(req,"html.parser")
#print(response)
resq =response.findAll('div',{'class':'f13'})
link = []
urllink = []
for i in resq:
try:
if i.find('a',{'target':'_blank'})['href'].startswith("http://"):
link.append(i.find('a',{'target':'_blank'})['href'])
except:
print('>>>有一条出错啦')
for i in link:
page = requests.get(i,headers=head,allow_redirects=False)
if page.status_code == 200:
u = re.search(r'URL=\'(.*?)\'', tmpPage.text.encode('utf-8'), re.S)
urllink.append(u.group(1))
elif page.status_code == 302:
u = page.headers.get('location')
urllink.append(u)
else:
print('解析出错啦')
for i in urllink:
print(i)
key = str(input('请输入要查询的内容'))
pagenum = int(input('请输入要查询的页数'))
pagenum -= 1
pagenum = pagenum*10
for i in range(0,pagenum+1,10):
i = str(i)
get_url(key=key,pagenum=i)