# -*- coding: UTF-8 -*-
"""My first web crawler (blog post, 2016-10-10) — by Js.

Scrapes the fantasy ("xuanhuan") category of the novel site www.55x.cn.
For every book on the listing page it collects the title, file size,
full summary and the resolved download link, then prints one JSON
record per book.

Ported from Python 2 to Python 3: the ``print`` statement, the
``reload(sys)`` / ``sys.setdefaultencoding`` hack, and the ``encoding=``
keyword of ``json.dumps`` no longer exist in Python 3, so redirected
output is handled by the interpreter's native UTF-8 text I/O instead.
"""
from bs4 import BeautifulSoup
import requests

import json
import re

BASE_URL = "http://www.55x.cn"

# Every page on the site is served as GBK, not UTF-8.
_SITE_ENCODING = 'gbk'

# CSS-selector prefix of the content column, shared by all three page
# types (listing, detail, download).
_COL = 'body > div.d1 > div.down > div.xiazai > '


def _get_soup(url):
    """Fetch *url*, force the site's GBK encoding, and return the parsed soup."""
    resp = requests.get(url)
    resp.encoding = _SITE_ENCODING
    return BeautifulSoup(resp.text, 'lxml')


def _size_to_number(text):
    """Convert a human-readable size string to a float for easy filtering.

    Returns the first decimal number found in *text*, or 0 when none is
    present (same fallback as the original script).
    """
    numbers = re.findall(r"\d+\.\d+", text)
    return float(numbers[0]) if numbers else 0


def _full_record(data):
    """Complete one book record in place and return it.

    Visits the book's detail page to replace the truncated listing
    summary with the full one, then follows the first pagination link to
    the download page and swaps ``data['src']`` for the real download
    URL.  Costs two extra HTTP requests per book.
    """
    detail = _get_soup(data['src'])
    summary = detail.select(_COL + 'div.zhangjie > p')
    downloads = detail.select(_COL + 'div.xiaye > a:nth-of-type(1)')
    if summary:
        data['summary'] = summary[0].get_text()
    # Guard the lookups instead of crashing with IndexError when a page
    # has no download link (the original indexed [0] unconditionally).
    if downloads:
        dl_page = _get_soup(BASE_URL + downloads[0].get('href'))
        links = dl_page.select(
            _COL + 'div.xinxi > div.shuji > ul > li:nth-of-type(1) > a')
        if links:
            data['src'] = BASE_URL + links[0].get('href')
    return data


def main():
    """Crawl the xuanhuan listing page and print one JSON object per book."""
    soup = _get_soup(BASE_URL + "/html/xuanhuan/")
    row = _COL + 'div.xiashu > ul > '
    titles = soup.select(row + 'li.qq_g > a')
    sizes = soup.select(row + 'li.qq_l')
    summaries = soup.select(row + 'li.qq_j')
    srcs = soup.select(row + 'li.qq_g > a')
    for title, size, summary, src in zip(titles, sizes, summaries, srcs):
        data = {
            'title': title.get_text(),
            'size': _size_to_number(size.get_text()),
            'summary': summary.get_text(),
            'src': BASE_URL + src.get('href'),
        }
        print(json.dumps(_full_record(data), ensure_ascii=False))


if __name__ == "__main__":
    main()