My First Web Crawler

I'm quite fond of reading novels, so I used a novel site to get some practice…

Scraping a novel site

#-*- coding: UTF-8 -*-
# My first crawler
# Scrapes novels from www.55x.cn
# By Js


from bs4 import BeautifulSoup
import requests
# json lets us print the dict with Chinese characters intact
import json
import re
# Work around encoding errors when redirecting output to a file (Python 2)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Fetch the fantasy (xuanhuan) index page of www.55x.cn
url = "http://www.55x.cn/html/xuanhuan/"
wb_data = requests.get(url)
wb_data.encoding = 'gbk'
soup = BeautifulSoup(wb_data.text, 'lxml')
titles = soup.select('body > div.d1 > div.down > div.xiazai > div.xiashu > ul > li.qq_g > a')
sizes = soup.select('body > div.d1 > div.down > div.xiazai > div.xiashu > ul > li.qq_l')
summaries = soup.select('body > div.d1 > div.down > div.xiazai > div.xiashu > ul > li.qq_j')
srcs = soup.select('body > div.d1 > div.down > div.xiazai > div.xiashu > ul > li.qq_g > a')


for title, size, summary, src in zip(titles, sizes, summaries, srcs):
    # Collect each record into a dict
    data = {
        'title': title.get_text(),
        'size': size.get_text(),
        'summary': summary.get_text(),
        'src': "http://www.55x.cn" + src.get('href'),
    }

    # Convert the file size to a number to make filtering easier
    size1 = re.findall(r"\d+\.\d+", data['size'])
    if len(size1) == 0:
        data['size'] = 0
    else:
        data['size'] = float(size1[0])

    # Follow each record to its detail page for the full summary
    url = data['src']
    wb_data = requests.get(url)
    wb_data.encoding = 'gbk'
    soup = BeautifulSoup(wb_data.text, 'lxml')
    summary1 = soup.select('body > div.d1 > div.down > div.xiazai > div.zhangjie > p')
    downloads = soup.select('body > div.d1 > div.down > div.xiazai > div.xiaye > a:nth-of-type(1)')
    if len(summary1) != 0:
        data['summary'] = summary1[0].get_text()

    # Follow the link to the download page
    if len(downloads) != 0:
        url = "http://www.55x.cn" + downloads[0].get('href')
        wb_data = requests.get(url)
        wb_data.encoding = 'gbk'
        soup = BeautifulSoup(wb_data.text, 'lxml')
        download = soup.select('body > div.d1 > div.down > div.xiazai > div.xinxi > div.shuji > ul > li:nth-of-type(1) > a')
        # Store the final download link in the dict
        if len(download) != 0:
            data['src'] = "http://www.55x.cn" + download[0].get('href')

    print json.dumps(data, ensure_ascii=False, encoding='UTF-8')
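
Since the crawler converts each file size to a float precisely so the results can be filtered, here is a minimal sketch of how the output might be consumed afterwards. It assumes the script's output was redirected to a file, e.g. python spider.py > novels.json (one JSON object per line); the filename, the 1.0 threshold, and the assumption that the site lists sizes in MB are all my own choices for illustration, not something the site guarantees.

# -*- coding: UTF-8 -*-
# Sketch: filter the crawled records by size.
# Assumes one JSON object per line in novels.json,
# and that 'size' is in MB (both illustrative assumptions).
import json

with open('novels.json') as f:
    for line in f:
        data = json.loads(line)
        # 'size' was already converted to a float by the crawler (0 when unknown)
        if data['size'] > 1.0:
            print data['title'], data['src']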