I found a site with a large number of articles, so I wrote a script to crawl every article name and link and save them to a txt file for easy reference later.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib, urllib2, re, requests
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

domain = ['http://linux.linuxidc.com/']
name_url = []   # level-1 page url/name pairs
name_url2 = []  # level-2 page url/name pairs
name_url3 = []  # level-3 page url/name pairs
name_url4 = []  # level-4 page url/name pairs

def get():
    hd = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}
    url = 'http://linux.linuxidc.com/index.php'
    html = requests.get(url, headers=hd).text
    # compile the regex that matches one folder entry
    url_content = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)', re.S)
    url_contents = re.findall(url_content, html)  # match the level-1 page
    for i in url_contents:
        url_reg = re.compile(r'<a href="(.*?)">')                 # extract the link
        name_reg = re.compile(r'<a href=".*?">(.*?)</a></div>')   # extract the name
        url_items = re.findall(url_reg, i)
        name_items = re.findall(name_reg, i)
        # build the full URL
        url = domain + url_items
        url_items = [''.join(url)]
        for a, v in zip(name_items, url_items):
            name_url.append([a, v])

    for j in name_url:
        # j[0]=name j[1]=url
        if j[1] == 'http://linux.linuxidc.com/index.php?folder=cHVi':
            # skip the pub directory
            continue
        elif j[1] == 'http://linux.linuxidc.com/index.php?folder=MjAxMcTq18rBzw==':
            # skip the 2011 directory
            continue
        else:
            # crawl the remaining directories
            html2 = requests.get(j[1], headers=hd).text
            url_content2 = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)', re.S)
            url_contents2 = re.findall(url_content2, html2)  # match the level-2 page
            for p in url_contents2:
                url_reg2 = re.compile(r'<a href="(.*?)">')                # level-2 link
                name_reg2 = re.compile(r'<a href=".*?">(.*?)</a></div>')  # level-2 name
                url_items2 = re.findall(url_reg2, p)
                name_items2 = re.findall(name_reg2, p)
                # build the full URL
                url2 = domain + url_items2
                url_items2 = [''.join(url2)]
                for m, n in zip(name_items2, url_items2):
                    name_url2.append([m, n])

    for k in name_url2:
        # k[0]=name k[1]=url
        html3 = requests.get(k[1], headers=hd).text
        url_content3 = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)', re.S)
        url_contents3 = re.findall(url_content3, html3)  # match the level-3 page
        for p in url_contents3:
            url_reg3 = re.compile(r'<a href="(.*?)">')                # level-3 link
            name_reg3 = re.compile(r'<a href=".*?">(.*?)</a></div>')  # level-3 name
            url_items3 = re.findall(url_reg3, p)
            name_items3 = re.findall(name_reg3, p)
            # build the full URL
            url3 = domain + url_items3
            url_items3 = [''.join(url3)]
            for m, n in zip(name_items3, url_items3):
                name_url3.append([m, n])

    for l in name_url3:
        # l[0]=name l[1]=url
        html4 = requests.get(l[1], headers=hd).text
        url_content4 = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)', re.S)
        url_contents4 = re.findall(url_content4, html4)  # match the level-4 page
        for p in url_contents4:
            url_reg4 = re.compile(r'<a href="(.*?)">')                # level-4 link
            name_reg4 = re.compile(r'<a href=".*?">(.*?)</a></div>')  # level-4 name
            url_items4 = re.findall(url_reg4, p)
            name_items4 = re.findall(name_reg4, p)
            # build the full URL
            url4 = domain + url_items4
            url_items4 = [''.join(url4)]
            for m, n in zip(name_items4, url_items4):
                name_url4.append([m, n])
                # append "title,link" to the output file
                f = open('get_list.txt', 'a+')
                print "正在保存--%s" % m
                print >> f, "%s,%s" % (m, n)

if __name__ == "__main__":
    get()
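The four levels above repeat the same fetch-parse-append pattern, just with new variable names each time. As a rough sketch only (the names `crawl`, `max_depth`, `SKIP` and the combined regex are mine, and it records every folder it visits rather than only the deepest level, so it is illustrative and not a drop-in replacement), the same crawl could be written once as a recursive helper:

#!/usr/bin/python
# Sketch: collapse the four duplicated levels into one recursive walk.
import re, requests

DOMAIN = 'http://linux.linuxidc.com/'
HD = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}
# one regex with two groups: group 1 = href, group 2 = name
FOLDER_RE = re.compile(r'<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href="(.*?)">(.*?)</a></div>', re.S)
SKIP = ('http://linux.linuxidc.com/index.php?folder=cHVi',
        'http://linux.linuxidc.com/index.php?folder=MjAxMcTq18rBzw==')

def crawl(url, out, depth=0, max_depth=4):
    # stop after the same four levels the original script walks
    if depth >= max_depth:
        return
    html = requests.get(url, headers=HD).text
    for href, name in FOLDER_RE.findall(html):
        link = DOMAIN + href
        if link in SKIP:
            continue
        print "saving: %s" % name
        print >> out, "%s,%s" % (name, link)
        crawl(link, out, depth + 1, max_depth)

if __name__ == "__main__":
    f = open('get_list.txt', 'a+')
    crawl('http://linux.linuxidc.com/index.php', f)
    f.close()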
Execution:
The output file is created in the same directory as the script.
File contents: article title, article link, one pair per line.
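Since each saved line has the form "title,link", the file can be read back later with a simple split. The `load_list` helper below is only an illustration and not part of the original script:

# Sketch: read get_list.txt back into (title, link) pairs.
def load_list(path='get_list.txt'):
    items = []
    for line in open(path):
        line = line.strip()
        if not line:
            continue
        # split on the last comma so commas inside a title do not break the link
        name, _, link = line.rpartition(',')
        items.append((name, link))
    return items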
Error:
requests.exceptions.ConnectionError: HTTPConnectionPool(host='linux.linuxidc.com', port=80): Max retries exceeded with url: /index.php?folder=MjAxN8Tq18rBzy8z1MIvMjXI1Q== (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x0000000002B6D198>: Failed to establish a new connection: [Errno 10060] ',))
Cause: too many HTTP connections are opened and never closed.
Fix: use the requests Session client mode so connections are kept alive and reused.
# define a session
request = requests.Session()
# then replace every requests.get(...) in the script with
html = request.get(url, headers=hd).text
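Beyond reusing connections through a Session, the same Session can also be given automatic retries with back-off, which is one way to mitigate the "Max retries exceeded" failure above. This is a minimal sketch; the retry count, back-off factor, and timeout are arbitrary values I chose:

# Sketch: Session with keep-alive plus automatic connection retries.
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

request = requests.Session()
retries = Retry(total=5, backoff_factor=1)            # retry up to 5 times, waiting longer between attempts
request.mount('http://', HTTPAdapter(max_retries=retries))

hd = {"User-Agent": "Mozilla/5.0"}
url = 'http://linux.linuxidc.com/index.php'
html = request.get(url, headers=hd, timeout=10).text  # timeout keeps a dead host from hanging the crawl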
This article is reposted from the 51CTO blog of M四月天; original link: http://blog.51cto.com/msiyuetian/1929710. Please contact the original author before reprinting.