背景:利用爬虫,爬取网站页面广告元素,监控爬取元素的数目,定时发送监控邮件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
'''
@xiayun
@896365105@qq.com
Scrape site content with PhantomJS: IP proxy + custom UA + JS execution for dynamic pages.
'''
import re
import smtplib
import sys
import time
import urllib
import urllib2
from email.header import Header
from email.mime.text import MIMEText
from email.utils import formataddr

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def _load_proxies(path):
    """Return the non-empty proxy entries listed one per line in *path*."""
    with open(path, 'r') as proxy_file:
        return [line.strip() for line in proxy_file if line.strip()]


def reptile():
    """Fetch the monitored page through each proxy until the ad count matches.

    Proxies are read from ``./proxy_ip.txt`` (one entry per line).  For each
    proxy a PhantomJS session with a mobile user agent loads the page, the
    occurrences of the ad-element pattern in the page source are counted, and
    the loop stops at the first proxy that yields exactly 22 matches.

    The module-level globals ``result`` and ``data`` are updated as a side
    effect, as in the original implementation.

    Returns:
        A ``(result, data)`` tuple: ``result`` is a status message (used by
        the caller as the mail subject) and ``data`` is the page source of the
        last attempt.  If the proxy pool is empty, the failure message and an
        empty page source are returned.

    Raises:
        IOError: if ``./proxy_ip.txt`` cannot be opened.
    """
    global result, data
    # Reset first so an empty pool can neither raise NameError nor report
    # stale values left over from a previous monitoring cycle.
    result = "The webpage maybe bad"
    data = ""

    # The proxy pool is unreliable (self-scraped or bought); ideally a
    # validation step would filter it before use.
    proxies = _load_proxies("./proxy_ip.txt")
    print(proxies)

    user_agent = ('Mozilla/5.0 (baomihua@iPhone; CPU iPhone OS 9_1 like Mac OS X) '
                  'AppleWebKit/601.1.46 (KHTML, like Gecko) '
                  'Version/9.0 Mobile/13B143 Safari/601.1')
    # NOTE(review): placeholder pattern; under Python 2 a non-ASCII byte
    # pattern will not match unicode page source as written -- confirm when
    # the real pattern is filled in.
    ad_rule = re.compile(r"广告元素")

    for proxy in proxies:
        print(proxy)
        service_args = ['--proxy-type=HTTP', '--proxy=%s' % proxy]
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = user_agent
        # Drive PhantomJS like a real browser, routed through the proxy.
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=service_args)
        try:
            driver.implicitly_wait(60)
            driver.set_page_load_timeout(60)
            try:
                driver.get('网页地址')
            except Exception:
                # Best effort: a slow proxy may still have loaded part of the
                # page, so fall through and inspect whatever was rendered.
                print("timeout")
            # Give the page's JavaScript time to run *before* taking the
            # snapshot; sleeping after reading page_source has no effect.
            time.sleep(20)
            data = driver.page_source
        finally:
            # Always release the PhantomJS process, even on failure.
            driver.quit()

        counts = len(ad_rule.findall(data))
        print(counts)
        # The page is considered healthy when exactly 22 ad elements appear.
        if counts == 22:
            print("The webpage is OK!")
            result = "The webpage is OK!Find 22 广告元素! proxy_IP: %s" % proxy
            break
        print("%s is bad!" % proxy)
        result = "The webpage maybe bad"

    print("close")
    # Return the status message and the page source.
    return result, data
def send_mail(result, data):
    """Mail the monitoring outcome to the configured receivers.

    Args:
        result: status message; used as the mail subject and as the sender's
            display name.
        data: page source, sent as the UTF-8 plain-text body.

    Returns:
        True if the message was handed to the SMTP server, False if
        connecting, logging in or sending failed.
    """
    receivers = ['XXX@XX.com']        # recipients
    mail_host = 'smtp.exmail.qq.com'  # SMTP server of the mail provider
    mail_user = 'xxx@xxx.com'         # sender account
    mail_pass = 'xxxx'                # sender password

    title = str(result)
    msg = MIMEText(data, 'plain', 'utf-8')  # plain-text body
    msg['Subject'] = Header(title, 'utf-8')
    # Encode only the display name; the address itself must stay outside the
    # RFC 2047 encoded word or servers cannot parse the From header.
    msg['From'] = formataddr((Header(title, 'utf-8').encode(), mail_user))
    msg['To'] = ", ".join(receivers)

    server = None
    try:
        server = smtplib.SMTP()
        server.connect(mail_host)
        server.login(mail_user, mail_pass)
        # The envelope sender is the bare address, not the display string.
        server.sendmail(mail_user, receivers, msg.as_string())
        print("发送成功")
        return True
    except (smtplib.SMTPException, IOError):
        # IOError also covers socket errors raised while connecting, so a
        # network failure is reported instead of crashing the caller.
        print("Error: 无法发送邮件")
        return False
    finally:
        if server is not None:
            server.close()
if __name__ == '__main__':
    # Monitoring loop: check the page, mail the outcome, then wait 10 minutes.
    try:
        while True:
            print('start' + ' ' + time.ctime())
            result, data = reptile()
            send_mail(result=result, data=data)
            print('stop' + ' ' + time.ctime())
            time.sleep(600)
    except KeyboardInterrupt:
        # Ctrl-C is the only way out of the loop; fall through to a clean exit.
        pass
    sys.exit(0)
|
网友评论