爬虫入门之线程进程协程抓取方法(八)

简介: 本文以抓取链家杭州二手房数据为例,依次给出多线程、多协程、多进程、多线程加协程、多进程加协程五种抓取方式的完整示例代码。

1 多线程抓取


import lxml
from lxml import etree
import requests
import threading
import time

# Request headers sent with every HTTP call: a desktop Chrome User-Agent.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Re-entrant lock guarding the print + file-append section in gethouseInfo.
rlock = threading.RLock()

def getArea(url):
    '''
    Download the seed page and collect each district with its listing URL.

    :param url: seed URL of the listing index page
    :return: dict mapping district name -> host-prefixed district URL
    '''
    html = requests.get(url, headers=headers).text
    root = lxml.etree.HTML(html)

    # e.g. 西湖 -> https://hz.lianjia.com/ershoufang/xihu/
    districts = {}
    for anchor in root.xpath('//div[@data-role="ershoufang"]/div[1]/a'):
        # Anchor text is the district name; the href is site-relative
        name = anchor.xpath('./text()')[0]
        link = "https://hz.lianjia.com" + anchor.xpath('./@href')[0]
        print(name, link)
        districts[name] = link
    return districts

def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append each house to <areaName>.txt.

    Every house becomes one line holding the str() of the tuple
    (title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice).

    :param areaName: district name, used as the output file stem
    :param url: URL of the listing page to scrape
    :return: None
    '''
    def first(node, expr):
        # First match of expr relative to node; IndexError when nothing matches
        return node.xpath(expr)[0]

    page = lxml.etree.HTML(requests.get(url, headers=headers).text)

    for item in page.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]"):
        # Headline and detail-page link
        headline = first(item, './/div[@class="title"]/a/text()')
        link = first(item, './/div[@class="title"]/a/@href')

        # House description: anchor text followed by the div's own text
        details = first(item, './/div[@class="houseInfo"]/a/text()')
        details = details + first(item, './/div[@class="houseInfo"]/text()')

        # Location: the div's own text followed by the anchor text
        location = first(item, './/div[@class="positionInfo"]/text()')
        location = location + first(item, './/div[@class="positionInfo"]/a/text()')

        # Total price (suffixed with the unit 万) and price per square metre
        total = first(item, './/div[@class="totalPrice"]/span/text()') + '万'
        perUnit = first(item, './/div[@class="unitPrice"]/span/text()')

        record = (headline, details, link, location, total, perUnit)

        # Hold the lock for the print and the append
        with rlock:
            print(areaName)
            with open(areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as out:
                out.write(str(record) + '\n')
                out.flush()

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time. Timing starts after getArea().
    startTime = time.perf_counter()
    print(areaDict)

    # One thread per district
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=gethouseInfo, args=(areaName, url))
        threadList.append(t)
        t.start()

    # Wait until every thread has finished
    for t in threadList:
        t.join()

    # Elapsed seconds for the threaded scrape
    print(time.perf_counter() - startTime)

2 多协程抓取

import gevent
from gevent import monkey
gevent.monkey.patch_all()   #有些需要刚开始进行初始化
import lxml
from lxml import etree
import requests
import threading
import time

# Request headers sent with every HTTP call: a desktop Chrome User-Agent.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Re-entrant lock (created here; the functions below do not acquire it).
rlock = threading.RLock()

def getArea(url):
    '''
    Download the seed page and collect each district with its listing URL.

    :param url: seed URL of the listing index page
    :return: dict mapping district name -> host-prefixed district URL
    '''
    tree = lxml.etree.HTML(requests.get(url, headers=headers).text)
    anchors = tree.xpath('//div[@data-role="ershoufang"]/div[1]/a')

    # name -> link
    result = {}
    for node in anchors:
        # District name comes from the anchor text
        label = node.xpath('./text()')[0]
        # The href is site-relative, so prefix the host
        target = "https://hz.lianjia.com" + node.xpath('./@href')[0]
        print(label, target)
        result[label] = target
    return result

def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append each house to ./hz/<areaName>.txt.

    Every house becomes one line holding the str() of the tuple
    (title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice).

    :param areaName: district name, used as the output file stem
    :param url: URL of the listing page to scrape
    :return: None
    :raises IndexError: if a listing lacks one of the expected fields
    '''
    import os  # local import: the script's top-level imports do not include os

    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")

    # open(..., 'a+') does not create missing parent directories, so make
    # sure ./hz/ exists; exist_ok tolerates another worker creating it first.
    os.makedirs("./hz/", exist_ok=True)
    outputPath = "./hz/" + areaName + '.txt'

    for house in sellList:
        # Headline
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # Detail-page URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House description: anchor text followed by the div's own text
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]
        # Location: the div's own text followed by the anchor text
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]
        # Total price, suffixed with the unit 万
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Price per square metre
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

        with open(outputPath, 'a+', encoding='utf-8', errors='ignore') as f:
            # Leaving the with-block closes (and so flushes) the file
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time. Timing starts after getArea().
    startTime = time.perf_counter()
    print(areaDict)

    # One greenlet per district. gevent.monkey.patch_all() has already been
    # applied at the top of this script, right after gevent is imported.
    geventList = []
    for k, v in areaDict.items():
        g = gevent.spawn(gethouseInfo, k, v)
        geventList.append(g)
    # Wait until every greenlet has finished
    gevent.joinall(geventList)

    # Elapsed seconds for the coroutine-based scrape
    print(time.perf_counter() - startTime)

3 多进程抓取

import lxml
from lxml import etree
import requests

import multiprocessing
import time

# Request headers sent with every HTTP call: a desktop Chrome User-Agent.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

def getArea(url):
    '''
    Download the seed page and collect each district with its listing URL.

    :param url: seed URL of the listing index page
    :return: dict mapping district name -> host-prefixed district URL
    '''
    body = requests.get(url, headers=headers).text
    doc = lxml.etree.HTML(body)

    mapping = {}
    for link in doc.xpath('//div[@data-role="ershoufang"]/div[1]/a'):
        # Anchor text first, then the site-relative href
        name, href = link.xpath('./text()')[0], link.xpath('./@href')[0]
        fullUrl = "https://hz.lianjia.com" + href
        print(name, fullUrl)
        mapping[name] = fullUrl
    return mapping

def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append each house to ./hz/<areaName>.txt.

    Every house becomes one line holding the str() of the tuple
    (title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice).

    :param areaName: district name, used as the output file stem
    :param url: URL of the listing page to scrape
    :return: None
    :raises IndexError: if a listing lacks one of the expected fields
    '''
    import os  # local import: the script's top-level imports do not include os

    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")

    # open(..., 'a+') does not create missing parent directories, so make
    # sure ./hz/ exists; exist_ok tolerates another process creating it first.
    os.makedirs("./hz/", exist_ok=True)
    outputPath = "./hz/" + areaName + '.txt'

    for house in sellList:
        # Headline
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # Detail-page URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House description: anchor text followed by the div's own text
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]
        # Location: the div's own text followed by the anchor text
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]
        # Total price, suffixed with the unit 万
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Price per square metre
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

        with open(outputPath, 'a+', encoding='utf-8', errors='ignore') as f:
            # Leaving the with-block closes (and so flushes) the file
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time. Timing starts after getArea().
    startTime = time.perf_counter()
    print(areaDict)

    # One process per district
    processList = []
    for areaName, url in areaDict.items():
        p = multiprocessing.Process(target=gethouseInfo, args=(areaName, url))
        processList.append(p)
        p.start()

    # Wait until every process has finished
    for p in processList:
        p.join()

    # Elapsed seconds for the multi-process scrape
    print(time.perf_counter() - startTime)

4 多线程加协程

import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json

import lxml
from lxml import etree
import requests
import threading
import time

# Request headers sent with every HTTP call: a desktop Chrome User-Agent.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Re-entrant lock guarding the print + file-append section in gethouseInfo.
rlock = threading.RLock()

# Non-blocking I/O is enabled by gevent.monkey.patch_all() at the top of this script.
def getArea(url):
    '''
    Download the seed page and collect each district with its listing URL.

    :param url: seed URL of the listing index page
    :return: dict mapping district name -> host-prefixed district URL
    '''
    pageSource = requests.get(url, headers=headers).text
    dom = lxml.etree.HTML(pageSource)
    districtLinks = dom.xpath('//div[@data-role="ershoufang"]/div[1]/a')

    found = {}
    for entry in districtLinks:
        # District name is the anchor text
        district = entry.xpath('./text()')[0]
        # Prefix the host onto the site-relative href
        districtUrl = "https://hz.lianjia.com" + entry.xpath('./@href')[0]
        print(district, districtUrl)
        found[district] = districtUrl
    return found

def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append each house to ./hz/<areaName>.txt.

    Every house becomes one line holding the str() of the tuple
    (title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice).
    The print and the file append happen while holding the module-level rlock.

    :param areaName: district name, used as the output file stem
    :param url: URL of the listing page to scrape
    :return: None
    :raises IndexError: if a listing lacks one of the expected fields
    '''
    import os  # local import: the script's top-level imports do not include os

    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")

    # open(..., 'a+') does not create missing parent directories, so make
    # sure ./hz/ exists; exist_ok tolerates another worker creating it first.
    os.makedirs("./hz/", exist_ok=True)
    outputPath = "./hz/" + areaName + '.txt'

    for house in sellList:
        # Headline
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # Detail-page URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House description: anchor text followed by the div's own text
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]
        # Location: the div's own text followed by the anchor text
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]
        # Total price, suffixed with the unit 万
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Price per square metre
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

        with rlock:
            print(areaName)
            with open(outputPath, 'a+', encoding='utf-8', errors='ignore') as f:
                # Leaving the with-block closes (and so flushes) the file
                f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')

def getPageNum(areaName, url):
    '''
    Read a district's total page count and scrape each of its pages.

    One greenlet running gethouseInfo is spawned per page, and the call
    returns once all of them have finished.

    :param areaName: district name, forwarded to gethouseInfo
    :param url: district URL; page URLs are built as url + "pg<N>/"
    :return: None
    '''
    root = lxml.etree.HTML(requests.get(url, headers=headers).text)

    # The pager div carries its state as JSON in the page-data attribute
    rawPageData = root.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    lastPage = int(json.loads(rawPageData)['totalPage'])

    # Pages are numbered from 1 to totalPage inclusive
    jobs = [
        gevent.spawn(gethouseInfo, areaName, url + "pg%d/" % pageNo)
        for pageNo in range(1, lastPage + 1)
    ]
    gevent.joinall(jobs)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time. Timing starts after getArea().
    startTime = time.perf_counter()
    print(areaDict)

    # One thread per district; each thread runs getPageNum for its district
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=getPageNum, args=(areaName, url))
        threadList.append(t)
        t.start()

    # Wait until every thread has finished
    for t in threadList:
        t.join()

    # Elapsed seconds for the thread + coroutine scrape
    print(time.perf_counter() - startTime)

5 多进程加协程


import gevent
from gevent import monkey
gevent.monkey.patch_all()
import json

import lxml
from lxml import etree
import requests
import multiprocessing
import time

# Request headers sent with every HTTP call: a desktop Chrome User-Agent.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Non-blocking I/O is enabled by gevent.monkey.patch_all() at the top of this script.
def getArea(url):
    '''
    Download the seed page and collect each district with its listing URL.

    :param url: seed URL of the listing index page
    :return: dict mapping district name -> host-prefixed district URL
    '''
    markup = requests.get(url, headers=headers).text
    parsed = lxml.etree.HTML(markup)

    urlByArea = {}
    for a in parsed.xpath('//div[@data-role="ershoufang"]/div[1]/a'):
        # District name is the anchor text
        key = a.xpath('./text()')[0]
        # Prefix the host onto the site-relative href
        value = "https://hz.lianjia.com" + a.xpath('./@href')[0]
        print(key, value)
        urlByArea[key] = value
    return urlByArea

def gethouseInfo(areaName, url):
    '''
    Scrape one listing page and append each house to ./hz/<areaName>.txt.

    Every house becomes one line holding the str() of the tuple
    (title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice).
    The district name is printed once per house.

    :param areaName: district name, used as the output file stem
    :param url: URL of the listing page to scrape
    :return: None
    :raises IndexError: if a listing lacks one of the expected fields
    '''
    import os  # local import: the script's top-level imports do not include os

    response = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(response)
    sellList = mytree.xpath("//ul[@class='sellListContent']/li[@class=\"clear\"]")

    # open(..., 'a+') does not create missing parent directories, so make
    # sure ./hz/ exists; exist_ok tolerates another process creating it first.
    os.makedirs("./hz/", exist_ok=True)
    outputPath = "./hz/" + areaName + '.txt'

    for house in sellList:
        # Headline
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # Detail-page URL
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House description: anchor text followed by the div's own text
        houseInfo = house.xpath('.//div[@class="houseInfo"]/a/text()')[0] + \
                    house.xpath('.//div[@class="houseInfo"]/text()')[0]
        # Location: the div's own text followed by the anchor text
        positionInfo = house.xpath('.//div[@class="positionInfo"]/text()')[0] + \
                       house.xpath('.//div[@class="positionInfo"]/a/text()')[0]
        # Total price, suffixed with the unit 万
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Price per square metre
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

        print(areaName)
        with open(outputPath, 'a+', encoding='utf-8', errors='ignore') as f:
            # Leaving the with-block closes (and so flushes) the file
            f.write(str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice)) + '\n')


def getPageNum(areaName, url):
    '''
    Read a district's total page count and scrape each of its pages.

    One greenlet running gethouseInfo is spawned per page, and the call
    returns once all of them have finished.

    :param areaName: district name, forwarded to gethouseInfo
    :param url: district URL; page URLs are built as url + "pg<N>/"
    :return: None
    '''
    html = requests.get(url, headers=headers).text
    tree = lxml.etree.HTML(html)

    # The pager div carries its state as JSON in the page-data attribute
    pager = json.loads(tree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0])
    pageCount = int(pager['totalPage'])

    # Pages are numbered from 1 to totalPage inclusive
    workers = []
    pageNo = 1
    while pageNo <= pageCount:
        workers.append(gevent.spawn(gethouseInfo, areaName, url + "pg%d/" % pageNo))
        pageNo += 1
    gevent.joinall(workers)

if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time. Timing starts after getArea().
    startTime = time.perf_counter()
    print(areaDict)

    # One process per district; each process runs getPageNum for its district
    processList = []
    for areaName, url in areaDict.items():
        p = multiprocessing.Process(target=getPageNum, args=(areaName, url))
        processList.append(p)
        p.start()

    # Wait until every process has finished
    for p in processList:
        p.join()

    # Elapsed seconds for the process + coroutine scrape
    print(time.perf_counter() - startTime)
相关实践学习
基于函数计算快速搭建Hexo博客系统
本场景介绍如何使用阿里云函数计算服务命令行工具快速搭建一个Hexo博客。
相关文章
|
9天前
|
安全 Java 数据处理
Python网络编程基础(Socket编程)多线程/多进程服务器编程
【4月更文挑战第11天】在网络编程中,随着客户端数量的增加,服务器的处理能力成为了一个重要的考量因素。为了处理多个客户端的并发请求,我们通常需要采用多线程或多进程的方式。在本章中,我们将探讨多线程/多进程服务器编程的概念,并通过一个多线程服务器的示例来演示其实现。
|
24天前
|
消息中间件 安全 Linux
线程同步与IPC:单进程多线程环境下的选择与权衡
线程同步与IPC:单进程多线程环境下的选择与权衡
57 0
|
25天前
|
消息中间件 存储 算法
【软件设计师备考 专题 】操作系统的内核(中断控制)、进程、线程概念
【软件设计师备考 专题 】操作系统的内核(中断控制)、进程、线程概念
68 0
|
25天前
|
消息中间件 Linux 调度
【Linux 进程/线程状态 】深入理解Linux C++中的进程/线程状态:阻塞,休眠,僵死
【Linux 进程/线程状态 】深入理解Linux C++中的进程/线程状态:阻塞,休眠,僵死
65 0
|
2天前
|
调度 Python
Python多线程、多进程与协程面试题解析
【4月更文挑战第14天】Python并发编程涉及多线程、多进程和协程。面试中,对这些概念的理解和应用是评估候选人的重要标准。本文介绍了它们的基础知识、常见问题和应对策略。多线程在同一进程中并发执行,多进程通过进程间通信实现并发,协程则使用`asyncio`进行轻量级线程控制。面试常遇到的问题包括并发并行混淆、GIL影响多线程性能、进程间通信不当和协程异步IO理解不清。要掌握并发模型,需明确其适用场景,理解GIL、进程间通信和协程调度机制。
18 0
|
17天前
|
安全 Linux API
Android进程与线程
Android进程与线程
18 0
|
20天前
|
Java 测试技术 Python
Python开启线程和线程池的方法
Python开启线程和线程池的方法
14 0
Python开启线程和线程池的方法
|
24天前
|
存储 算法 Linux
【Linux 系统标准 进程资源】Linux 创建一个最基本的进程所需的资源分析,以及线程资源与之的差异
【Linux 系统标准 进程资源】Linux 创建一个最基本的进程所需的资源分析,以及线程资源与之的差异
25 0
|
24天前
|
数据采集 存储 Rust
Rust高级爬虫:如何利用Rust抓取精美图片
Rust高级爬虫:如何利用Rust抓取精美图片
|
26天前
|
并行计算 Python
Python中的并发编程:多线程与多进程的比较
在Python编程中,实现并发操作是提升程序性能的重要手段之一。本文将探讨Python中的多线程与多进程两种并发编程方式的优劣及适用场景,帮助读者更好地选择合适的方法来提高程序运行效率。

相关实验场景

更多