I. Crawling Movie Information
http://www.imdb.cn/nowplaying/{num}  # URL pattern of the listing pages
http://www.imdb.cn/title/tt{num}     # URL of a single movie's detail page
First, grab each movie's url and title.
Create the project:
scrapy startproject imdb
Edit items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field  # Item/Field were used below but never imported


class ImdbItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # url = scrapy.Field()    # url
    # title = scrapy.Field()  # film title
    video_title = Field()     # page title
    video_rating = Field()    # rating score
    video_name = Field()      # film name (片名)
    video_alias = Field()     # alias (别名)
    video_director = Field()  # director (导演)
    video_actor = Field()     # leading actors (主演)
    video_length = Field()    # runtime
    video_language = Field()  # language
    video_year = Field()      # release year (上映时间)
    video_type = Field()      # genre
    video_color = Field()     # color / black-and-white
    video_area = Field()      # country/region (国家)
    video_voice = Field()     # audio language
    video_summary = Field()   # plot summary
    video_url = Field()       # detail-page url
Create a spider file movie.py under the spiders directory:
# -*- coding: utf-8 -*-
from scrapy import Request  # Request lives in the top-level scrapy package
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor  # note: linkextractors, plural
from imdb.items import ImdbItem


class ImdbSpider(CrawlSpider):
    name = 'imdb'
    allowed_domains = ['www.imdb.cn']
    rules = (
        # follow every /title/tt<digits> link and parse it as a movie page
        Rule(LinkExtractor(allow=r"/title/tt\d+$"), callback="parse_imdb", follow=True),
    )

    def start_requests(self):
        # seed the crawl with "now playing" listing pages 1-19
        for i in range(1, 20):
            url = "http://www.imdb.cn/nowplaying/" + str(i)
            yield Request(url=url, callback=self.parse)

    def parse_imdb(self, response):
        item = ImdbItem()
        try:
            item['video_title'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="hdd"]/h3/text()').extract())
            item['video_rating'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="hdd"]/span/i/text()').extract())
            # the <li> entries of the info box vary per film, so match by label text
            content = response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li').extract()
            for i in range(0, len(content)):
                if "片名" in content[i]:  # film name
                    if i == 0:
                        item['video_name'] = "".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[1]/a/text()').extract())
                if "别名" in content[i]:  # alias
                    if i == 1:
                        item['video_alias'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()').extract())
                if "导演" in content[i]:  # director
                    if i == 1:
                        item['video_director'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()').extract())
                    elif i == 2:
                        item['video_director'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()').extract())
                if "主演" in content[i]:  # leading actors
                    if i == 2:
                        item['video_actor'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()').extract())
                    if i == 3:
                        item['video_actor'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[4]/a/text()').extract())
                if "上映时间" in content[i]:  # release year / genres / color
                    if i == 4:
                        item['video_year'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a[1]/text()').extract())
                        a = response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a').extract()
                        length = len(a) - 1  # the last <a> is the color entry
                        try:
                            item['video_color'] = "".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()').extract()[length])
                        except Exception:
                            item['video_color'] = ""
                        try:
                            # the entries between the year and the color are genres
                            genre_text = "|".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()').extract()[1:length])
                            parts = genre_text.split(":")  # drop a trailing "label:" if present
                            if len(parts) > 0:
                                item['video_type'] = parts[0]
                            else:
                                item['video_type'] = ""
                        except Exception:
                            item['video_type'] = ""
                    if i == 5:
                        item['video_year'] = "".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()').extract())
                        a = response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a').extract()
                        length = len(a) - 1
                        try:
                            item['video_color'] = "".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()').extract()[length])
                        except Exception:
                            item['video_color'] = ""
                        try:
                            genre_text = "|".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()').extract()[1:length])
                            parts = genre_text.split(":")
                            if len(parts) > 0:
                                item['video_type'] = parts[0]
                            else:
                                item['video_type'] = ""
                        except Exception:
                            item['video_type'] = ""
                if "国家" in content[i]:  # country/region and audio language
                    if i == 5:
                        item['video_area'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()').extract())
                        item['video_voice'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[2]/text()').extract())
                    if i == 6:
                        item['video_area'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[1]/text()').extract())
                        item['video_voice'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[2]/text()').extract())
            item['video_length'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/text()').extract()).replace(" ", "")
            item['video_language'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/a/text()').extract())
            item['video_summary'] = "".join(response.xpath(
                '//*[@class="fk-4 clear"]/div[@class="bdd clear"]/i/text()').extract()).strip().replace("<br>", "")
            item['video_url'] = response.url
            yield item
        except Exception as error:
            # `log` was never imported here; use the spider's built-in logger
            self.logger.error(error)
Create a launcher file run.py under the spiders directory:
vim run.py
# coding:utf-8
from scrapy import cmdline
cmdline.execute("scrapy crawl imdb".split())
II. Depth-Limited Crawling
Create the project:
scrapy startproject douban
In Scrapy, crawl depth is capped with the DEPTH_LIMIT setting in settings.py, e.g. DEPTH_LIMIT = 5. Depth is counted relative to the initial request URL: responses from the start URLs are depth 0, and requests that would exceed the limit are dropped.
Edit settings.py:
DEPTH_LIMIT = 4
# Douban has anti-scraping measures, so add a download delay
DOWNLOAD_DELAY = 2
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36' # pretend to be a regular browser
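
To see the depth Scrapy assigns to each response, read response.meta['depth'] in a callback. A minimal sketch (the spider name and start URL are illustrative, not part of the project above; response.follow needs Scrapy 1.4+):

# Minimal sketch: log the crawl depth Scrapy tracks for every response.
# start_urls responses have depth 0; DEPTH_LIMIT makes DepthMiddleware
# silently drop any request whose depth would exceed the limit.
import scrapy

class DepthDemoSpider(scrapy.Spider):
    name = 'depth_demo'  # hypothetical spider, for illustration only
    start_urls = ['https://movie.douban.com/tag/']

    def parse(self, response):
        depth = response.meta.get('depth', 0)
        self.logger.info('depth=%s url=%s', depth, response.url)
        # follow every link; DepthMiddleware enforces DEPTH_LIMIT for us
        for href in response.xpath('//a/@href').extract():
            yield response.follow(href, callback=self.parse)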
items.py
from scrapy import Item, Field


# music
class MusicItem(Item):
    music_name = Field()
    music_alias = Field()
    music_singer = Field()
    music_time = Field()
    music_rating = Field()
    music_votes = Field()
    music_tags = Field()
    music_url = Field()


# music review
class MusicReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_music = Field()
    review_time = Field()
    review_url = Field()
The spider file music.py:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from douban.items import MusicItem, MusicReviewItem


class ReviewSpider(CrawlSpider):
    name = 'review'
    allowed_domains = ['music.douban.com']
    start_urls = ['https://music.douban.com/subject/1406522/']
    rules = (
        # paginate through the review listings...
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time\&start=\d+$")),
        # ...and parse each individual review page
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_review(self, response):
        try:
            item = MusicReviewItem()
            item['review_title'] = "".join(response.xpath(
                '//*[@property="v:summary"]/text()').extract())
            content = "".join(response.xpath(
                '//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace('\n', " ")
            item['review_author'] = "".join(response.xpath(
                '//*[@property="v:reviewer"]/text()').extract())
            item['review_music'] = "".join(response.xpath(
                '//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(response.xpath(
                '//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            # scrapy's old `log` module is not callable; use the spider's logger
            self.logger.error(error)
The launcher file run.py:
# -*- coding: utf-8 -*-
from scrapy import cmdline

cmdline.execute("scrapy crawl review -o review.json".split())
The -o flag exports the results to review.json.
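
If you'd rather not pass -o on every run, the export can also be configured in settings.py. A sketch using the FEEDS setting (available in Scrapy 2.1+; older releases use FEED_FORMAT/FEED_URI instead):

# settings.py -- equivalent of `-o review.json`, kept with the project config
FEEDS = {
    'review.json': {
        'format': 'json',
        'encoding': 'utf8',  # keep Chinese text readable in the output file
        'overwrite': True,   # replace the file on each run instead of appending
    },
}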
Combining Multiple Spiders
Now suppose we need to crawl music details and music reviews, and movie details and movie reviews. Does every requirement need its own project? That approach would mean four projects (music, music reviews, movies, movie reviews), with lots of duplicated code and spiders that are hard to maintain. Instead, we can keep several spiders in one project.
Create the project:
scrapy startproject multi
Edit settings.py:
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
DOWNLOAD_DELAY = 2
Edit items.py:
from scrapy import Item, Field


# music
class MusicItem(Item):
    music_name = Field()
    music_alias = Field()
    music_singer = Field()
    music_time = Field()
    music_rating = Field()
    music_votes = Field()
    music_tags = Field()
    music_url = Field()


# music review
class MusicReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_music = Field()
    review_time = Field()
    review_url = Field()


# movie
class VideoItem(Item):
    video_name = Field()
    video_alias = Field()
    video_actor = Field()
    video_year = Field()
    video_time = Field()
    video_rating = Field()
    video_votes = Field()
    video_tags = Field()
    video_url = Field()
    video_director = Field()
    video_type = Field()
    video_bigtype = Field()
    video_area = Field()
    video_language = Field()
    video_length = Field()
    video_writer = Field()
    video_desc = Field()
    video_episodes = Field()


# movie review
class VideoReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_video = Field()
    review_time = Field()
    review_url = Field()
Create two spider files under the spiders directory.
videospider.py
# -*- coding: utf-8 -*-
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from multi.items import VideoItem, VideoReviewItem

# the div#info block is flat HTML, so these fields are pulled out with regexes
AREA = re.compile(r"制片国家/地区:</span> (.+?)<br>")    # country/region
ALIAS = re.compile(r"又名:</span> (.+?)<br>")            # alias
LANGUAGE = re.compile(r"语言:</span> (.+?)<br>")         # language
EPISODES = re.compile(r"集数:</span> (.+?)<br>")         # episode count
LENGTH = re.compile(r"单集片长:</span> (.+?)<br>")       # per-episode runtime


class VideoSpider(CrawlSpider):
    name = 'video'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/tag/',
                  'https://movie.douban.com/tag/?view=cloud']
    rules = (
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))$")),
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))\?start=\d+\&type=T$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?start=\d+$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/$"), callback="parse_video", follow=True),
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_video(self, response):
        item = VideoItem()
        try:
            item["video_url"] = response.url
            item["video_name"] = ''.join(response.xpath(
                '//*[@id="content"]/h1/span[@property="v:itemreviewed"]/text()').extract())
            try:
                item["video_year"] = ''.join(response.xpath(
                    '//*[@id="content"]/h1/span[@class="year"]/text()')
                    .extract()).replace("(", "").replace(")", "")
            except Exception as e:
                print('Exception:', e)
                item['video_year'] = ''
            introduction = response.xpath(
                '//*[@id="link-report"]/span[@property="v:summary"]/text()').extract()
            if introduction:
                item["video_desc"] = ''.join(introduction).strip().replace("\r\n", " ")
            else:
                item["video_desc"] = ''.join(response.xpath(
                    '//*[@id="link-report"]/span/text()').extract()).strip().replace("\r\n", " ")
            item["video_director"] = "|".join(response.xpath(
                '//*[@id="info"]/span/span/a[@rel="v:directedBy"]/text()').extract())
            item["video_writer"] = "|".join(response.xpath(
                '//*[@id="info"]/span[2]/span[2]/a/text()').extract())
            item["video_actor"] = "|".join(response.xpath(
                "//a[@rel='v:starring']/text()").extract())
            item["video_type"] = "|".join(response.xpath(
                '//*[@id="info"]/span[@property="v:genre"]/text()').extract())

            # run the label regexes over the raw #info HTML (extracted once)
            video_info = "".join(response.xpath("//div[@id='info']").extract())
            M = AREA.search(video_info)
            if M is not None:
                item["video_area"] = "|".join(
                    [area.strip() for area in M.group(1).split("/")])
            else:
                item['video_area'] = ''
            AL = ALIAS.search(video_info)
            if AL is not None:
                item["video_alias"] = "|".join(
                    [alias.strip() for alias in AL.group(1).split("/")])
            else:
                item["video_alias"] = ""
            language = LANGUAGE.search(video_info)
            episodes = EPISODES.search(video_info)
            length = LENGTH.search(video_info)
            if language is not None:
                item["video_language"] = "|".join(
                    [lang.strip() for lang in language.group(1).split("/")])
            else:
                item['video_language'] = ''
            if length is not None:
                item["video_length"] = "|".join(
                    [runtime.strip() for runtime in length.group(1).split("/")])
            else:
                item["video_length"] = "".join(response.xpath(
                    '//*[@id="info"]/span[@property="v:runtime"]/text()').extract())
            item['video_time'] = "/".join(response.xpath(
                '//*[@id="info"]/span[@property="v:initialReleaseDate"]/text()').extract())
            if episodes is not None:
                item['video_bigtype'] = "电视剧"  # has an episode count: TV series
                item["video_episodes"] = "|".join(
                    [ep.strip() for ep in episodes.group(1).split("/")])
            else:
                item['video_bigtype'] = "电影"  # otherwise: film
                item['video_episodes'] = ''
            item['video_tags'] = "|".join(response.xpath(
                '//*[@class="tags"]/div[@class="tags-body"]/a/text()').extract())
            try:
                item['video_rating'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/strong/text()').extract())
                item['video_votes'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/div/div[@class="rating_sum"]/a/span/text()').extract())
            except Exception as error:
                item['video_rating'] = '0'
                item['video_votes'] = '0'
                self.logger.error(error)
            yield item
        except Exception as error:
            self.logger.error(error)

    def parse_review(self, response):
        try:
            item = VideoReviewItem()
            item['review_title'] = "".join(response.xpath(
                '//*[@property="v:summary"]/text()').extract())
            content = "".join(response.xpath(
                '//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace("\n", " ")
            item['review_author'] = "".join(response.xpath(
                '//*[@property="v:reviewer"]/text()').extract())
            item['review_video'] = "".join(response.xpath(
                '//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(response.xpath(
                '//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)
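
The div#info block on a Douban subject page is flat HTML (labels and values separated by <br>), which is why VideoSpider falls back to the regexes defined at the top instead of XPath. A standalone sketch with a made-up HTML snippet shows the idea:

import re

# Hypothetical excerpt of the raw div#info HTML returned by a subject page.
info_html = '制片国家/地区:</span> 美国 / 英国<br>语言:</span> 英语<br>'

AREA = re.compile(r"制片国家/地区:</span> (.+?)<br>")
match = AREA.search(info_html)
if match is not None:
    # Split the slash-separated list and trim whitespace, as parse_video does.
    print("|".join(part.strip() for part in match.group(1).split("/")))
    # prints: 美国|英国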
Create musicspider.py:
# -*- coding: utf-8 -*-
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from multi.items import MusicItem, MusicReviewItem


class MusicSpider(CrawlSpider):
    name = "music"
    allowed_domains = ['music.douban.com']
    start_urls = ['https://music.douban.com/tag/',
                  'https://music.douban.com/tag/?view=cloud']
    rules = (
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))$")),
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))\?start=\d+\&type=T$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time\&start=\d+$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/$"), callback="parse_music", follow=True),
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_music(self, response):
        item = MusicItem()
        try:
            item['music_name'] = response.xpath(
                '//*[@id="wrapper"]/h1/span/text()').extract()[0]
            content = "".join(response.xpath('//*[@id="info"]').extract())
            info = response.xpath('//*[@id="info"]/span').extract()
            item['music_alias'] = ""
            item['music_singer'] = ""
            item['music_time'] = ""
            # the labelled <span>s appear in varying order, so match by label text
            for i in range(0, len(info)):
                if "又名" in info[i]:  # alias
                    if i == 0:
                        item['music_alias'] = response.xpath(
                            '//*[@id="info"]/text()').extract()[1].replace(
                            "\xa0", "").replace("\n", "").rstrip()
                    elif i == 1:
                        item['music_alias'] = response.xpath(
                            '//*[@id="info"]/text()').extract()[2].replace(
                            "\xa0", "").replace("\n", "").rstrip()
                    elif i == 2:
                        item['music_alias'] = response.xpath(
                            '//*[@id="info"]/text()').extract()[3].replace(
                            "\xa0", "").replace("\n", "").rstrip()
                    else:
                        item['music_alias'] = ""
                if "表演者" in info[i]:  # performer
                    if i == 0:
                        item['music_singer'] = "|".join(response.xpath(
                            '//*[@id="info"]/span[1]/span/a/text()').extract())
                    elif i == 1:
                        item['music_singer'] = "|".join(response.xpath(
                            '//*[@id="info"]/span[2]/span/a/text()').extract())
                    elif i == 2:
                        item['music_singer'] = "|".join(response.xpath(
                            '//*[@id="info"]/span[3]/span/a/text()').extract())
                    else:
                        item['music_singer'] = ""
                if "发行时间" in info[i]:  # release date, via regex on the raw HTML
                    nbsp = re.findall(
                        r"<span class=\"pl\">发行时间:</span>(.*?)<br>", content, re.S)
                    item['music_time'] = "".join(nbsp).replace(
                        "\xa0", "").replace("\n", "").replace(" ", "")
            try:
                item['music_rating'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/strong/text()').extract())
                item['music_votes'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/div/div[@class="rating_sum"]/a/span/text()').extract())
            except Exception as error:
                item['music_rating'] = '0'
                item['music_votes'] = '0'
                self.logger.error(error)
            item['music_tags'] = "|".join(response.xpath(
                '//*[@id="db-tags-section"]/div/a/text()').extract())
            item['music_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)

    def parse_review(self, response):
        try:
            item = MusicReviewItem()
            item['review_title'] = "".join(response.xpath(
                '//*[@property="v:summary"]/text()').extract())
            content = "".join(response.xpath(
                '//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace("\n", " ")
            item['review_author'] = "".join(response.xpath(
                '//*[@property="v:reviewer"]/text()').extract())
            item['review_music'] = "".join(response.xpath(
                '//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(response.xpath(
                '//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)
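
Note that parse_review in musicspider.py is almost a line-for-line copy of the one in videospider.py. One way to remove the duplication (a sketch, not part of the original project) is a shared mixin that each spider inherits, declaring only its item class and subject field:

# reviewmixin.py -- hypothetical shared module inside the multi package.
class ReviewParserMixin(object):
    """Shared review parsing. Subclasses set item_class (the Item to build)
    and subject_field ('review_music' or 'review_video')."""
    item_class = None
    subject_field = None

    def parse_review(self, response):
        item = self.item_class()
        item['review_title'] = "".join(response.xpath(
            '//*[@property="v:summary"]/text()').extract())
        content = "".join(response.xpath(
            '//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
        item['review_content'] = content.strip().replace("\n", " ")
        item['review_author'] = "".join(response.xpath(
            '//*[@property="v:reviewer"]/text()').extract())
        item[self.subject_field] = "".join(response.xpath(
            '//*[@class="main-hd"]/a[2]/text()').extract())
        item['review_time'] = "".join(response.xpath(
            '//*[@class="main-hd"]/p/text()').extract())
        item['review_url'] = response.url
        yield item

MusicSpider would then be declared as class MusicSpider(ReviewParserMixin, CrawlSpider) with item_class = MusicReviewItem and subject_field = 'review_music', and drop its own parse_review; VideoSpider likewise.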
Create the launcher file run.py:
# -*- coding: utf-8 -*-
from scrapy import cmdline

# Caveat: cmdline.execute() calls sys.exit() when the crawl finishes, so the
# second line below never runs -- this script only starts the music spider.
cmdline.execute("scrapy crawl music".split())
cmdline.execute("scrapy crawl video".split())
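
Because of the sys.exit() behaviour noted above, a more reliable launcher drives both crawls through CrawlerProcess. A sketch, assuming the module paths used earlier in this post:

# run_all.py -- hypothetical launcher that runs both spiders in one process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from multi.spiders.musicspider import MusicSpider
from multi.spiders.videospider import VideoSpider

process = CrawlerProcess(get_project_settings())  # reuse settings.py
process.crawl(MusicSpider)  # schedule the music spider
process.crawl(VideoSpider)  # schedule the video spider
process.start()             # blocks until both crawls finish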