Scrapy是一款流行的网络爬虫框架,支持Python3。

建立项目

# 安装scrapy
pip3 install scrapy


# 创建项目,这时候scrapy会在当前目录新建项目与文件夹
scrapy startproject cn_proxy

# 进入项目目录,新建一个名为basic的爬虫
# (genspider的最后一个参数是要爬取的域名,这里先用web占位,生成后可在basic.py中修改)
cd cn_proxy
scrapy genspider basic web

# 查看目前结构
$ tree .
.
├── cn_proxy
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   └── settings.cpython-37.pyc
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-37.pyc
│       │   └── basic.cpython-37.pyc
│       └── basic.py
└── scrapy.cfg

4 directories, 12 files

编写爬虫

# 编写爬虫 ./cn_proxy/spiders/basic.py
# 使用xpath抽取IP、端口与位置,最后写入items列表后返回
# -*- coding: utf-8 -*-
import scrapy


class BasicSpider(scrapy.Spider):
    """Spider that scrapes the proxy table listed on cn-proxy.com.

    Every table row with at least three text cells produces one dict with
    the keys ``IP``, ``PORT`` and ``LOCATION`` (the first three cells, in
    that order).
    """

    name = 'basic'
    # Must match the host used in ``start_urls``. The generated placeholder
    # 'web' does not, so Scrapy's offsite filtering would treat the real site
    # as off-site for any follow-up request.
    allowed_domains = ['cn-proxy.com']
    start_urls = ['https://cn-proxy.com']

    def parse(self, response):
        """Extract the proxy entries from a downloaded listing page.

        Args:
            response: The Scrapy response for the page; it is queried with
                XPath for ``//tbody/tr`` rows.

        Returns:
            A list of dicts, one per usable row, each with the string
            values ``IP``, ``PORT`` and ``LOCATION``. The list is empty when
            the page contains no matching rows.
        """
        items = []
        for row in response.xpath('//tbody/tr'):
            # Evaluate the cell query once per row instead of once per field.
            cells = row.xpath('td/text()').extract()
            if len(cells) < 3:
                # A short row (e.g. spacer or malformed markup) would raise
                # IndexError and discard every item collected so far.
                continue
            ip, port, location = cells[:3]
            items.append({'IP': ip, 'PORT': port, 'LOCATION': location})
        return items

测试爬虫

$ scrapy parse --spider=basic https://www.cn-proxy.com
# 返回数据,说明我们的爬虫获取的数据是正常的

>>> STATUS DEPTH LEVEL 1 <<<
# Scraped Items  ------------------------------------------------------------
[{'IP': '117.191.11.107', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.111', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.111', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.110', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.105', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.113', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.77', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.78', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '112.35.56.134', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '101.4.136.34', 'LOCATION': '北京 Beijing', 'PORT': '81'},
 {'IP': '117.191.11.80', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.78', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.80', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '120.210.219.103', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '120.210.219.101', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.71', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '47.94.200.124', 'LOCATION': '浙江 Hangzhou', 'PORT': '3128'},
 {'IP': '149.129.70.226', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.108', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.113', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.75', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.79', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.77', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '39.137.69.10', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '116.114.19.204', 'LOCATION': 'Nei Mongol Hohhot', 'PORT': '443'},
 {'IP': '120.210.219.101', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '120.210.219.105', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '120.210.219.102', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '121.40.138.161', 'LOCATION': '浙江 Hangzhou', 'PORT': '8000'},
 {'IP': '60.205.188.24', 'LOCATION': '浙江 Hangzhou', 'PORT': '3128'},
 {'IP': '106.12.147.81', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '39.137.77.66', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '59.49.72.137', 'LOCATION': 'Shanxi Sheng Taiyuan Shi', 'PORT': '80'},
 {'IP': '39.137.69.7', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '47.94.230.42', 'LOCATION': '浙江 Hangzhou', 'PORT': '9999'},
 {'IP': '39.137.69.6', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '47.94.89.87', 'LOCATION': '浙江 Hangzhou', 'PORT': '3128'},
 {'IP': '117.191.11.106', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.104', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.103', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.101', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '119.41.236.180', 'LOCATION': 'Hainan Haikou', 'PORT': '8010'},
 {'IP': '117.191.11.102', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '39.137.69.7', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.109', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.72', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.73', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.112', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.73', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.103', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.72', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.105', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '39.137.77.66', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '120.210.219.104', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '120.210.219.104', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '120.210.219.102', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '120.210.219.103', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.101', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.110', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.71', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '117.191.11.109', 'LOCATION': '北京 Beijing', 'PORT': '80'},
 {'IP': '101.231.104.82', 'LOCATION': '上海 Shanghai', 'PORT': '80'},
 {'IP': '117.191.11.108', 'LOCATION': '北京 Beijing', 'PORT': '8080'},
 {'IP': '117.191.11.106', 'LOCATION': '北京 Beijing', 'PORT': '80'}]

# Requests  -----------------------------------------------------------------
[]
# 也可以将items列表保存为文件

# 将item列表保存为item.json文件至当前目录
$ scrapy crawl basic -o item.json

# 也可以保存为*.jl *.csv格式
$ scrapy crawl basic -o item.jl
$ scrapy crawl basic -o item.csv
Last modification:May 24th, 2019 at 04:34 pm