通过curl模拟多线程抓取

<?php
// Note: this approach issues all URL requests in one batch, but its drawback is
// that every request has to finish and return before the responses can be
// processed one by one.
$start = microtime(true);

header('Content-type:text/html;charset=utf-8');

// Pages fetched concurrently.
$arrs = [
    'https://www.yahoo.com/',
    'https://www.sohu.com/',
    'http://www.qq.com/',
    'http://www.sina.com.cn/',
    'http://www.163.com/',
];

// Extra request headers attached to every request (a desktop Chrome User-Agent).
$headers = [
    'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
];

// Multi handle that the individual easy handles get attached to.
$mh = curl_multi_init();

// Create one easy handle per URL, configure it, and attach it to the multi handle.
// $conn maps the URL's index in $arrs to its easy handle.
$conn = [];
foreach ($arrs as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, 1);   // return the body instead of printing it
    curl_setopt($conn[$i], CURLOPT_HTTPHEADER, $headers);
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);           // do not include response headers in the body
    curl_setopt($conn[$i], CURLOPT_TIMEOUT, 20);         // whole-transfer timeout, in seconds

    // strpos() returns 0 when the URL *starts with* the needle, and 0 is falsy,
    // so a bare `if (strpos(...))` never matches an HTTPS URL. Compare against 0.
    if (strpos($url, 'https://') === 0) {
        // The previous condition never matched, so certificate verification was
        // effectively enabled all along; keep it enabled explicitly rather than
        // turning it off now that the branch is reachable.
        curl_setopt($conn[$i], CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($conn[$i], CURLOPT_SSL_VERIFYHOST, 2);
    }
    curl_multi_add_handle($mh, $conn[$i]);
}

/*
 * Naive variant: spinning on curl_multi_exec() like this easily drives CPU usage to 100%.

do {
    $n=curl_multi_exec($mh,$active);
} while ($active);
 *
 */

// Rewritten variant
/*
do {
    $mrc = curl_multi_exec($mh,$active);
}while($mrc == CURLM_CALL_MULTI_PERFORM);

while ($active and $mrc == CURLM_OK){
    if (curl_multi_select($mh) != -1) {
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }
}
*/

// Simplest approach: drive the transfers, then block until there is activity.
$running = 0;
do {
    $status = curl_multi_exec($mh, $running);
    if ($status === CURLM_CALL_MULTI_PERFORM) {
        // Old libcurl versions ask to be called again immediately.
        continue;
    }
    if ($status !== CURLM_OK) {
        // The multi stack reported an error; stop instead of looping forever.
        break;
    }
    // curl_multi_select() returns -1 when the underlying select fails; without
    // a pause the loop would degrade into the busy spin described above.
    if ($running > 0 && curl_multi_select($mh) === -1) {
        usleep(1000);
    }
} while ($running > 0);


// Collect the response bodies and release every cURL resource.
$res = [];
foreach ($arrs as $i => $url) {
    $res[$i] = curl_multi_getcontent($conn[$i]);
    var_dump($res[$i]);
    // Detach the easy handle from the multi handle before closing it.
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
    // All HTTP requests have finished by now; append each body to the log in turn.
    file_put_contents('curl_multi.log', $res[$i]."\r\n\r\n\r\n\r\n", FILE_APPEND);
}
// The multi handle is no longer needed once every easy handle has been detached.
curl_multi_close($mh);

$end = microtime(true) - $start;

echo '<br/>';
echo $end; // average: 10.091157913208s

常用DNS

Cloudflare
1.1.1.1
1.0.0.1

SDNS/CNNIC
1.2.4.8
210.2.4.8

Level3
4.2.2.1
4.2.2.2

google
8.8.8.8
8.8.4.4

Comodo
8.26.56.26
8.20.247.20

Quad9/IBM
9.9.9.9
149.112.112.112

FreeDNS
37.235.1.174
37.235.1.177

Verisign
64.6.64.6
64.6.65.6

Yandex
77.88.8.8
77.88.8.1

Freenom
80.80.80.80
80.80.81.81

114DNS
114.114.114.114
114.114.115.115

广东联通
116.116.116.116
221.5.88.88

OneDNS
117.50.11.11
117.50.22.22

腾讯
119.29.29.29
182.254.116.116

百度 /CloudXNS
180.76.76.76
124.251.124.251

OpenDNS
208.67.222.222
208.67.220.220

DynDNS
216.146.35.35
216.146.36.36

江苏电信
218.2.2.2
218.4.4.4

电信联通
222.88.88.88
119.6.6.6

阿里云
223.5.5.5
223.6.6.6

Ubuntu 14安装Python 3.6

安装python3.6

sudo add-apt-repository ppa:jonathonf/python-3.6
sudo apt-get update
sudo apt-get install python3.6

如果提示没有add-apt-repository

sudo apt-get install software-properties-common

用python3.6替换python2.7

sudo rm /usr/bin/python
sudo ln -s /usr/bin/python3.6 /usr/bin/python

python --version