PHP头条
热点:

curl采集 根据关键词 获取雅虎竞价排名


之前写过一篇用 curl 批处理采集数据的文章,这里贴上完整版本。代码很简单,废话不多说,直接上代码。本人是新手,欢迎各位指教!

代码只实现到"获取链接"这一步;至于排名,返回数组的键(从 0 开始)加 1 就是对应链接的排名。

  1 <?php
  2 /**
  3  * Based on yahoo access to data
  4  *
  5  * @author chujiu <527891885@qq.com>
  6  * @copyright 2014.04.26 By chujiu
  7  * @version 0.2.1 2014.04.26
  8  */
  9 
 /**
  * Fetches several Yahoo! Japan search result pages in parallel with the cURL
  * multi interface and extracts the result links from the returned HTML.
  *
  * The position of a link in the array returned by exec_curl_get_data()
  * reflects the order in which it appeared across the fetched pages.
  */
 class DataCollectionRank {

     /** Step between consecutive values of the `b` (result offset) query parameter. */
     const   PAGE = 10;

     /** Directory prefixed to file names by _writeFile(); assigned by exec_curl_get_data(). */
     public  $path = '';

     /** Largest result offset requested; offsets run 1, 1 + PAGE, ... while <= this value. */
     public  $main = 91;

     /**
      * Create one cURL easy handle per result offset and attach each to a new multi handle.
      *
      * @param string $keyword Search phrase; URL-encoded before being placed in the query string.
      * @return array|string '' when no usable keyword is given, otherwise
      *                      array('handle' => array(offset => array('ch' => handle, 'url' => string)),
      *                            'mh' => multi handle).
      */
     private function _gather_data($keyword) {
         // Compare against '' explicitly: empty() would also reject the valid keyword "0".
         if (!is_scalar($keyword) || (string) $keyword === '') {
             return '';
         }

         // The encoded keyword is identical for every page, so compute it once.
         $encoded = urlencode((string) $keyword);

         // 'handle' always exists, even when the offset loop below runs zero times.
         $chs = array('handle' => array());
         $mh = curl_multi_init();

         for ($i = 1; $i <= $this->main; $i += self::PAGE) {
             $url = 'http://search.yahoo.co.jp/search?p='.$encoded.'&tid=top_ga1_sa&ei=UTF-8&aq=-1&oq='.$encoded.'&pstart=1&fr=top_ga1_sa&b='.$i;
             $ch = curl_init();
             curl_setopt_array($ch, array(
                 CURLOPT_URL => $url,
                 CURLOPT_HEADER => false,
                 CURLOPT_SSL_VERIFYPEER => false,
                 CURLOPT_RETURNTRANSFER => true,
                 CURLOPT_TIMEOUT => 30,
                 CURLOPT_AUTOREFERER => true
                 )
             );
             curl_multi_add_handle($mh, $ch); // register the transfer with the batch
             $chs['handle'][$i]['ch'] = $ch;
             $chs['handle'][$i]['url'] = $url;
         }

         $chs['mh'] = $mh;
         return $chs;
     }

     /**
      * Run all page requests for a keyword and return the links found in the responses.
      *
      * Failed transfers are appended to "get_rank_log.txt" inside $path and are
      * skipped when collecting links.
      *
      * @param string $keyword Search phrase.
      * @param string $path    Directory used for the error log; stored in $this->path.
      * @return array|string '' when the keyword is empty; otherwise a zero-indexed
      *                      array of URL-decoded links (empty when nothing was found).
      */
     public function exec_curl_get_data($keyword, $path) {
         $this->path = $path;
         $chs = $this->_gather_data($keyword);
         if (!is_array($chs)) {
             return '';
         }
         $mh = $chs['mh'];
         $error = '';

         // Drive the transfers. curl_multi_select() blocks until there is socket
         // activity, so the loop does not spin at 100% CPU while waiting.
         $active = null;
         do {
             do {
                 $mrc = curl_multi_exec($mh, $active);
             } while ($mrc === CURLM_CALL_MULTI_PERFORM);

             if ($mrc !== CURLM_OK) {
                 // A multi-level failure cannot be recovered by retrying; stop instead of looping forever.
                 $error .= "\n".'error提示:curl_multi_exec failed with code '.$mrc.'--------时间:'.date('Y-m-d H:i:s')."\n";
                 break;
             }
             if ($active > 0 && curl_multi_select($mh, 1.0) === -1) {
                 // select() could not wait on the sockets; back off briefly before polling again.
                 usleep(100000);
             }
         } while ($active > 0);

         // Read the completion messages so every finished transfer's result code is known.
         $results = array(); // offset => CURLE_* result code
         while (($info = curl_multi_info_read($mh)) !== false) {
             if ($info['msg'] !== CURLMSG_DONE) {
                 continue;
             }
             foreach ($chs['handle'] as $offset => $entry) {
                 if ($entry['ch'] === $info['handle']) {
                     $results[$offset] = $info['result'];
                     break;
                 }
             }
         }

         // Collect bodies of successful transfers, log the failed ones, release every handle.
         $pages = array();
         foreach ($chs['handle'] as $offset => $entry) {
             $message = curl_error($entry['ch']);
             if ($message === '' && isset($results[$offset]) && $results[$offset] !== CURLE_OK) {
                 $message = 'cURL error code '.$results[$offset];
             }

             if ($message !== '') {
                 $error .= "\n".'error提示:'.$message.'-------URL:'.$entry['url'].'--------时间:'.date('Y-m-d H:i:s')."\n";
             } else {
                 $pages[$offset] = curl_multi_getcontent($entry['ch']);
             }

             curl_multi_remove_handle($mh, $entry['ch']);
             curl_close($entry['ch']);
         }
         curl_multi_close($mh);

         if ($error !== '') {
             $this->_writeFile('get_rank_log.txt', $error, 'ab+');
         }

         // Accumulate links as an array: joining with '|' and splitting again would
         // break any decoded URL that itself contains '|'.
         $links = array();
         foreach ($pages as $html) {
             if (!empty($html)) {
                 $links = array_merge($links, $this->_get_keyword_link_preg($html));
             }
         }
         return $links;
     }

     /**
      * Extract the href targets from the organic-results part of one result page.
      *
      * Takes the markup between '<div id="web">' and '<div id="posS" class="spns">',
      * drops everything from '<div id="pg">' or '<div id="rel">' onwards, removes
      * <em>...</em> spans, and URL-decodes each remaining href value.
      *
      * @param string $str Raw HTML of one result page.
      * @return array List of URL-decoded links; empty when the expected markup is absent.
      */
     private function _get_keyword_link_preg ($str) {
         if (empty($str)) {
             return array();
         }

         $afterWeb = explode('<div id="web">', $str);
         if (!isset($afterWeb[1])) {
             // Marker missing (layout change, error page, ...): nothing to extract.
             return array();
         }
         $beforeAds = explode('<div id="posS" class="spns">', $afterWeb[1]);

         // Patterns are applied in order, each to the result of the previous one.
         $cleaned = preg_replace(
             array(
                 '#<div id=\"pg\">[\s\S]+#',
                 '#<div id=\"rel\">[\s\S]+#',
                 '#<em>[\s\S]+?</em>#',
             ),
             '',
             $beforeAds[0]
         );
         if ($cleaned === null) {
             // preg_replace() reports a PCRE failure with null.
             return array();
         }

         if (!preg_match_all('#href=\"(.*?)\">#', $cleaned, $matches)) {
             return array();
         }
         return array_map('urldecode', $matches[1]);
     }

     /**
      * Write $data to "$this->path/$fileName". The write is best effort: if the
      * file cannot be opened the method returns without writing.
      *
      * @param string $fileName File name relative to $this->path.
      * @param string $data     Bytes to write.
      * @param string $method   fopen() mode; with "rb+" the file is truncated to strlen($data) after writing.
      * @param int    $iflock   Truthy to take an exclusive lock while writing.
      * @param int    $check    Truthy to terminate the script with '403 Forbidden!' when the path contains '..'.
      * @param int    $chmod    Truthy to chmod the file to 0777 afterwards.
      */
     public function _writeFile($fileName, $data, $method="rb+", $iflock=1, $check=1, $chmod=1){
         $file = $this->path.'/'.$fileName;

         // Path-traversal guard; note that it ends the whole script rather than returning.
         if ($check && strpos($file, '..') !== false) {
             exit('403 Forbidden!');
         }

         @touch($file);
         $handle = @fopen($file, $method);
         if ($handle === false) {
             // Nothing to lock, write or close when the file could not be opened.
             return;
         }

         if ($iflock) {
             @flock($handle, LOCK_EX);
         }
         @fwrite($handle, $data);
         if ($method == "rb+") {
             // "rb+" starts writing at offset 0 without truncating; cut off any old tail.
             ftruncate($handle, strlen($data));
         }
         fclose($handle); // closing the stream also releases the lock

         // NOTE(review): 0777 makes the file world-writable - confirm this is intended.
         $chmod && @chmod($file, 0777);
     }
 }
119 ?>

 

 /**
  * Remove duplicate rows from a two-dimensional array and return the survivors
  * as array('keyword' => first field, 'domain' => second field) records.
  *
  * Two rows are duplicates when all of their fields, compared as strings, are equal.
  * Result keys are kept compatible with the historical array_flip()-based
  * implementation: entries appear in order of first occurrence, and each entry's
  * key is the zero-based position of the LAST occurrence of that row in the input.
  *
  * @param array $array List of rows; each row is an array whose first two values
  *                     are the keyword and the domain.
  * @return array Deduplicated records; 'domain' is null when a row has no second
  *               field and 'keyword' is '' when a row is empty.
  */
 function array_unique_fb($array){
     $firstSeen = array(); // signature => string fields, in first-occurrence order
     $lastIndex = array(); // signature => position of the most recent occurrence
     $position = 0;

     foreach ($array as $row) {
         $fields = array_map('strval', array_values($row));
         // serialize() keeps field boundaries intact, so commas inside a value can
         // neither corrupt the output nor make different rows look identical.
         $signature = serialize($fields);
         if (!isset($firstSeen[$signature])) {
             $firstSeen[$signature] = $fields;
         }
         $lastIndex[$signature] = $position++;
     }

     $data = array();
     foreach ($firstSeen as $signature => $fields) {
         $data[$lastIndex[$signature]] = array(
             'keyword' => isset($fields[0]) ? $fields[0] : '',
             'domain'  => isset($fields[1]) ? $fields[1] : null,
         );
     }
     return $data;
 }

 

 

www.phpzy.comtrue/php/9121.htmlTechArticlecurl采集 根据关键词 获取雅虎竞价排名 之前写过curl批处理采集数据,这里贴上完整版本,代码很简单,废话不说,上代码,新手欢迎指教!!! 代码只写到 获取到链接了,至于排名...

相关文章

    暂无相关文章
相关频道:

PHP之友评论

今天推荐