PHP头条
热点:

PHP制作百度词典查词采集器


百度dict 采集样本

这是我写的一个小工具,用来采集百度词典(dict)翻译后的全部结果数据,项目里还附带了 13.5 万条单词库和一个简单的采集示例。这里把主要的类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict ,有需要的直接 fork 就可以了。这东西用的人很少,用得上的兄弟尽管拿走。

<?php
/**
 * dict.class.php - scrapes dictionary/translation results from Baidu Dict
 *
 * @copyright      (C) 2014 widuu
 * @license       http://www.widuu.com
 * @lastmodify     2014-2-15
 */
 
 
// Declare the response as UTF-8 HTML. "utf-8" is the registered charset label
// ("utf8" is not), and the guard avoids a "headers already sent" warning when
// this file is included after output has started.
if (!headers_sent()) {
	header("Content-Type: text/html; charset=utf-8");
}
/**
 * Scrapes the Baidu Dict result page for a word and extracts its parts
 * (phonetic symbols, audio URLs, example sentences, definitions, synonyms, phrases).
 *
 * The page is downloaded once per content() call and shared by all parsers.
 */
class Dict{

	/** @var string Word currently being looked up; set by content(). */
	private $word;

	/** @var string|null Page fetched for the current word; null until the first fetch. */
	private $page = null;

	// Maximum number of phrase entries returned by getPhrase().
	private static $num = 10;

	public function __construct(){}


	/**
	 * Looks up a word and returns everything scraped from its result page.
	 *
	 * @param string $word Word to look up.
	 * @return array array(
	 *     "symbol"  => phonetic symbols, array('en' => ..., 'us' => ...)
	 *     "pro"     => pronunciation URLs, array('en' => ..., 'us' => ...)
	 *     "example" => example sentences
	 *     "explain" => concise definitions
	 *     "synonym" => synonyms / antonyms
	 *     "phrase"  => phrases
	 * )
	 */
	public function content($word){
		$this->word = $word;
		// Forget the page of the previous lookup: each call fetches fresh data, exactly once.
		$this->page = null;

		return array(
			"symbol"  => $this->Pronounced(),	// phonetic symbols
			"pro"     => $this->getSay(),		// pronunciation
			"example" => $this->getExample(),	// example sentences
			"explain" => $this->getExplain(),	// concise definitions
			"synonym" => $this->getSynonym(),	// synonyms / antonyms
			"phrase"  => $this->getPhrase(),	// phrases
		);
	}


	/**
	 * Downloads the result page for the current word with cURL.
	 *
	 * The page is cached on the instance so the six parsers called by content()
	 * share a single HTTP request instead of issuing one each.
	 *
	 * @return string Page body, or an empty string when the transfer failed
	 *                (the cURL error is echoed, as before).
	 */
	private function getContent(){
		if ($this->page !== null) {
			return $this->page;
		}

		$useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
		// Encode the word so spaces and non-ASCII characters produce a valid query string.
		$url = "http://dict.baidu.com/s?wd=".urlencode((string) $this->word);

		$ch = curl_init();
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_HTTPGET, 1);
		curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);
		$result = curl_exec($ch);
		// The handle is $ch; the previous code queried an undefined $curl here.
		if (curl_errno($ch)) {
			echo 'Errno'.curl_error($ch);
		}
		curl_close($ch);

		// curl_exec() returns false on failure; cache '' so the parsers do not retry the request.
		$this->page = is_string($result) ? $result : '';
		return $this->page;
	}


	/**
	 * Builds the array('en' => ..., 'us' => ...) pair from the first two captures
	 * of a preg_match_all() result; a missing capture becomes null.
	 *
	 * @param array $matches Match array filled by preg_match_all().
	 * @return array
	 */
	private function firstTwoMatches(array $matches){
		return array(
			'en' => isset($matches[1][0]) ? $matches[1][0] : null,
			'us' => isset($matches[1][1]) ? $matches[1][1] : null,
		);
	}


	/**
	 * Extracts the phonetic symbols (the text of the "EN-US" <b> elements).
	 *
	 * @return array array('en' => first match, 'us' => second match); null when absent.
	 */
	private function Pronounced(){
		$data = $this->getContent();
		preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui", $data, $pronounced);
		return $this->firstTwoMatches($pronounced);
	}

	/**
	 * Extracts the pronunciation URLs (the values of the url="..." attributes).
	 *
	 * @return array array('en' => first match, 'us' => second match); null when absent.
	 */
	private function getSay(){
		$data = $this->getContent();
		preg_match_all("/url=\"(.*)\"/Ui", $data, $pronounced);
		return $this->firstTwoMatches($pronounced);
	}

	/**
	 * Extracts the example sentences from the page's `var example_data = ...;` script value.
	 *
	 * Each innermost token list is joined with spaces into one sentence, and the
	 * sentences are returned in chunks of two (presumably a sentence followed by
	 * its translation; confirm against a live page).
	 *
	 * @return array List of array(sentence, sentence) chunks; empty when the page has no example data.
	 */
	private function getExample(){
		$data = $this->getContent();
		preg_match_all("/var example_data = (.*)\]\;/Us", $data, $example);
		if (!isset($example[1][0])) {
			return array();
		}

		$sentences = array();
		// Normalise the leading brackets, then split on "[[[" (groups) and "[[" (sentences).
		$groups = explode("[[[", "[[[".ltrim($example[1][0], "["));
		foreach ($groups as $group) {
			foreach (explode("[[", "[[".$group) as $sentence) {
				// The first string of every ["token", ...] entry is the token text.
				preg_match_all("/\[\"(.*)\",/Us", "[".$sentence, $match);
				if (!empty($match[1])) {
					// Separator first: the reversed argument order is rejected by PHP 8.
					// Collecting into an array also keeps sentences containing "@" intact.
					$sentences[] = implode(" ", $match[1]);
				}
			}
		}
		return array_chunk($sentences, 2);
	}

	/**
	 * Extracts the concise definitions.
	 *
	 * NOTE(review): the three patterns below look damaged. The first has no
	 * closing delimiter, and the other two read `(?P.*)` where the code expects
	 * the named groups "adj", "name", "tag" and "word". The HTML tags and group
	 * names were most likely stripped when this file was extracted from a web
	 * page. As written they do not compile, so both result arrays stay empty.
	 * Restore them from the upstream project before relying on this method.
	 *
	 * @return array array("x" => array(adj => name), "b" => array(tag => word))
	 */
	private function getExplain(){
		$data = $this -> getContent();
		preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\/Us",$data,$explain);
		$r_data = $explain[1][0];
		preg_match_all("/\\(?P.*)\<\/strong\>\(?P.*)\<\/span\>\<\/p\>/Us", $r_data, $a_data);
		preg_match_all("/\(?P[^\>]+)\:\(?P.*)\<\/a\>\<\/span\>/Us", $r_data, $b_data);
		
		$result = array();
		foreach ($a_data["adj"] as $key => $value) {
			$result[$value] = $a_data["name"][$key];
		}
		
		$word_b = array();
		foreach ($b_data["tag"] as $key => $value) {
			$word_b[$value] = strip_tags($b_data["word"][$key]);
		}
		
		$result_data = array("x" => $result,"b" => $word_b);

 		return $result_data;
	}


	/**
	 * Extracts synonyms and antonyms.
	 *
	 * NOTE(review): this method looks damaged by the same tag stripping. The
	 * explode() separator is empty (a ValueError on PHP 8, false plus a warning
	 * before that), the patterns read `(?P.*)` / `(?.*)` where the code expects
	 * the groups "adj", "content" and "title", and the preg_replace() pattern
	 * "<>" is empty. Restore them from the upstream project before relying on
	 * this method.
	 *
	 * @return array Per the original note: array(0 => synonyms, 1 => antonyms), usually nested.
	 */
	private function getSynonym(){
		$data = $this -> getContent();
		preg_match_all("/id=\"en\-syn\-ant\"\>(.*)/Us",$data,$synonym);
		$content = $synonym[1][0];
		$data1 = explode("", $content);
		$result = array();
		$data2 = array();
		foreach ($data1 as $key => $value) {
			preg_match_all("/\(?P.*)\ \;\<\/strong\>\<\/p\>\\(?.*)\<\/ul\>/Us", $value, $r_data);
			$data2[$key]["adj"] = $r_data["adj"];
			$data2[$key]["content"] = $r_data["content"];
		}

		foreach ($data2 as $key => $value) {
			foreach ($value["content"] as $k => $v) {
				if(!empty($v)){
					preg_match_all("/\\(?P.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $v, $v_data);
					foreach ($v_data['title'] as $m => $d) {
						$data = strip_tags(preg_replace("<>"," ", $v_data["value"][$m]));
						$result[$key][$value["adj"][$k]][$d] = $data;
					}
				}
			}
		}
 		return $result;
	}

	/**
	 * Extracts the phrase list (at most self::$num entries).
	 *
	 * Each "</dd>"-terminated entry is split on "</p>". The first paragraph, with
	 * tags and spaces removed, is the key, and the second is its meaning. Entries
	 * with more paragraphs nest the remaining ones, two at a time, as
	 * array(previous value, array(paragraph, paragraph)).
	 *
	 * @return array Map of phrase => meaning (string or nested array); empty when the page has no phrase section.
	 */
	private function getPhrase(){
		$data = $this->getContent();
		preg_match_all("/id=\"en\-phrase\"\>(.*)\<p class\=\"source\"\>/Us", $data, $phrase);
		if (!isset($phrase[1][0])) {
			return array();
		}

		$entries = array_slice(explode("</dd>", $phrase[1][0]), 0, self::$num);
		$result = array();
		foreach ($entries as $entry) {
			$parts = explode("</p>", $entry);
			$count = count($parts);
			if ($count <= 3) {
				// A fragment without a second paragraph (e.g. the tail after the
				// last </dd>) keeps its previous empty-string value, minus the notice.
				$meaning = isset($parts[1]) ? strip_tags($parts[1]) : '';
				$result[str_replace(" ", "", strip_tags($parts[0]))] = $meaning;
			} else {
				$head = array_slice($parts, 0, 2);
				// Everything except the trailing fragment and the values already in $head.
				$rest = array_diff(array_slice($parts, 0, $count - 1), $head);
				$name = trim(str_replace(" ", "", strip_tags($head[0])));
				$result[$name] = strip_tags($head[1]);
				foreach (array_chunk($rest, 2) as $pair) {
					foreach ($pair as $i => $text) {
						$pair[$i] = strip_tags($text);
					}
					// Wrap what is stored so far together with this extra pair.
					$result[$name] = array($result[$name], $pair);
				}
			}
		}
		return $result;
	}

	/**
	 * Serialises a value to its var_export() representation with slashes added.
	 *
	 * @param  array $data       Value to serialise.
	 * @param  bool  $isformdata When truthy (the default), the value is first passed
	 *                           through new_stripslashes(); pass 0 to skip that step.
	 * @return string The exported string, or '' when $data loosely equals ''.
	 */
	private function array2string($data, $isformdata = 1) {
		if ($data == '') {
			return '';
		}
		if ($isformdata) {
			$data = $this->new_stripslashes($data);
		}
		return addslashes(var_export($data, true));
	}

	/**
	 * Applies stripslashes() to a string, or recursively to every element of an array.
	 *
	 * @param mixed $string String or (nested) array to process.
	 * @return mixed The same structure with slashes stripped.
	 */
	private function new_stripslashes($string) {
		if (!is_array($string)) {
			return stripslashes($string);
		}
		foreach ($string as $key => $val) {
			$string[$key] = $this->new_stripslashes($val);
		}
		return $string;
	}

}

// $dict = new Dict();
// $result = $dict->content("express");</pre>
</p>
<p>以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。</p>
<p align="left"><div style="display:none;"><span id="url" itemprop="url">/php/25669.html</span><span id="indexUrl" itemprop="indexUrl">www.phpzy.com</span><span id="isOriginal" itemprop="isOriginal">true</span><span id="isBasedOnUrl" itemprop="isBasedOnUrl">/php/25669.html</span><span id="genre" itemprop="genre">TechArticle</span><span id="description" itemprop="description">PHP制作百度词典查词采集器 百度dict 采集样本 写的采集百度dict词典翻译后的所有结果数据,当然附带了13.5w单词库和采集简单的案例,这里我把写出的主要类dict.class.php放出来,项目地...</span></div></p></div>
<div class="art_confoot"><script src='http://www.phpzy.com/ad/art_confoot.js' type="text/javascript"></script></div>
<div class="page"></div>
<div class="post-related"> <h3 class="tit_3">相关文章</h3><div class="clearfix m_5">
<ul> <li><a href='/php/25668.html' title='分享下php5类中三种数据类型的区别' target='_blank'>分享下php5类中三种数据类型的区别</a></li><li><a href='/php/25667.html' title='js+php实现静态页面实时调用用户登陆状态的方法' target='_blank'>js+php实现静态页面实时调用用户登陆状态</a></li><li><a href='/php/25666.html' title='php读取csv数据保存到数组的方法' target='_blank'>php读取csv数据保存到数组的方法</a></li><li><a href='/php/25665.html' title='一个经典的PHP验证码类分享' target='_blank'>一个经典的PHP验证码类分享</a></li><li><a href='/phprm/25664.html' title='PHP下传图片功能' target='_blank'>PHP下传图片功能</a></li><li><a href='/phprm/25663.html' title='php汉字婚配' target='_blank'>php汉字婚配</a></li></ul></div>
</div>
<div class="option-btns">
<div class="art_confoot"><script src='http://www.phpzy.com/ad/xgart_confoot.js' type="text/javascript"></script></div>
</div>
		
		<div  id="related_reading" class="haman-box">
		<ul class="xgyd clearfix">
 <div class="xgyd_new"><span class="fast-nav-bar"><a href="http://www.phpzy.com/fenlei/list-11-1.html">今日最新</a></span><strong>相关阅读:</strong></div>
 <li><a href="/php/25668.html">分享下php5类中三种数据类型的区别</a></li>
<li><a href="/php/25667.html">js+php实现静态页面实时调用用户登陆状态的方</a></li>
<li><a href="/php/25666.html">php读取csv数据保存到数组的方法</a></li>
<li><a href="/php/25665.html">一个经典的PHP验证码类分享</a></li>
<li><a href="/phprm/25664.html">PHP下传图片功能</a></li>
<li><a href="/phprm/25663.html">php汉字婚配</a></li>

 </ul></div>
<footer><div class="hot_c"><span><b>相关频道:</b>
<a href="/fenlei/list-1-1.html" >php教程</a>  <a href="/fenlei/list-2-1.html" >php安全</a>  <a href="/fenlei/list-3-1.html" >php面试题</a>  <a href="/fenlei/list-4-1.html" >php框架</a>  <a href="/fenlei/list-6-1.html" >php入门</a>  <a href="/fenlei/list-7-1.html" >php问答</a>  <a href="/fenlei/list-8-1.html" >php应用</a>  <a href="/fenlei/list-10-1.html" >php职业规划</a>  <a href="/fenlei/list-11-1.html" >今日最新</a>  <a href="/fenlei/list-5-1.html" >php资讯</a>  </span></div> </footer> 
</div>
<div class="info_more" id="info_more"></div>
<div class="clearfix mt10 art_commentstop" id="commentTopAd"><script src='http://www.phpzy.com/ad/art_commentstop.js' type="text/javascript"></script></div>
<div id="hm_t_46468"></div>
<a name="comment"></a><div class="comment"  id="commentTopAd" itemprop="comment"><h3>PHP之友评论</h3></div>
 <div class="wb_comment_box"  id="commentsiframe"><script type="text/javascript" src='http://www.phpzy.com/ad/comments.js'></script></div>
</article>
<div class="syzp mt10" style="overflow:hidden;"><div class="tit_7">今天推荐</div><script type="text/javascript" src="http://www.phpzy.com/ad/left_foot_ad.js"></script></div>  
</div>
<aside class="right" id="main_right">
<div class="art_rightad1"><script src='http://www.phpzy.com/ad/art_rightad1.js' type="text/javascript"></script></div>
<div class="r_bd mt10 pb10">
       <div class="tit_5 tit_6">php教程最近更新</div>
         <ul id="bbsRank_1" class="rank_ul2 rank_dot" style="border-top:1px solid #AAC5F2;margin-top: -1px;">
	<li><a href="/php/25669.html">PHP制作百度词典查词采集器</a> </li>
<li><a href="/php/25668.html">分享下php5类中三种数据类型的区别</a> </li>
<li><a href="/php/25667.html">js+php实现静态页面实时调用用户登陆状态</a> </li>
<li><a href="/php/25666.html">php读取csv数据保存到数组的方法</a> </li>
<li><a href="/php/25665.html">一个经典的PHP验证码类分享</a> </li>

</ul></div>
<div class="art_rightad2 mt10"><script src='http://www.phpzy.com/ad/art_rightad2.js' type="text/javascript"></script></div>
<div class="r_bd mt10 pb10">
       <div class="tit_5 tit_6">热门推荐</div>
	   <ul id="bbsRank_1" class="rank_ul2 rank_dot" style="border-top:1px solid #AAC5F2;margin-top: -1px;">
	<li><a href="/php/2149.html">phpcurl大批量抓取抓几百条后提示连接被重</a> </li>
<li><a href="/php/10525.html">php调用google在线翻译功能</a> </li>
<li><a href="/php/15266.html">利用PHP和AJAX创建RSS聚合器的代码</a> </li>
<li><a href="/php/7411.html">基本数据结构和php内置函数实现</a> </li>
<li><a href="/php/315.html">php打开本地exe程序,js打开本地exe应用程序</a> </li>

	</ul>
        </div>
<div class="r_bd mt10 pb10"><div style="margin-top: 0pt;" class="tit_5 tit_6">有意思</div>
<script type="text/javascript" src="http://www.phpzy.com/ad/right_ad5.js"></script></div>
<div class="art_rightad3"><script src='http://www.phpzy.com/ad/art_rightad3.js' type="text/javascript"></script></div>
<div id="focus_look" class="instant-focus mt10"><div class="instant-focus-header clearfix"><h3>实时看点</h3><span>看啥好</span></div>
<script type="text/javascript" src="http://www.phpzy.com/ad/right_ad6.js"></script></div>
<div class="art_rightad4 mt10"><script src='http://www.phpzy.com/ad/art_rightad4.js' type="text/javascript"></script></div>
</aside></div></div>
<footer id="footer" class="div_body">
<script type="text/javascript" src="http://www.phpzy.com/ad/arc_foot_ad.js"></script>
<script type="text/javascript" src="http://www.phpzy.com/templets/js/foot.js"></script>
<div style="display:none;"><script src='http://www.phpzy.com/ad/tongji.js' type="text/javascript"></script></div>
<div id="roll"><a title="回顶部" id="roll_top" href="#top" style="opacity: 0.7;" target="_self" rel="nofollow"></a></div>
</footer>
<script type="text/javascript" src="http://www.phpzy.com/ad/maintop.js?131231"></script>
</body>
</html>