/**
 * Fetches the content behind $url through PHP's fopen() stream wrappers and
 * returns it unmodified (no HTML filtering is applied).
 *
 * @param string $url URL or path accepted by fopen().
 * @return string The complete content, or an empty string when the resource
 *                cannot be opened or read.
 */
function String_html($url)
{
$fp = fopen($url, "r");
// fopen() returns false on failure; looping on feof(false) would never end
// (PHP < 8) or raise a TypeError (PHP 8), so bail out early.
if ($fp === false) {
return "";
}
$OutString = stream_get_contents($fp);
fclose($fp);
return $OutString === false ? "" : $OutString;
}
/**
 * Requests $url from www.google.cn via googleHTML() and extracts the first
 * run of digits and commas that is wrapped in <b>...</b>.
 * NOTE(review): presumably this is the result count shown on the page;
 * confirm against the actual markup.
 *
 * @param string $url Request target passed through to googleHTML().
 * @return string|null The matched digits/commas, or null when the page
 *                     contains no such <b> element or could not be fetched.
 */
function match_google($url) {
$html = googleHTML("www.google.cn", $url);
if (!is_string($html) || $html === '') {
return null;
}
// Only the first match is needed, so preg_match() is sufficient; check its
// result instead of indexing blindly into an empty match array.
if (preg_match("/<b>([0-9,]+)<\/b>/", $html, $res) !== 1) {
return null;
}
return $res[1];
}
// Google does not answer requests made through PHP's fopen() URL wrapper,
// so the request is sent over a raw socket instead.
/**
 * Sends a browser-like HTTP/1.1 GET request to $host on port 80 and returns
 * the response body after chunked-decoding and gzip-decompression.
 *
 * @param string $host Host name to connect to (also sent as the Host header).
 * @param string $url  Request target placed verbatim on the request line.
 * @return string|false Decoded response body; an empty string when the
 *                      connection cannot be established; whatever unGzip()
 *                      returns when decompression fails.
 */
function googleHTML($host, $url) {
$httphead =
"GET ".$url." HTTP/1.1\r\n".
// HTTP/1.1 requires a Host header on every request.
"Host: ".$host."\r\n".
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n".
"Accept-Language: zh-cn,zh;q=0.5\r\n".
"User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/2008052906 Firefox/3.0\r\n".
// Imitates a browser User-Agent (copied from Firefox).
"Accept-Encoding: gzip\r\n".// Accept gzip compression to save bandwidth.
"Accept-Charset: utf-8;q=0.7,*;q=0.7\r\n".
"Connection: Close\r\n\r\n";
$sock = fsockopen($host,80);
if ($sock === false) {
return "";
}
fwrite($sock,$httphead,strlen($httphead));
// Discard the status line and headers: they end at the first empty line.
// (Counting a fixed number of lines breaks as soon as the server adds or
// removes a header.)
while (!feof($sock)) {
$line = fgets($sock);
if ($line === false || rtrim($line, "\r\n") === '') {
break;
}
}
// Everything after the blank line is the (possibly chunked/gzipped) body.
$buffer = stream_get_contents($sock);
fclose($sock);
if ($buffer === false) {
$buffer = "";
}
return unGzip(unChunked($buffer));
}
/**
 * Decompresses gzip-encoded data; anything that does not start with the gzip
 * signature (magic bytes 1F 8B followed by compression method 08) is
 * returned unchanged.
 *
 * @param string $content Possibly gzip-encoded data.
 * @return string|false The decompressed data, the input itself when it is
 *                      not gzip, or false when decompression fails.
 */
function unGzip($content){
$signature = "\x1F\x8B\x08";
if (!is_string($content) || strncmp($content, $signature, strlen($signature)) !== 0) {
return $content;
}
// gzdecode() understands the full gzip container, including optional header
// fields (file name, comment, extra data, header CRC) and the trailer.
if (function_exists('gzdecode')) {
return gzdecode($content);
}
// Legacy fallback for runtimes without gzdecode(): assumes the minimal
// 10-byte header with no optional fields.
return gzinflate(substr($content, 10));
}
/**
 * Decodes an HTTP "chunked" transfer-encoded body.
 *
 * The body is walked chunk by chunk using the declared hexadecimal sizes, so
 * payload bytes that merely look like a chunk-size line are preserved.
 * Input that is not a well-formed chunked body terminated by a zero-size
 * chunk is returned unchanged. Trailer fields after the last chunk are
 * ignored.
 *
 * @param string $content Raw response body.
 * @return string The reassembled payload, or $content itself when it is not
 *                a complete chunked body.
 */
function unChunked($content){
$length = strlen($content);
$offset = 0;
$decoded = '';
while ($offset < $length) {
$lineEnd = strpos($content, "\r\n", $offset);
if ($lineEnd === false) {
return $content;
}
$sizeLine = substr($content, $offset, $lineEnd - $offset);
// Drop optional chunk extensions ("size;name=value").
$semicolon = strpos($sizeLine, ';');
if ($semicolon !== false) {
$sizeLine = substr($sizeLine, 0, $semicolon);
}
$sizeLine = trim($sizeLine);
if (!preg_match('/^[0-9a-f]+$/i', $sizeLine)) {
return $content;
}
$size = hexdec($sizeLine);
if ($size == 0) {
// Zero-size chunk marks the end of the body.
return $decoded;
}
$dataStart = $lineEnd + 2;
if ($dataStart + $size > $length) {
// Declared size runs past the data: truncated or not chunked at all.
return $content;
}
$decoded .= substr($content, $dataStart, $size);
$offset = $dataStart + $size;
// Every chunk's data must be followed by CRLF.
if (substr($content, $offset, 2) !== "\r\n") {
return $content;
}
$offset += 2;
}
// No terminating zero-size chunk was seen.
return $content;
}