您当前的位置: 首页 > 网站编程 > PHP教程 > PHP抓取、分析国内视频网站的视频信息工具类

PHP抓取、分析国内视频网站的视频信息工具类

作者:不详 来源:网络 发布时间: 2014-08-01 23:54 点击:
VideoUrlParser是一款基于PHP根据视频URL抓取视频信息的工具,支持优酷、土豆、酷六、56、乐视、搜狐、腾讯、新浪。 使用方法: 代码如下: require_once "VideoUrlParser.class.php"; $url = "http://v.youku.com/v_show/id_XMjkwMzc0Njg4.html"; $info = VideoUrlParser::

PHP抓取、分析国内视频网站的视频信息工具类

  VideoUrlParser是一款基于PHP根据视频URL抓取视频信息的工具,支持优酷、土豆、酷六、56、乐视、搜狐、腾讯、新浪。

  使用方法:

  代码如下:

  require_once "VideoUrlParser.class.php";

  $url = "http://v.youku.com/v_show/id_XMjkwMzc0Njg4.html";

  $info = VideoUrlParser::parse($url);

  print_r($info);

  说明:先引入工具类文件VideoUrlParser.class.php,$url变量的值为视频页的地址;parse()返回的$info是一个数组(解析失败时为false),因此这里用print_r输出,直接echo只会得到"Array"。

  附:info含有的几个值,分别是img(用于视频缩略图),title(视频标题),url(地址),swf(视频swf播放地址)。我只用到了img和swf地址。具体的可以根据自己的需要进行调整。

  VideoUrlParser类源码:

  代码如下:<?php

  /**

  * Video

  *

  * @package

  * @version 1.2

  * @copyright 2005-2011 HDJ.ME

  * @author Dijia Huang <huangdijia@gmail.com>

  * @license PHP Version 3.0 {@link http://www.php.net/license/3_0.txt}

  *

  * Usage

  * require_once "VideoUrlParser.class.php";

  * $urls[] = "http://v.youku.com/v_show/id_XMjI4MDM4NDc2.html";

  * $urls[] = "http://www.tudou.com/playlist/p/l13087099.html";

  * $urls[] = "http://www.tudou.com/programs/view/ufg-A3tlcxk/";

  * $urls[] = "http://v.ku6.com/special/show_4926690/Klze2mhMeSK6g05X.html";

  * $urls[] = "http://www.56.com/u68/v_NjI2NTkxMzc.html";

  * $urls[] = "http://www.letv.com/ptv/vplay/1168109.html";

  * $urls[] = "http://video.sina.com.cn/v/b/46909166-1290055681.html";

  *

  * foreach($urls as $url){

  * $info = VideoUrlParser::parse($url);

  * //var_dump($info);

  * echo "<a href='{$info['url']}' target='_new'>{$info['title']}</a>";

  * echo "<br />";

  * echo $info['object'];

  * echo "<br />";

  * }

  *

  *

  *

  * //优酷

  * http://v.youku.com/v_show/id_XMjU0NjY4OTEy.html

  * <embed src="http://player.youku.com/player.php/sid/XMjU0NjY4OTEy/v.swf" quality="high" width="480" height="400" align="middle" allowScriptAccess="sameDomain" type="application/x-shockwave-flash"></embed>

  *

  * //酷六

  * http://v.ku6.com/special/show_3917484/x0BMXAbgZdQS6FqN.html

  * <embed src="http://player.ku6.com/refer/x0BMXAbgZdQS6FqN/v.swf" quality="high" width="480" height="400" align="middle" allowScriptAccess="always" allowfullscreen="true" type="application/x-shockwave-flash"></embed>

  *

  * //土豆

  * http://www.tudou.com/playlist/p/a65929.html?iid=74905844

  * <embed src="http://www.tudou.com/l/A_0urj-Geec/&iid=74905844/v.swf" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" wmode="opaque" width="480" height="400"></embed>

  *

  * //56

  * http://www.56.com/u98/v_NTkyODY2NTU.html

  * <embed src="http://player.56.com/v_NTkyODY2NTU.swf" type="application/x-shockwave-flash" width="480" height="405" allowNetworking="all" allowScriptAccess="always"></embed>

  *

  * //新浪播客

  * http://video.sina.com.cn/v/b/46909166-1290055681.html

  * <embed src="http://you.video.sina.com.cn/api/sinawebApi/outplayrefer.php/vid=46909166_1290055681_b0K1GHEwDWbK+l1l

  Hz2stqkP7KQNt6nki2O0u1ehIwZYQ0/XM5GdZNQH6SjQBtkEqDhAQJ42dfcn0Rs/s.swf" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash" name="ssss" allowFullScreen="true" allowScriptAccess="always" width="480" height="370"></embed>

  *

  * //乐视

  * http://www.letv.com/ptv/vplay/1168109.html

  * <embed src="http://i3.imgs.letv.com/player/swfPlayer.swf?id=1168109&host=app.letv.com&vstatus=1&AP=1&logoMask=0&isShowP2p=0&autoplay=true" quality="high" scale="NO_SCALE" wmode="opaque" bgcolor="#000000" width="480" height="388" name="FLV_player" align="middle" allowscriptaccess="always" allowfullscreen="true" type="application/x-shockwave-flash" pluginspage="http://www.macromedia.com/go/getflashplayer">

  */

  class VideoUrlParser

  {

  /**
   * User-Agent header value sent by the fsockopen and cURL fetchers.
   * Kept on a single line: a line break inside the value would end the header early
   * and corrupt the rest of the HTTP request.
   */
  const USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10";

  /**
   * Pattern recognising the supported hosts. Capture group 1 is the host key that
   * parse() switches on. Dots are escaped so they only match a literal ".".
   */
  const CHECK_URL_VALID = '/(youku\.com|tudou\.com|ku6\.com|56\.com|letv\.com|video\.sina\.com\.cn|(?:my\.)?tv\.sohu\.com|v\.qq\.com)/';

  /**
   * Resolve a video page URL to its metadata by dispatching to the site-specific parser.
   *
   * @param string $url          Address of the video page.
   * @param mixed  $createObject When truthy, an 'object' key holding an <embed> snippet
   *                             built from the 'swf' value is added to the result.
   * @static
   * @access public
   * @return array|false|null Whatever the site parser returned (an array with the keys
   *                          img, title, url, swf, plus object when requested), or false
   *                          when the host is not recognised.
   */
  static public function parse($url='', $createObject=true){
      $lowerurl = strtolower($url);

      if (!preg_match(self::CHECK_URL_VALID, $lowerurl, $matches)) return false;

      switch ($matches[1]) {
          case 'youku.com':
              $data = self::_parseYouku($url);
              break;
          case 'tudou.com':
              $data = self::_parseTudou($url);
              break;
          case 'ku6.com':
              $data = self::_parseKu6($url);
              break;
          case '56.com':
              $data = self::_parse56($url);
              break;
          case 'letv.com':
              $data = self::_parseLetv($url);
              break;
          case 'video.sina.com.cn':
              $data = self::_parseSina($url);
              break;
          case 'my.tv.sohu.com':
          case 'tv.sohu.com':
          case 'sohu.com':
              $data = self::_parseSohu($url);
              break;
          case 'v.qq.com':
              $data = self::_parseQq($url);
              break;
          default:
              $data = false;
      }

      if ($data && $createObject) {
          // The swf URL comes from a remote page, so escape it before placing it in an
          // attribute. double_encode=false keeps an already-encoded "&amp;" intact.
          $swf = isset($data['swf']) ? htmlspecialchars((string) $data['swf'], ENT_QUOTES, 'UTF-8', false) : '';
          $data['object'] = '<embed src="' . $swf . '" quality="high" width="480" height="400" align="middle"'
              . ' allowNetworking="all" allowScriptAccess="always" type="application/x-shockwave-flash"></embed>';
      }

      return $data;
  }

  /**
   * Tencent Video (v.qq.com).
   *
   * Sample page URLs:
   *   http://v.qq.com/cover/o/o9tab7nuu0q3esh.html?vid=97abu74o4w3_0
   *   http://v.qq.com/play/97abu74o4w3.html
   *   http://v.qq.com/cover/d/dtdqyd8g7xvoj0o.html
   *   http://v.qq.com/cover/d/dtdqyd8g7xvoj0o/9SfqULsrtSb.html
   * Sample player URL:
   *   http://imgcache.qq.com/tencentvideo_v1/player/TencentPlayer.swf?_v=20110829&vid=97abu74o4w3&autoplay=1&list=2&showcfg=1&tpid=23&title=%E7%AC%AC%E4%B8%80%E7%8E%B0%E5%9C%BA&adplay=1&cid=o9tab7nuu0q3esh
   *
   * NOTE(review): the regex escapes below were reconstructed from a copy whose
   * backslashes had been stripped; confirm them against a live page.
   *
   * @param string $url Video page URL.
   * @return array|false Array with img, title, url, swf; false when the page cannot be parsed.
   */
  private static function _parseQq($url){
      // A /play/ page is expected to contain a 'url=...' redirect target; continue with that page.
      if (preg_match('#/play/#', $url)) {
          $html = self::_fget($url);
          if (!$html || !preg_match('/url=([^"]+)/', $html, $matches)) return false;
          $url = $matches[1];
      }

      // Video id from the query string; the part after "_" is dropped.
      $vid = preg_match('/vid=([^_&#]+)/', $url, $matches) ? $matches[1] : '';

      $html = self::_fget($url);
      if (!$html) return false;

      // The flashvars assignment in the page script carries the player query string.
      if (!preg_match('/flashvars\s=\s"([^;]+)/s', $html, $matches)) return false;
      // The capture runs up to the ';', so it still carries the closing quote.
      $query = rtrim($matches[1], "\" \t\r\n");

      if (!$vid) {
          // Fall back to the default id in the page script: vid = vid || "xxxx";
          if (!preg_match('/vid\s?=\s?vid\s?\|\|\s?"(\w+)";/i', $html, $matches)) return false;
          $vid = $matches[1];
      }

      $query = str_replace('"+vid+"', $vid, $query);
      parse_str($query, $output);

      $cid = isset($output['cid']) ? $output['cid'] : '';

      $data = array();
      $data['img'] = "http://vpic.video.qq.com/{$cid}/{$vid}_1.jpg";
      $data['url'] = $url;
      $data['title'] = isset($output['title']) ? $output['title'] : null;
      $data['swf'] = "http://imgcache.qq.com/tencentvideo_v1/player/TencentPlayer.swf?".$query;
      return $data;
  }

   

  /**
   * Youku (youku.com).
   *
   * Sample page URL:   http://v.youku.com/v_show/id_XMjI4MDM4NDc2.html
   * Sample player URL: http://player.youku.com/player.php/sid/XMjU0NjI2Njg4/v.swf
   *
   * @param string $url Video page URL.
   * @return array|false Array with img, title, url, swf; false when no video id can be
   *                     found or the metadata request fails.
   */
  private static function _parseYouku($url){
      if (!preg_match('#id_(\w+)#', $url, $matches)) {
          // Playlist pages carry no id in the URL; read it from the page script instead.
          if (!preg_match('#v_playlist/#', $url)) return false;
          $html = self::_fget($url);
          if (!$html || !preg_match('#videoId2\s*=\s*\'(\w+)\'#', $html, $matches)) return false;
      }
      $videoId = $matches[1];

      $link = "http://v.youku.com/player/getPlayList/VideoIDS/{$videoId}/timezone/+08/version/5/source/out?password=&ran=2513&n=3";
      $retval = self::_cget($link);
      if (!$retval) return false;

      $json = json_decode($retval, true);
      // Metadata is best-effort: the player URL only needs the id.
      $entry = isset($json['data'][0]) && is_array($json['data'][0]) ? $json['data'][0] : array();

      $data = array();
      $data['img'] = isset($entry['logo']) ? $entry['logo'] : null;
      $data['title'] = isset($entry['title']) ? $entry['title'] : null;
      $data['url'] = $url;
      $data['swf'] = "http://player.youku.com/player.php/sid/{$videoId}/v.swf";
      return $data;
  }

  /**
   * Tudou (tudou.com).
   *
   * Single programme:
   *   http://www.tudou.com/programs/view/Wtt3FjiDxEE/
   *   http://www.tudou.com/v/Wtt3FjiDxEE/v.swf
   * Playlist item:
   *   http://www.tudou.com/playlist/p/a65718.html?iid=74909603
   *   http://www.tudou.com/l/G5BzgI4lAb8/&iid=74909603/v.swf
   *
   * NOTE(review): the regex escapes and back-references below were reconstructed from a
   * copy whose backslashes had been stripped; confirm them against a live page.
   *
   * @param string $url Video page URL.
   * @return array|false Array with img, title, url, swf; false when the page cannot be parsed.
   */
  private static function _parseTudou($url){
      if (!preg_match('#view/([-\w]+)/#', $url, $matches)) {
          // ---- playlist page ----
          if (strpos($url, '/playlist/') === false) return false;

          // An explicit but empty "iid=" value cannot identify a playlist item.
          if (strpos($url, 'iid=') !== false) {
              $quarr = explode('iid=', strtolower($url));
              if (empty($quarr[1])) return false;
          }

          $html = self::_fget($url);
          if (!$html) return false;
          $converted = iconv("GB2312", "UTF-8", $html);
          if ($converted === false) return false;
          $html = $converted;

          // Playlist code, current item id and the item list are all read from the page script.
          if (!preg_match('/lid_code\s=\slcode\s=\s[\'"]([^\'"]+)/s', $html, $m)) return false;
          $icode = $m[1];

          if (!preg_match('/iid\s=\s.*?\|\|\s(\d+)/sx', $html, $m)) return false;
          $iid = $m[1];

          // listData is a JavaScript literal, not JSON: strip whitespace, blank out
          // non-numeric unquoted values and quote the keys so json_decode can read it.
          $item = null;
          if (preg_match('/listData\s=\s(\[{.*}\])/sx', $html, $m)) {
              $find = array("/\n/", '/\s/', '/:[^\d"]\w+[^,]*,/i', '/({|,)(\w+):/');
              $replace = array('', '', ':"",', '\1"\2":');
              $json = json_decode(preg_replace($find, $replace, $m[1]));
              if (is_array($json) || is_object($json)) {
                  foreach ($json as $val) {
                      if (isset($val->iid) && $val->iid == $iid) {
                          $item = $val;
                          break;
                      }
                  }
              }
          }

          // Thumbnail and title are best-effort; the player URL only needs icode and iid.
          $data = array();
          $data['img'] = ($item !== null && isset($item->pic)) ? $item->pic : null;
          $data['title'] = ($item !== null && isset($item->title)) ? $item->title : null;
          $data['url'] = $url;
          $data['swf'] = "http://www.tudou.com/l/{$icode}/&iid={$iid}/v.swf";
          return $data;
      }

      // ---- single programme: metadata is read from the query string of the
      // Location header returned for the player URL ----
      $code = $matches[1];
      $ret = self::_fsget("/v/{$code}/v.swf", "www.tudou.com");
      if ($ret && preg_match("#\nLocation: (.*)\n#", $ret, $mat)) {
          // trim() removes the "\r" that (.*) picks up from a CRLF line ending.
          $location = urldecode(trim($mat[1]));
          $query = array();
          // parse_str() requires the result argument since PHP 8.
          parse_str((string) parse_url($location, PHP_URL_QUERY), $query);

          $data = array();
          $data['img'] = isset($query['snap_pic']) ? $query['snap_pic'] : null;
          $data['title'] = isset($query['title']) ? $query['title'] : null;
          $data['url'] = $url;
          $data['swf'] = "http://www.tudou.com/v/{$code}/v.swf";
          return $data;
      }

      return false;
  }

  /**
   * Ku6 (ku6.com).
   *
   * Sample page URLs:
   *   http://v.ku6.com/film/show_520/3X93vo4tIS7uotHg.html
   *   http://v.ku6.com/special/show_4926690/Klze2mhMeSK6g05X.html
   *   http://v.ku6.com/show/7US-kDXjyKyIInDevhpwHg...html
   * Sample player URL:
   *   http://player.ku6.com/refer/3X93vo4tIS7uotHg/v.swf
   *
   * NOTE(review): the regex escapes below were reconstructed from a copy whose
   * backslashes had been stripped; confirm them against a live page.
   *
   * @param string $url Video page URL.
   * @return array|false Array with img, title, url, swf; false when the URL shape is not
   *                     recognised or the page cannot be parsed.
   */
  private static function _parseKu6($url){
      if (preg_match("/show_/", $url)) {
          // ".../show_NNN/<id>.html": metadata comes from the JSON endpoint.
          if (!preg_match('#/([-\w]+)\.html#', $url, $matches)) return false;
          $id = $matches[1];

          // Fetch from a separate variable so the returned 'url' stays the page URL.
          $html = self::_fget("http://v.ku6.com/fetchVideo4Player/{$id}.html");
          if (!$html) return false;

          $json = json_decode($html, true);
          if (!$json) return false;

          $data = array();
          $data['img'] = isset($json['data']['picpath']) ? $json['data']['picpath'] : null;
          $data['title'] = isset($json['data']['t']) ? $json['data']['t'] : null;
          $data['url'] = $url;
          $data['swf'] = "http://player.ku6.com/refer/{$id}/v.swf";
          return $data;
      }

      if (preg_match('#show/#', $url)) {
          // ".../show/<id>.html": metadata is scraped from the page script.
          $html = self::_fget($url);
          if (!$html) return false;

          $str = preg_match('/ObjectInfo\s?=\s?([^\n]*)};/si', $html, $matches) ? $matches[1] : '';

          $data = array();

          // img
          $data['img'] = preg_match('/cover\s?:\s?"([^"]+)"/', $str, $matches) ? $matches[1] : null;

          // title: round-trip through json_decode to resolve JSON escape sequences
          $data['title'] = null;
          if (preg_match('/title"?\s?:\s?"([^"]+)"/', $str, $matches)) {
              $json = json_decode('{"title":"' . $matches[1] . '"}', true);
              $data['title'] = isset($json['title']) ? $json['title'] : null;
          }

          // url
          $data['url'] = $url;

          // query: the flashVars attribute value, with HTML-encoded ampersands restored
          if (!preg_match('/"(vid=[^"]+)"\sname="flashVars"/s', $html, $matches)) return false;
          $query = str_replace("&amp;", '&', $matches[1]);

          // protocol-relative player address
          if (!preg_match('#//player\.ku6cdn\.com[^"\']+#', $html, $matches)) return false;
          $data['swf'] = 'http:'.$matches[0].'?'.$query;
          return $data;
      }

      // Neither URL shape matched.
      return false;
  }

  /**
   * 56.com.
   *
   * Sample page URL:   http://www.56.com/u73/v_NTkzMDcwNDY.html
   * Sample player URL: http://player.56.com/v_NTkzMDcwNDY.swf
   *
   * @param string $url Video page URL.
   * @return array|false Array with img, title, url, swf; false when the URL carries no
   *                     video id or the metadata request fails.
   */
  private static function _parse56($url){
      if (!preg_match('#/v_(\w+)\.html#', $url, $matches)) return false;
      $id = $matches[1];

      $retval = self::_cget("http://vxml.56.com/json/{$id}/?src=out");
      if (!$retval) return false;

      $json = json_decode($retval, true);
      // Metadata is best-effort: the player URL only needs the id.
      $info = isset($json['info']) && is_array($json['info']) ? $json['info'] : array();

      $data = array();
      $data['img'] = isset($info['img']) ? $info['img'] : null;
      $data['title'] = isset($info['Subject']) ? $info['Subject'] : null;
      $data['url'] = $url;
      $data['swf'] = "http://player.56.com/v_{$id}.swf";
      return $data;
  }

  /**
   * Letv (letv.com).
   *
   * Sample page URL:   http://www.letv.com/ptv/vplay/1168109.html
   * Sample player URL: http://www.letv.com/player/x1168109.swf
   *
   * Title and thumbnail are read from the query string of the first v.t.sina.com.cn
   * link found in the page.
   *
   * @param string $url Video page URL.
   * @return array|false Array with img, title, url, swf; false when the page cannot be
   *                     fetched or the URL has no numeric vplay id.
   */
  private static function _parseLetv($url){
      if (!preg_match('#vplay/(\d+)#', $url, $idMatch)) return false;

      $html = self::_fget($url);
      if (!$html) return false;

      $share = array();
      if (preg_match('#http://v\.t\.sina\.com\.cn/([^\'"]*)#', $html, $matches)) {
          // parse_str() requires the result argument since PHP 8.
          parse_str((string) parse_url(urldecode($matches[0]), PHP_URL_QUERY), $share);
      }

      $data = array();
      $data['img'] = isset($share['pic']) ? $share['pic'] : null;
      $data['title'] = isset($share['title']) ? $share['title'] : null;
      $data['url'] = $url;
      $data['swf'] = "http://www.letv.com/player/x{$idMatch[1]}.swf";
      return $data;
  }

  /**
   * Sohu TV (tv.sohu.com / my.tv.sohu.com).
   *
   * Sample page URL: http://my.tv.sohu.com/u/vw/5101536
   *
   * Reads og:title, og:image and og:videosrc from the page's meta tags. Values are keyed
   * by property name, so the order of the tags in the page does not matter.
   *
   * @param string $url Video page URL.
   * @return array|false Array with img, title, url, swf; false when the page cannot be
   *                     fetched or contains none of the expected meta tags.
   */
  private static function _parseSohu($url){
      $html = self::_fget($url);
      if (!$html) return false;

      // Keep the raw bytes when the conversion fails (e.g. the page is not GB2312).
      $converted = iconv("GB2312", "UTF-8", $html);
      if ($converted !== false) $html = $converted;

      if (!preg_match_all('/og:(title|image|videosrc)"\scontent="([^"]+)"/s', $html, $hits, PREG_SET_ORDER)) {
          return false;
      }

      $meta = array();
      foreach ($hits as $hit) {
          // The first occurrence of each property wins.
          if (!isset($meta[$hit[1]])) $meta[$hit[1]] = $hit[2];
      }

      $data = array();
      $data['img'] = isset($meta['image']) ? $meta['image'] : null;
      $data['title'] = isset($meta['title']) ? $meta['title'] : null;
      $data['url'] = $url;
      $data['swf'] = isset($meta['videosrc']) ? $meta['videosrc'] : null;
      return $data;
  }

  /**
   * Sina video (video.sina.com.cn).
   *
   * Sample page URL:
   *   http://video.sina.com.cn/v/b/48717043-1290055681.html
   * Sample player URL:
   *   http://you.video.sina.com.cn/api/sinawebApi/outplayrefer.php/vid=48717043_1290055681_PUzkSndrDzXK+l1lHz2stqkP7KQNt6nki2O0u1ehIwZYQ0/XM5GdatoG5ynSA9kEqDhAQJA4dPkm0x4/s.swf
   *
   * NOTE(review): the regex escapes and back-references below were reconstructed from a
   * copy whose backslashes had been stripped; confirm them against a live page.
   *
   * @param string $url Video page URL; the two numeric ids may be joined by "-" or "_".
   * @return array|false Array with img, title, url (the normalised page URL), swf;
   *                     false when the ids or the page's video object cannot be found.
   */
  private static function _parseSina($url){
      if (!preg_match('/(\d+)(?:-|_)(\d+)/', $url, $matches)) return false;

      // Normalise to the canonical page address before fetching.
      $url = "http://video.sina.com.cn/v/b/{$matches[1]}-{$matches[2]}.html";

      $html = self::_fget($url);
      if (!$html || !preg_match('/video\s?:\s?([^<]+)}/', $html, $matches)) return false;

      // The video object is a JavaScript literal, not JSON: strip whitespace, switch to
      // double quotes, quote the keys and blank out unquoted non-numeric values.
      $find = array("/\n/", '/\s*/', "/'/", '/{([^:,]+):/', '/,([^:]+):/', '/:[^\d"]\w+[^,]*,/i');
      $replace = array('', '', '"', '{"\1":', ',"\1":', ':"",');
      $arr = json_decode(preg_replace($find, $replace, $matches[1]), true);
      if (!is_array($arr)) $arr = array();

      $data = array();
      $data['img'] = isset($arr['pic']) ? $arr['pic'] : null;
      $data['title'] = isset($arr['title']) ? $arr['title'] : null;
      $data['url'] = $url;
      $data['swf'] = isset($arr['swfOutsideUrl']) ? $arr['swfOutsideUrl'] : null;
      return $data;
  }

  /**
   * Fetch a URL with file_get_contents().
   *
   * @param string $url Address to fetch.
   * @return string|false Response body, gunzipped when it is a gzip stream; false when
   *                      $url is empty or the request fails.
   */
  private static function _fget($url=''){
      if (!$url) return false;

      $html = file_get_contents($url);
      // Stop here on failure instead of handing false to the gzip probe.
      if ($html === false) return false;

      // Use the decoded body when the response turns out to be gzip-compressed.
      $dehtml = self::_gzdecode($html);
      return $dehtml ? $dehtml : $html;
  }

  /**
   * Fetch a path from a host over a raw socket (port 80) and return the complete
   * response, status line and headers included.
   *
   * @param string $path       Request path.
   * @param string $host       Host name to connect to.
   * @param string $user_agent User-Agent value; defaults to self::USER_AGENT.
   * @return string|false Raw HTTP response; false on empty arguments or on a
   *                      connection or write failure.
   */
  private static function _fsget($path='/', $host='', $user_agent=''){
      if (!$path || !$host) return false;

      $user_agent = $user_agent ? $user_agent : self::USER_AGENT;
      // A line break inside a header value would inject or truncate headers.
      $user_agent = preg_replace('/\s*[\r\n]+\s*/', ' ', $user_agent);

      // Header lines must end in CRLF. "Connection: close" makes the server end the
      // stream after the response, so the feof() loop below can terminate.
      $out = implode("\r\n", array(
          "GET {$path} HTTP/1.1",
          "Host: {$host}",
          "User-Agent: {$user_agent}",
          "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language: zh-cn,zh;q=0.5",
          "Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7",
          "Connection: close",
      )) . "\r\n\r\n";

      $fp = @fsockopen($host, 80, $errno, $errstr, 10);
      if (!$fp) return false;

      stream_set_timeout($fp, 10);

      if (!fwrite($fp, $out)) {
          fclose($fp);
          return false;
      }

      $html = '';
      while (!feof($fp)) {
          $chunk = fgets($fp, 1024);
          if ($chunk === false) break;
          $html .= $chunk;
      }
      fclose($fp);

      // No gzip probe: the response starts with the HTTP status line, so it can
      // never begin with the gzip magic bytes.
      return $html;
  }

  /**
   * Fetch a URL with cURL.
   *
   * @param string $url        Address to fetch.
   * @param string $user_agent User-Agent value; defaults to self::USER_AGENT.
   * @return string|false Response body; false when $url is empty, the transfer fails
   *                      or the body is empty.
   */
  private static function _cget($url='', $user_agent=''){
      if (!$url) return false;

      $user_agent = $user_agent ? $user_agent : self::USER_AGENT;

      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, 0);
      // Return the body from curl_exec() instead of capturing printed output.
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      // An empty string lets cURL negotiate and decode compressed responses itself.
      curl_setopt($ch, CURLOPT_ENCODING, '');
      curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
      curl_setopt($ch, CURLOPT_TIMEOUT, 30);
      if (strlen($user_agent)) curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);

      $html = curl_exec($ch);
      $failed = curl_errno($ch) !== 0;
      curl_close($ch);

      if ($failed || !is_string($html) || !strlen($html)) {
          return false;
      }

      return $html;
  }

  /**
   * Decode a gzip (RFC 1952) member held in a string.
   *
   * Header layout: 10 fixed bytes (magic, method, flags, mtime, xfl, os), followed by
   * the optional EXTRA, NAME, COMMENT and header-CRC fields in that order, then the
   * deflate body, then an 8-byte footer (CRC32 and uncompressed size).
   *
   * @param string $data Bytes that may or may not be gzip-compressed.
   * @return string|null|false The inflated data; null when $data is not gzip (too short,
   *                           wrong magic, reserved flag bits set, empty body); false
   *                           when it looks like gzip but is malformed, uses an unknown
   *                           compression method, or fails the length/CRC check.
   */
  private static function _gzdecode($data) {
      if (!is_string($data)) {
          return null;
      }

      $len = strlen($data);
      if ($len < 18 || strncmp($data, "\x1f\x8b", 2) !== 0) {
          return null; // Not GZIP format (see RFC 1952)
      }

      $method = ord($data[2]); // Compression method
      $flags = ord($data[3]);  // Flags

      // Parenthesised on purpose: "!=" binds tighter than "&".
      if (($flags & 31) != $flags) {
          // Reserved bits are set -- not allowed by RFC 1952
          return null;
      }

      $headerlen = 10;

      if ($flags & 4) {
          // EXTRA: 2-byte little-endian length, then that many bytes
          if ($len - $headerlen - 2 < 8) {
              return false;
          }
          $extralen = unpack("v", substr($data, $headerlen, 2));
          $extralen = $extralen[1];
          if ($len - $headerlen - 2 - $extralen < 8) {
              return false;
          }
          $headerlen += 2 + $extralen;
      }

      if ($flags & 8) {
          // NAME: zero-terminated string starting at the current header offset
          if ($len - $headerlen - 1 < 8) {
              return false;
          }
          $end = strpos($data, "\0", $headerlen);
          if ($end === false || $len - $end - 1 < 8) {
              return false;
          }
          $headerlen = $end + 1;
      }

      if ($flags & 16) {
          // COMMENT: zero-terminated string starting at the current header offset
          if ($len - $headerlen - 1 < 8) {
              return false;
          }
          $end = strpos($data, "\0", $headerlen);
          if ($end === false || $len - $end - 1 < 8) {
              return false;
          }
          $headerlen = $end + 1;
      }

      if ($flags & 2) {
          // FHCRC (bit 1): low 16 bits of the CRC32 over the header read so far
          if ($len - $headerlen - 2 < 8) {
              return false;
          }
          $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
          $headercrc = unpack("v", substr($data, $headerlen, 2));
          if ($headercrc[1] != $calccrc) {
              return false; // Bad header CRC
          }
          $headerlen += 2;
      }

      // Footer: CRC32 of the uncompressed data, then its length
      $datacrc = unpack("V", substr($data, -8, 4));
      $datacrc = $datacrc[1];
      $isize = unpack("V", substr($data, -4));
      $isize = $isize[1];

      $bodylen = $len - $headerlen - 8;
      if ($bodylen < 1) {
          return null;
      }

      if ($method !== 8) {
          // Deflate (8) is the only supported compression method
          return false;
      }

      // Callers probe arbitrary bodies, so suppress the warning and report false instead.
      $inflated = @gzinflate(substr($data, $headerlen, $bodylen));
      if ($inflated === false) {
          return false;
      }

      if ($isize != strlen($inflated) || crc32($inflated) != $datacrc) {
          // Length or CRC does not match
          return false;
      }

      return $inflated;
  }

  }
分享到:
本文"PHP抓取、分析国内视频网站的视频信息工具类"由远航站长收集整理而来,仅供大家学习与参考使用。更多网站制作教程尽在远航站长站。
顶一下
(0)
0%
踩一下
(0)
0%
[点击 次] [返回上一页] [打印]
发表评论
请自觉遵守互联网相关的政策法规,严禁发布色情、暴力、反动的言论。
评价:
表情:
用户名: 密码: 验证码:
关于本站 - 联系我们 - 网站声明 - 友情连接- 网站地图 - 站点地图 - 返回顶部
Copyright © 2007-2013 www.yhzhan.com(远航站长). All Rights Reserved .
远航站长:为中小站长提供最佳的学习与交流平台,提供网页制作与网站编程等各类网站制作教程.
官方QQ:445490277 网站群:26680406 网站备案号:豫ICP备07500620号-4