WxCrawler.php 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. <?php
  2. namespace extend;
  3. /**
  4. * 微信公众号⽂章爬取类
  5. */
  6. class WxCrawler
  7. {
  8. //微信内容div正则
  9. private $wxContentDiv = "/id=\"js_content\" style=\"visibility: hidden;\">(.*)<\/div>/iUs";
  10. //微信图⽚样式
  11. private $imageStyle = 'style="max-width: 100%;height:auto"';
  12. /**
  13. * 爬取内容
  14. * @param $url
  15. * @return false|string
  16. * @author bignerd
  17. * @since 2016-08-16T10:13:58+0800
  18. */
  19. private function _get($url)
  20. {
  21. return file_get_contents($url);
  22. }
  23. public function crawByUrl($url)
  24. {
  25. $content = $this->_get($url);
  26. if(empty($content)){
  27. return error(-1, '⽂章不存在');
  28. }
  29. $basicInfo = $this->articleBasicInfo($content);
  30. $content_result = $this->contentHandle($content);
  31. if(!empty($content_result['code']) && $content_result['code'] < 0){
  32. return $content_result;
  33. }
  34. list($content_html, $content_text) = $content_result;
  35. return success(0,'',array_merge($basicInfo, ['content_html' => $content_html, 'content_text' => $content_text]));
  36. }
  37. /**
  38. * 处理微信⽂章源码,提取⽂章主体,处理图⽚链接
  39. * @author bignerd
  40. * @since 2016-08-16T15:59:27+0800
  41. * @param $content 抓取的微信⽂章源码
  42. * @return [带图html⽂本,⽆图html⽂本]
  43. */
  44. private function contentHandle($content)
  45. {
  46. $content_html_pattern = $this->wxContentDiv;
  47. preg_match_all($content_html_pattern, $content, $html_matchs);
  48. if (empty(array_filter($html_matchs))) {
  49. return error(-1, '⽂章不存在');
  50. }
  51. $content_html = $html_matchs[1][0];
  52. // $content_html = "<div id='js_content'>".$content_html;
  53. $content_html = "<style>img{max-width:100% !important;height:auto !important}</style>".$content_html;
  54. $content_html = str_replace("preview.html","player.html",$content_html);
  55. //去除掉hidden隐藏
  56. // $content_html = str_replace('style="visibility: hidden;"', '', $content_html);
  57. //过滤掉iframe
  58. // $content_html = preg_replace('/<iframe(.*?)<\/iframe>/', '', $content_html);
  59. // $content_html = preg_replace('/<iframe(.*?)<\/iframe>/', '', $content_html);
  60. $path = 'article/';
  61. /** @var 带图⽚html⽂本 */
  62. $content_html = preg_replace_callback('/data-src="(.*?)"/', function ($matches) use ($path) {
  63. return 'src="' . img($this->getImg($matches[1])) . '" ' ;
  64. }, $content_html);
  65. //添加微信样式
  66. // $content_html = '<div style="max-width: 677px;margin-left: auto;margin-right: auto;">' . $content_html . '</div>';
  67. /** @var ⽆图html⽂本 */
  68. $content_text = preg_replace('/<img.*?>/s', '', $content_html);
  69. return [$content_html, $content_text];
  70. }
  71. /**
  72. * 获取⽂章的基本信息
  73. * @author bignerd
  74. * @since 2016-08-16T17:16:32+0800
  75. * @param $content ⽂章详情源码
  76. * @return $basicInfo
  77. */
  78. private function articleBasicInfo($content)
  79. {
  80. //待获取item
  81. $item = [
  82. 'ct' => 'date',//发布时间
  83. 'msg_title' => 'title',//标题
  84. 'msg_desc' => 'digest',//描述
  85. 'msg_link' => 'content_url',//⽂章链接
  86. 'msg_cdn_url' => 'cover',//封⾯图⽚链接
  87. 'nickname' => 'wechatname',//公众号名称
  88. ];
  89. $basicInfo = [
  90. 'author' => '',
  91. 'copyright_stat' => '',
  92. ];
  93. foreach ($item as $k => $v) {
  94. if ($k == 'msg_title')
  95. $pattern = '/var ' . $k . ' = \'(.*?)\'\.html\(false\);/s';
  96. else
  97. $pattern = '/var ' . $k . ' = "\'(.*?)\'";/s';
  98. preg_match_all($pattern, $content, $matches);
  99. if (array_key_exists(1, $matches) && !empty($matches[1][0])) {
  100. $basicInfo[$v] = trim($this->htmlTransform($matches[1][0]));
  101. } else {
  102. $basicInfo[$v] = '';
  103. }
  104. }
  105. // // 获取作者
  106. // preg_match('/<em class="rich_media_meta rich_media_meta_text">(.*?)<\/em>/s', $content, $matchAuthor);
  107. // if(!empty($matchAuthor[1])) $basicInfo['author'] = $matchAuthor[1];
  108. // // ⽂章类型
  109. // preg_match('/<span id="copyright_logo" class="rich_media_meta meta_original_tag">(.*?)<\/span>/s', $content, $matchType);
  110. // if(!empty($matchType[1])) $basicInfo['copyright_stat'] = $matchType[1];
  111. return $basicInfo;
  112. }
  113. /**
  114. * 特殊字符转换
  115. * @author bignerd
  116. * @since 2016-08-16T17:30:52+0800
  117. * @param $string
  118. * @return $string
  119. */
  120. private function htmlTransform($string)
  121. {
  122. $string = str_replace('&quot;', '"', $string);
  123. $string = str_replace('&amp;', '&', $string);
  124. $string = str_replace('amp;', '', $string);
  125. $string = str_replace('&lt;', '<', $string);
  126. $string = str_replace('&gt;', '>', $string);
  127. $string = str_replace('&nbsp;', ' ', $string);
  128. $string = str_replace("\\", '', $string);
  129. return $string;
  130. }
  131. /**
  132. * @param $url
  133. * @return string
  134. */
  135. private function getImg($url)
  136. {
  137. $upload_model = new \app\model\upload\Upload();
  138. $path = 'common/article/' . date('Ymd') . '/';
  139. $result = $upload_model->setPath($path)->remotePull($url);
  140. return $result['data']['pic_path'] ?? '';
  141. // $refer = "http://www.qq.com/";
  142. // $opt = [
  143. // 'http' => [
  144. // 'header' => "Referer: " . $refer
  145. // ]
  146. // ];
  147. // $context = stream_context_create($opt);
  148. // //接受数据流
  149. // $file_contents = file_get_contents($url, false, $context);
  150. // $imageSteam = Imagecreatefromstring($file_contents);
  151. // $path = __UPLOAD__.'article/';
  152. // if (!file_exists($path))
  153. // mkdir($path, 0777, true);
  154. // $fileName = time() . rand(0, 99999) . '.jpg';
  155. // //⽣成新图⽚
  156. // imagejpeg($imageSteam, $path . $fileName);
  157. // return $fileName;
  158. }
  159. }