c语言编程笔录

首页 > 笔记大全

笔记大全

使用PHP和MySQL实现一个新闻爬虫

更新时间:2023-10-23

第一段

在开始实现新闻爬虫之前,我们先明确完成它需要用到哪些技术。一般来说,一个爬虫需要以下几个模块:

  • URL管理器
  • 网页下载器
  • 网页解析器
  • 数据存储器

URL管理器用于管理待爬取和已爬取的URL,网页下载器用于下载网页,网页解析器用于解析网页的结构,数据存储器用于存储解析后的数据。


/**
 * Keeps track of which URLs are waiting to be crawled and which have
 * already been handed out to the crawler.
 *
 * URLs are compared as exact strings; no normalisation (trailing slash,
 * case, fragments) is performed.
 */
class UrlManager {
  /** @var string[] FIFO queue of URLs that have not been handed out yet. */
  protected $newUrls = array();
  /** @var string[] URLs that getNewUrl() has already returned. */
  protected $visitedUrls = array();

  /**
   * Queues a URL unless it is already queued or was already handed out.
   *
   * @param string $url URL to enqueue.
   * @return void
   */
  public function addUrl($url) {
    // Strict comparison is required: with loose comparison two different
    // numeric-looking strings (e.g. '1000' and '1e3') are considered equal,
    // which would silently drop a distinct URL.
    if (in_array($url, $this->newUrls, true) || in_array($url, $this->visitedUrls, true)) {
      return;
    }
    $this->newUrls[] = $url;
  }

  /**
   * Removes the oldest queued URL, records it as visited and returns it.
   *
   * @return string|null The next URL, or null when the queue is empty.
   */
  public function getNewUrl() {
    if (count($this->newUrls) === 0) {
      return null;
    }
    $url = array_shift($this->newUrls);
    $this->visitedUrls[] = $url;
    return $url;
  }
}

/**
 * Downloads a page body over HTTP using cURL.
 */
class HtmlDownloader {
  /** @var int Seconds to wait for the TCP connection to be established. */
  protected $connectTimeout = 10;
  /** @var int Upper bound, in seconds, for the whole transfer. */
  protected $timeout = 30;

  /**
   * Fetches the body of the given URL.
   *
   * @param string $url Address to download.
   * @return string|false The response body, or false when the transfer failed.
   */
  public function download($url) {
    $curl = curl_init();
    if ($curl === false) {
      return false;
    }
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    // Without explicit timeouts a single stalled server blocks the crawl forever.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, $this->connectTimeout);
    curl_setopt($curl, CURLOPT_TIMEOUT, $this->timeout);
    $html = curl_exec($curl);
    curl_close($curl);
    return $html;
  }
}

/**
 * Extracts news items from an HTML page.
 *
 * The i-th <h2> element is paired with the i-th <div class="content">
 * element, in document order.
 */
class HtmlParser {
  /**
   * Parses a page into a list of news items.
   *
   * @param string|false $html Page markup; false or an empty string (for
   *                           example from a failed download) yields no items.
   * @return array[] List of array('title' => string, 'content' => string).
   */
  public function parse($html) {
    $news = array();
    // DOMDocument::loadHTML() rejects empty input, and a failed download
    // hands us false, so bail out before touching the DOM.
    if (!is_string($html) || trim($html) === '') {
      return $news;
    }

    // Collect libxml warnings internally instead of silencing everything with
    // "@"; real-world HTML is rarely well-formed. The previous setting is restored.
    $previousSetting = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // NOTE(review): without a charset declaration in the markup libxml may not
    // decode the page as UTF-8 -- confirm against the target site.
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);
    if (!$loaded) {
      return $news;
    }

    $xpath = new DOMXPath($dom);
    $titles = $xpath->query('//h2');
    $contents = $xpath->query('//div[@class="content"]');
    for ($i = 0; $i < $titles->length; $i++) {
      // A page may have more headings than content blocks; item() then
      // returns null, so fall back to an empty body rather than reading
      // a property of null.
      $contentNode = $contents->item($i);
      $news[] = array(
        'title' => $titles->item($i)->textContent,
        'content' => $contentNode !== null ? $contentNode->textContent : '',
      );
    }
    return $news;
  }
}

/**
 * Persists parsed news items into the `news` table of a MySQL database.
 */
class MysqlStorage {
  /** @var mysqli|false|null Open connection; null once close() has run. */
  protected $connection;

  /**
   * Opens the database connection.
   *
   * Terminates the script with the connection error when connecting fails
   * and mysqli is configured to return false rather than throw.
   *
   * @param string $host     Database host.
   * @param string $username Database user.
   * @param string $password Database password.
   * @param string $database Schema to select.
   */
  public function __construct($host, $username, $password, $database) {
    $this->connection = mysqli_connect($host, $username, $password, $database);
    if (!$this->connection) {
      die('Connect Error (' . mysqli_connect_errno() . ') ' . mysqli_connect_error());
    }
  }

  /**
   * Inserts every item as one row of `news`.
   *
   * Failures are reported as E_USER_WARNING and do not abort the batch, so
   * one bad row does not stop the crawl.
   *
   * @param array[] $news Items shaped like array('title' => ..., 'content' => ...).
   * @return void
   */
  public function store($news) {
    if (empty($news)) {
      return;
    }
    // A prepared statement keeps scraped (untrusted) text out of the SQL
    // string entirely and is parsed once for the whole batch.
    $statement = mysqli_prepare($this->connection, 'INSERT INTO news (title, content) VALUES (?, ?)');
    if ($statement === false) {
      trigger_error('Failed to prepare news insert: ' . mysqli_error($this->connection), E_USER_WARNING);
      return;
    }
    $title = '';
    $content = '';
    // Parameters are bound by reference: reassigning $title/$content below
    // changes what the next execute() sends.
    mysqli_stmt_bind_param($statement, 'ss', $title, $content);
    foreach ($news as $item) {
      $title = (string) $item['title'];
      $content = (string) $item['content'];
      if (!mysqli_stmt_execute($statement)) {
        trigger_error('Failed to store news item: ' . mysqli_stmt_error($statement), E_USER_WARNING);
      }
    }
    mysqli_stmt_close($statement);
  }

  /**
   * Closes the connection; calling it again is a no-op.
   *
   * @return void
   */
  public function close() {
    if ($this->connection) {
      mysqli_close($this->connection);
      $this->connection = null;
    }
  }
}

// Crawl driver: seed the queue, then download, parse and store each page
// until no URL is left.
$urlManager = new UrlManager();
$urlManager->addUrl('http://www.example.com/news/1');
$htmlDownloader = new HtmlDownloader();
$htmlParser = new HtmlParser();
$mysqlStorage = new MysqlStorage('localhost', 'root', 'password', 'news_db');

// getNewUrl() returns null once the queue is drained, which ends the loop.
while ($url = $urlManager->getNewUrl()) {
  $html = $htmlDownloader->download($url);
  // download() yields false when the transfer fails; handing that to the
  // parser would make DOMDocument::loadHTML() reject the empty input.
  if ($html === false || $html === '') {
    continue;
  }
  $news = $htmlParser->parse($html);
  $mysqlStorage->store($news);
}
$mysqlStorage->close();

第二段

针对新闻爬虫的实现,我们还需要考虑一些细节。例如,我们在解析时可能需要忽略某些内容或只解析某些特定类型的内容。此外,为了避免下载重复的内容,我们还需要设计一个去重的机制。另外,为了防止被网站屏蔽,我们需要合理地控制访问速度,并精简每次请求提交的内容。


/**
 * Refined parser fragment: drops <script>/<style> nodes and cleans up
 * titles and bodies before returning the news items.
 *
 * The "// ..." markers stand for code elided by the article.
 */
class HtmlParser {
  // ...
  /**
   * Parses a page into a list of news items.
   *
   * @param string $html Page markup.
   * @return array[] List of array('title' => string, 'content' => string).
   */
  public function parse($html) {
    // ...
    // NOTE(review): the elided code above is expected to define $dom and
    // $titles (presumably a DOMDocument and the title node list) -- confirm.
    $tagsToIgnore = array('script', 'style');
    foreach ($tagsToIgnore as $tagName) {
      $tags = $dom->getElementsByTagName($tagName);
      // Walk backwards: the node list is live, so removing from the front
      // would shift the remaining indexes.
      for ($i = $tags->length - 1; $i >= 0; $i--) {
        $node = $tags->item($i);
        $node->parentNode->removeChild($node);
      }
    }
    $news = array();
    $titlePrefix = '新闻标题:';
    $titlePostfix = ' - 新华网';
    foreach ($titles as $title) {
      // Strip the site suffix first, then the prefix (same order as before).
      $titleText = trim(str_replace(array($titlePostfix, $titlePrefix), '', $title->textContent));
      if ($titleText === '') {
        continue;
      }
      // The heading's parent may be the last child of its own parent, in
      // which case there is no sibling to read the body from.
      $sibling = $title->parentNode !== null ? $title->parentNode->nextSibling : null;
      $content = $sibling !== null ? $sibling->textContent : '';
      if (strpos($content, '原标题') === 0) {
        // Drop the leading "原标题" line; when the body has no line break
        // there is nothing to cut, so it is only trimmed.
        $newlineAt = strpos($content, "\n");
        $content = trim($newlineAt === false ? $content : substr($content, $newlineAt));
      }
      $news[] = array('title' => $titleText, 'content' => $content);
    }
    return $news;
  }
}

/**
 * Extended URL manager fragment: restricts the crawl to one host, throttles
 * requests and sends a browser-like User-Agent.
 *
 * The "// ..." marker stands for code elided by the article.
 */
class UrlManager {
  // ...
  /** @var string The only host the crawler is allowed to fetch from. */
  protected $hostname = 'www.example.com';
  /** @var int Seconds to pause after each request. */
  protected $visitingDelay = 1;
  /** @var string User-Agent header sent with every request. */
  protected $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36';
  /** @var int Seconds to wait for the TCP connection to be established. */
  protected $connectTimeout = 10;
  /** @var int Upper bound, in seconds, for the whole transfer. */
  protected $timeout = 30;

  /**
   * Tells whether a link points at the configured host.
   *
   * The host component is compared, not the whole string, so a foreign URL
   * that merely mentions the hostname in its path or query is rejected.
   *
   * @param string $link URL to check.
   * @return bool
   */
  protected function isInternalLink($link) {
    $host = parse_url($link, PHP_URL_HOST);
    if (!is_string($host)) {
      // Scheme-less links such as "www.example.com/news" parse as a bare
      // path; retry with a scheme so they keep being accepted.
      $host = parse_url('http://' . $link, PHP_URL_HOST);
    }
    return is_string($host) && strcasecmp($host, $this->hostname) === 0;
  }

  /**
   * Pauses between requests to avoid hammering the site.
   *
   * @return void
   */
  protected function delay() {
    sleep($this->visitingDelay);
  }

  /**
   * Downloads one internal page, stores its news and queues its links.
   *
   * NOTE(review): parse(), store() and extractUrls() are not defined in the
   * visible part of this class; presumably they are part of the elided code.
   *
   * @param string $url Page to visit; external URLs are ignored.
   * @return void
   */
  public function visit($url) {
    if (!$this->isInternalLink($url)) {
      return;
    }
    $html = $this->download($url);
    if ($html === false) {
      // A request was still made, so keep the politeness delay.
      $this->delay();
      return;
    }
    $news = $this->parse($html);
    $this->store($news);
    $this->addUrls($this->extractUrls($html));
    $this->delay();
  }

  /**
   * Queues several URLs at once.
   *
   * @param string[] $urls URLs to enqueue.
   * @return void
   */
  public function addUrls($urls) {
    foreach ($urls as $url) {
      $this->addUrl($url);
    }
  }

  /**
   * Fetches a page body.
   *
   * @param string $url Address to download.
   * @return string|false The body, or false on a transfer error or any
   *                      final status other than 200.
   */
  public function download($url) {
    $curl = curl_init();
    if ($curl === false) {
      return false;
    }
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_USERAGENT, $this->userAgent);
    // Without explicit timeouts a single stalled server blocks the crawl forever.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, $this->connectTimeout);
    curl_setopt($curl, CURLOPT_TIMEOUT, $this->timeout);
    $html = curl_exec($curl);
    $httpCode = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);
    if ($html === false || $httpCode !== 200) {
      return false;
    }
    return $html;
  }
}

// Seed the queue, then keep visiting until it is drained. visit() queues
// the links it finds; taking URLs through getNewUrl() marks each one as
// visited so no page is fetched twice and the discovered links are crawled.
$urlManager = new UrlManager();
$urlManager->addUrl('http://www.example.com/');
while (($url = $urlManager->getNewUrl()) !== null) {
  $urlManager->visit($url);
}

第三段

除了常规的爬虫功能外,我们还可以使用一些高级技术来提高爬虫的效率和可靠性。例如,在爬取高频率更新的网站时,我们可以使用异步IO技术进行更快的下载和解析。另外,对于有些网站,我们需要对爬虫进行身份认证或使用代理。此外,我们还可以通过设置用户代理(User-Agent)来标识请求来源,通过网络代理来保护我们的IP地址,并且限制每个爬虫的访问速度和频率。


/**
 * Downloads several pages concurrently through the cURL multi interface.
 */
class AsyncHtmlDownloader {
  /**
   * Fetches all URLs in parallel and reports each result to the callback.
   *
   * The callback runs once per distinct URL, after every transfer has
   * finished, as $callback($url, $html). $html is whatever
   * curl_multi_getcontent() returns for that handle.
   *
   * @param string[] $urls     Addresses to download.
   * @param callable $callback Receives ($url, $html) for each URL.
   * @return void
   */
  public function download($urls, $callback) {
    $handles = array();
    $mh = curl_multi_init();
    foreach ($urls as $url) {
      // Results are keyed by URL; a second handle for the same URL would
      // overwrite the first one and leave it attached and never released.
      if (isset($handles[$url])) {
        continue;
      }
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, false);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      $handles[$url] = $ch;
      curl_multi_add_handle($mh, $ch);
    }

    do {
      $status = curl_multi_exec($mh, $active);
      // Block until there is socket activity instead of spinning on
      // curl_multi_exec(), which pins a CPU core for the whole download.
      if ($status === CURLM_OK && $active && curl_multi_select($mh, 1.0) === -1) {
        // select() can fail immediately on some platforms; back off briefly.
        usleep(1000);
      }
      // Stop on an error status as well, otherwise a failing multi handle
      // with active transfers would loop forever.
    } while ($status === CURLM_CALL_MULTI_PERFORM || ($active && $status === CURLM_OK));

    foreach ($handles as $url => $ch) {
      $html = curl_multi_getcontent($ch);
      curl_multi_remove_handle($mh, $ch);
      // Release the easy handle; only the multi handle was closed before.
      curl_close($ch);
      call_user_func($callback, $url, $html);
    }
    curl_multi_close($mh);
  }
}

// Download the front pages concurrently, then parse and store each one.
$urls = array('http://news.sina.com.cn/', 'http://www.xinhuanet.com/', 'http://www.news.cn/',);
$asyncDownloader = new AsyncHtmlDownloader();
// PHP closures do not see the enclosing scope: $htmlParser and $mysqlStorage
// must be imported with "use", otherwise both are undefined inside the callback.
// They have to exist (and the storage must still be open) at this point.
$asyncDownloader->download($urls, function ($url, $html) use ($htmlParser, $mysqlStorage) {
  // A failed transfer yields no body; there is nothing to parse then.
  if (!is_string($html) || $html === '') {
    return;
  }
  $news = $htmlParser->parse($html);
  $mysqlStorage->store($news);
});

第四段

最后,为了保证爬虫的质量和稳定性,我们需要仔细检查和测试代码,并应对网站的反爬虫策略。针对反爬虫策略,我们可以采取一些措施来规避或减轻,例如改变访问频率,使用IP代理,进行字体反爬等。不过,我们需要注意,规避反爬机制是一项需要谨慎对待的工作,应该遵守道德和法律准则,并且严格限制使用范围,否则可能会面临被网站封禁或法律诉讼等风险。


/**
 * Downloader wrapper that post-processes pages protected by font-based
 * anti-crawling measures.
 *
 * NOTE(review): this listing appears to have been damaged when the article
 * was extracted (HTML-looking fragments inside string literals are missing).
 * As written it does not parse: the string literal opened in decodeHtml()
 * is never closed on its own line. The code is left untouched; the notes
 * below only describe what is visible.
 *
 * Methods after decodeHtml() are summarised here rather than inline, because
 * the unterminated literal swallows everything that follows it:
 *
 * - decodeFontLine($line): removes all whitespace from $line, reads the rest
 *   two characters at a time as a hexadecimal value, and appends either the
 *   $map entry for that value or chr() of it. The $map entries are elided
 *   in the listing. An odd-length input reads one offset past the end.
 * - getFontMap($url, $fontUrl, $fontPath): downloads $url into $html (not
 *   used afterwards), caches the font from $fontUrl at the hard-coded path
 *   '/path/to/font/file', takes the part of that file between 'CMap' and
 *   'ToUnicode', collects every 2-3 digit number found in the file at
 *   $fontPath, and for each number that matches inside the CMap part stores
 *   hexdec(number) => decodeFontLine(captured text). Nothing visible calls
 *   this method. $htmlDownloader is not defined in its scope.
 */
class HtmlAntiCrawler {
  /**
   * Downloads $url and runs the body through obfuscateHtml() and then
   * decodeHtml().
   *
   * NOTE(review): $htmlDownloader is not defined inside this method (it is
   * neither a parameter nor a property) -- presumably it was meant to be
   * injected; confirm.
   */
  public function download($url) {
    $html = $this->obfuscateHtml($htmlDownloader->download($url));
    $html = $this->decodeHtml($html);
    return $html;
  }

  /**
   * Replaces every match of $pattern in $html with the result of
   * $this->encode(), walking the matches from last to first.
   *
   * NOTE(review): the pattern has a single capture group, yet $matches[2]
   * is read; the opening part of the pattern looks lost. encode() is not
   * defined in the visible class. strpos() locates the first occurrence of
   * the matched text, which is not necessarily the occurrence being
   * processed when the same text appears more than once.
   */
  protected function obfuscateHtml($html) {
    $pattern = '/(.*?)<\/span>/';
    preg_match_all($pattern, $html, $matches);
    for ($i = count($matches[0]) - 1; $i >= 0; $i--) {
      $encoded = $this->encode($matches[2][$i]);
      $html = substr_replace($html, $encoded, strpos($html, $matches[0][$i]), strlen($matches[0][$i]));
    }
    return $html;
  }

  /**
   * Replaces one matched span of $html with $script.
   *
   * NOTE(review): the first statement is truncated -- its literal opens with
   * a single quote and is not terminated on that line -- and neither $script
   * nor $matches is assigned anywhere visible. The original statements need
   * to be recovered from the article's source before this can run.
   */
  protected function decodeHtml($html) {
    $pattern = '/";
    $html = substr_replace($html, $script, strpos($html, $matches[0]), strlen($matches[0]));
    return $html;
  }

  // 根据字体映射表解码字体行
  protected function decodeFontLine($line) {
    $map = array(
      // ...
    );
    $text = preg_replace('/\s+/', '', $line);
    $decoded = '';
    for ($i = 0; $i < strlen($text); $i += 2) {
      $code = hexdec($text[$i] . $text[$i + 1]);
      $decoded .= isset($map[$code]) ? $map[$code] : chr($code);
    }
    return $decoded;
  }

  // 获取字体映射表(需要替换参数)
  protected function getFontMap($url, $fontUrl, $fontPath) {
    $html = $htmlDownloader->download($url);
    $fontFile = '/path/to/font/file';
    if (!file_exists($fontFile)) {
      $font = file_get_contents($fontUrl);
      file_put_contents($fontFile, $font);
    }
    $content = file_get_contents($fontFile);
    $offset = strpos($content, 'CMap');
    $length = strpos($content, 'ToUnicode') - $offset;
    $cmap = substr($content, $offset, $length);
    $string = file_get_contents($fontPath);
    preg_match_all('/\d{2,3}/', $string, $matches);
    $codes = $matches[0];
    $map = array();
    foreach ($codes as $code) {
      $pattern = '/' . $code . '\">(.+?)<\/span>/';
      preg_match($pattern, $cmap, $match);
      if (isset($match[1])) {
        $map[hexdec($code)] = $this->decodeFontLine($match[1]);
      }
    }
    return $map;
  }
}

// Usage example: fetch one page through the anti-crawling wrapper and keep
// the processed markup in $html.
$htmlAntiCrawler = new HtmlAntiCrawler;
$html = $htmlAntiCrawler->download(
  'http://www.example.com/'
);