Implementing a News Crawler with PHP and MySQL
Updated: 2023-10-23

Part 1
Before implementing the news crawler, we should be clear about which components it needs. In general, a crawler consists of the following modules:
- URL manager
- Page downloader
- Page parser
- Data store
The URL manager keeps track of the URLs still to be crawled and those already crawled, the page downloader fetches pages, the page parser extracts structured data from the HTML, and the data store saves the parsed results. The classes below implement these four modules in turn.
class UrlManager {
    protected $newUrls = array();
    protected $visitedUrls = array();

    public function addUrl($url) {
        if (!in_array($url, $this->newUrls) && !in_array($url, $this->visitedUrls)) {
            $this->newUrls[] = $url;
        }
    }

    public function getNewUrl() {
        if (count($this->newUrls) > 0) {
            $url = array_shift($this->newUrls);
            $this->visitedUrls[] = $url;
            return $url;
        }
        return null;
    }
}
class HtmlDownloader {
    public function download($url) {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, false);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        $html = curl_exec($curl);
        curl_close($curl);
        return $html;
    }
}
class HtmlParser {
    public function parse($html) {
        $dom = new DOMDocument();
        @$dom->loadHTML($html);
        $xpath = new DOMXPath($dom);
        $news = array();
        // Assumes every title is an <h2> and every body is a <div class="content">,
        // appearing in matching order on the page.
        $titles = $xpath->query('//h2');
        $contents = $xpath->query('//div[@class="content"]');
        for ($i = 0; $i < $titles->length; $i++) {
            $title = $titles->item($i)->textContent;
            $contentNode = $contents->item($i);
            $content = $contentNode !== null ? $contentNode->textContent : '';
            $news[] = array('title' => $title, 'content' => $content);
        }
        return $news;
    }
}
class MysqlStorage {
    protected $connection;

    public function __construct($host, $username, $password, $database) {
        $this->connection = mysqli_connect($host, $username, $password, $database);
        if (!$this->connection) {
            die('Connect Error (' . mysqli_connect_errno() . ') ' . mysqli_connect_error());
        }
    }

    public function store($news) {
        foreach ($news as $item) {
            $title = mysqli_real_escape_string($this->connection, $item['title']);
            $content = mysqli_real_escape_string($this->connection, $item['content']);
            $sql = "INSERT INTO news (title, content) VALUES ('$title', '$content')";
            mysqli_query($this->connection, $sql);
        }
    }

    public function close() {
        mysqli_close($this->connection);
    }
}
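Before running the crawler, the news table that store() writes to must already exist. The article does not show the schema, so the following is a minimal sketch; the title and content columns follow the INSERT above, while the id column, column types, and charset are assumptions you may want to adjust.
// One-off setup script (assumed schema); run once before starting the crawler.
$connection = mysqli_connect('localhost', 'root', 'password', 'news_db');
$sql = "CREATE TABLE IF NOT EXISTS news (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255) NOT NULL,
    content TEXT NOT NULL
) DEFAULT CHARSET=utf8mb4";
mysqli_query($connection, $sql);
mysqli_close($connection);
With the table in place, the four modules can be wired together: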
$urlManager = new UrlManager();
$urlManager->addUrl('http://www.example.com/news/1');
$htmlDownloader = new HtmlDownloader();
$htmlParser = new HtmlParser();
$mysqlStorage = new MysqlStorage('localhost', 'root', 'password', 'news_db');
while (true) {
    $url = $urlManager->getNewUrl();
    if (!$url) {
        break;
    }
    $html = $htmlDownloader->download($url);
    $news = $htmlParser->parse($html);
    $mysqlStorage->store($news);
$mysqlStorage->close();
Part 2
When implementing the news crawler there are further details to consider. For example, during parsing we may need to ignore certain content, or only extract specific kinds of content. To avoid downloading and storing the same content twice we also need a deduplication mechanism, and to avoid being blocked by the target site we should throttle the request rate and keep requests lean. A simple content-level deduplication sketch follows; after that, the parser and URL manager are refined.
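The modules above do not show content-level deduplication, so here is a minimal sketch. It assumes an md5 hash of the article body is stored in a content_hash column on the news table; both the column and the isDuplicate() helper are illustrative, not part of the original code.
// Minimal content deduplication sketch (assumed schema: a UNIQUE content_hash column on news).
class DedupFilter {
    protected $connection;

    public function __construct($connection) {
        $this->connection = $connection;
    }

    // Returns true if an article with the same content hash is already stored.
    public function isDuplicate($content) {
        $hash = mysqli_real_escape_string($this->connection, md5($content));
        $result = mysqli_query($this->connection, "SELECT 1 FROM news WHERE content_hash = '$hash' LIMIT 1");
        return $result && mysqli_num_rows($result) > 0;
    }
}
The store() method could then call isDuplicate() before inserting, and also write the hash into content_hash alongside the title and content.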
class HtmlParser {
    // ...
    public function parse($html) {
        // ... (same DOMDocument/XPath setup and //h2 title query as in the first version)
        // Remove script and style elements before extracting text.
        $tagsToIgnore = array('script', 'style');
        foreach ($tagsToIgnore as $tagName) {
            $tags = $dom->getElementsByTagName($tagName);
            for ($i = $tags->length - 1; $i >= 0; $i--) {
                $tags->item($i)->parentNode->removeChild($tags->item($i));
            }
        }
        $news = array();
        // Site-specific decorations to strip from the title text.
        $titlePrefix = '新闻标题:';
        $titlePostfix = ' - 新华网';
        foreach ($titles as $title) {
            $titleText = trim(str_replace($titlePrefix, '', str_replace($titlePostfix, '', $title->textContent)));
            if (strlen($titleText) > 0) {
                // Assumes the article body is the sibling node right after the title's parent.
                $content = $title->parentNode->nextSibling->textContent;
                // Drop the leading "原标题" ("original title") line if present.
                if (strpos($content, '原标题') === 0) {
                    $content = trim(substr($content, strpos($content, "\n")));
                }
                $news[] = array('title' => $titleText, 'content' => $content);
            }
        }
        return $news;
    }
}
class UrlManager {
    // ... (newUrls/visitedUrls properties, addUrl() and getNewUrl() as before)
    protected $hostname = 'www.example.com';
    protected $visitingDelay = 1; // seconds to wait between requests
    protected $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36';

    protected function isInternalLink($link) {
        return strpos($link, $this->hostname) !== false;
    }

    protected function delay() {
        sleep($this->visitingDelay);
    }

    // Crawl a single URL: download, parse, store, queue newly found links, then throttle.
    // parse() and store() are assumed to be provided elsewhere (for example by delegating
    // to the HtmlParser and MysqlStorage classes); extractUrls() is sketched after this class.
    public function visit($url) {
        if (!$this->isInternalLink($url)) {
            return;
        }
        $html = $this->download($url);
        if ($html === false) {
            return;
        }
        $news = $this->parse($html);
        $this->store($news);
        $this->addUrls($this->extractUrls($html));
        $this->delay();
    }

    public function addUrls($urls) {
        foreach ($urls as $url) {
            $this->addUrl($url);
        }
    }

    public function download($url) {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, false);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curl, CURLOPT_USERAGENT, $this->userAgent);
        $html = curl_exec($curl);
        $info = curl_getinfo($curl);
        curl_close($curl);
        if ($info['http_code'] != 200) {
            return false;
        }
        return $html;
    }
}
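The visit() method above calls an extractUrls() helper that the article never defines. As one hedged possibility, a helper added inside UrlManager could look like the following sketch; it only keeps absolute http/https links and ignores relative URLs.
    // Hypothetical helper for the UrlManager class above: collect absolute links
    // from <a href="..."> attributes so visit() can queue them.
    public function extractUrls($html) {
        $urls = array();
        $dom = new DOMDocument();
        @$dom->loadHTML($html);
        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $href = $anchor->getAttribute('href');
            if (strpos($href, 'http://') === 0 || strpos($href, 'https://') === 0) {
                $urls[] = $href;
            }
        }
        return $urls;
    }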
$urlManager = new UrlManager();
$urlManager->addUrl('http://www.example.com/');
$urlManager->visit('http://www.example.com/');
Part 3
Beyond the basic crawler features, we can use a few more advanced techniques to improve efficiency and reliability. For sites that update frequently, asynchronous IO lets us download several pages concurrently instead of one at a time. Some sites also require the crawler to authenticate or to go through a proxy, and setting a user agent or routing traffic through a proxy helps protect our own IP address, provided we still limit each crawler's request rate and frequency.
class AsyncHtmlDownloader {
    public function download($urls, $callback) {
        $handles = array();
        $mh = curl_multi_init();
        foreach ($urls as $url) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_HEADER, false);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            $handles[$url] = $ch;
            curl_multi_add_handle($mh, $ch);
        }
        // Run all transfers, waiting on the sockets instead of busy-looping.
        do {
            $status = curl_multi_exec($mh, $active);
            if ($active) {
                curl_multi_select($mh);
            }
        } while ($status === CURLM_CALL_MULTI_PERFORM || $active);
        foreach ($handles as $url => $ch) {
            $html = curl_multi_getcontent($ch);
            curl_multi_remove_handle($mh, $ch);
            call_user_func($callback, $url, $html);
        }
        curl_multi_close($mh);
    }
}
$urls = array('http://news.sina.com.cn/', 'http://www.xinhuanet.com/', 'http://www.news.cn/');
$asyncDownloader = new AsyncHtmlDownloader();
// Pass the parser and storage objects into the callback explicitly so they are in scope.
$asyncDownloader->download($urls, function ($url, $html) use ($htmlParser, $mysqlStorage) {
    $news = $htmlParser->parse($html);
    $mysqlStorage->store($news);
});
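The paragraph above also mentions authentication and proxies, which the async downloader does not show. A minimal sketch of the relevant cURL options follows; the URL, credentials, and proxy address are placeholders.
// Hypothetical example: HTTP basic auth plus an HTTP proxy for a single request.
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, 'http://www.example.com/protected/news');
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
curl_setopt($curl, CURLOPT_USERPWD, 'user:password');
curl_setopt($curl, CURLOPT_PROXY, '127.0.0.1:8080');
$html = curl_exec($curl);
curl_close($curl);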
Part 4
Finally, to keep the crawler reliable and stable, we need to review and test the code carefully and deal with the target site's anti-crawling measures. To work around or soften those measures we can, for example, vary the request frequency, use IP proxies, or decode font-based obfuscation. Note, however, that evading anti-crawling mechanisms must be approached with caution: follow ethical and legal guidelines and strictly limit how such techniques are used, otherwise you risk being banned by the site or even facing legal action.
class HtmlAntiCrawler {
    protected $htmlDownloader;

    public function __construct() {
        // Reuse the plain HtmlDownloader from earlier for the raw page fetch.
        $this->htmlDownloader = new HtmlDownloader();
    }

    public function download($url) {
        $html = $this->obfuscateHtml($this->htmlDownloader->download($url));
        $html = $this->decodeHtml($html);
        return $html;
    }

    // Encode the text inside <span class="...">...</span> elements; the span pattern's two
    // capture groups (class attribute, inner text) match how $matches is used below.
    protected function obfuscateHtml($html) {
        $pattern = '/<span class="([^"]*)">(.*?)<\/span>/s';
        preg_match_all($pattern, $html, $matches);
        for ($i = count($matches[0]) - 1; $i >= 0; $i--) {
            $encoded = $this->encode($matches[2][$i]);
            $html = substr_replace($html, $encoded, strpos($html, $matches[0][$i]), strlen($matches[0][$i]));
        }
        return $html;
    }

    // encode() is not shown in the article; a simple hex encoding is assumed here.
    protected function encode($text) {
        return bin2hex($text);
    }

    // Assumed minimal inverse of obfuscateHtml(): decode longer hex runs back into text.
    protected function decodeHtml($html) {
        return preg_replace_callback('/\b(?:[0-9a-f]{2}){4,}\b/', function ($matches) {
            return $this->decodeFontLine($matches[0]);
        }, $html);
    }
    // Decode one obfuscated line using the font mapping table.
    protected function decodeFontLine($line) {
        $map = array(
            // ... mapping from numeric codes to characters (built by getFontMap)
        );
        $text = preg_replace('/\s+/', '', $line);
        $decoded = '';
        for ($i = 0; $i < strlen($text); $i += 2) {
            $code = hexdec($text[$i] . $text[$i + 1]);
            $decoded .= isset($map[$code]) ? $map[$code] : chr($code);
        }
        return $decoded;
    }

    // Build the font mapping table (replace the URL, font URL, and paths with real values).
    protected function getFontMap($url, $fontUrl, $fontPath) {
        $html = $this->htmlDownloader->download($url);
        $fontFile = '/path/to/font/file';
        if (!file_exists($fontFile)) {
            $font = file_get_contents($fontUrl);
            file_put_contents($fontFile, $font);
        }
        $content = file_get_contents($fontFile);
        $offset = strpos($content, 'CMap');
        $length = strpos($content, 'ToUnicode') - $offset;
        $cmap = substr($content, $offset, $length);
        $string = file_get_contents($fontPath);
        preg_match_all('/\d{2,3}/', $string, $matches);
        $codes = $matches[0];
        $map = array();
        foreach ($codes as $code) {
            // Look up the glyph whose attribute value ends with this numeric code.
            $pattern = '/' . $code . '\">(.+?)<\/span>/';
            preg_match($pattern, $cmap, $match);
            if (isset($match[1])) {
                $map[hexdec($code)] = $this->decodeFontLine($match[1]);
            }
        }
        return $map;
    }
}
$htmlAntiCrawler = new HtmlAntiCrawler();
$html = $htmlAntiCrawler->download('http://www.example.com/');
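As for varying the request frequency and rotating IP proxies mentioned above, a minimal sketch might look like the following; the function name, the delay range, and the proxy addresses are placeholders, not part of the original code.
// Hypothetical throttling helper: wait a random interval and pick a proxy at random
// before each request.
function fetchWithThrottle($url, array $proxies) {
    sleep(rand(1, 5)); // random delay between requests
    $proxy = $proxies[array_rand($proxies)];
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_PROXY, $proxy);
    $html = curl_exec($curl);
    curl_close($curl);
    return $html;
}

$proxies = array('127.0.0.1:8080', '127.0.0.1:8081');
$html = fetchWithThrottle('http://www.example.com/news/1', $proxies);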