<?php
/**
 * Crawls 36kr.com article pages over a range of article ids, extracts the
 * title and main content of each page with regular expressions, prints one
 * line per extracted article and, when a DSN was supplied, stores the result
 * in the `36kr` table through PDO.
 */
class Fork36kr
{
    /** @var int First article id to fetch (inclusive). */
    private $start;

    /** @var int Last article id to fetch (inclusive). */
    private $end;

    /** @var int Number of articles successfully extracted so far. */
    private $number = 0;

    /** @var string PDO data source name; an empty string disables persistence. */
    private $dsn;

    /** @var string Database user name. */
    private $user;

    /** @var string Database password. */
    private $password;

    /** @var PDO|null Database connection; only created when a DSN is given. */
    private $pdo;

    /**
     * @param int    $start    Article id at which crawling starts.
     * @param int    $end      Article id at which crawling ends (inclusive).
     * @param string $dsn      PDO data source name; leave empty to skip saving.
     * @param string $user     Database user name.
     * @param string $password Database password.
     */
    public function __construct($start=200100, $end=206670,$dsn='',$user='',$password='')
    {
        $this->start = $start;
        $this->end = $end;
        $this->dsn = $dsn;
        $this->user = $user;
        $this->password = $password;
        if ($dsn) {
            $this->pdo = new PDO($this->dsn, $this->user, $this->password);
        }
    }

    /**
     * Fetches every article page in the configured id range and processes
     * the ones that respond with HTTP 200 and contain both a title and a
     * content block. Progress is echoed to standard output.
     *
     * @return void
     */
    public function fork()
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        echo "<<<<< OK. Start Fork 36kr >>>>>\n";
        for ($i = $this->start; $i <= $this->end; $i++) {
            $url = "http://www.36kr.com/p/".$i.".html";
            curl_setopt($ch, CURLOPT_URL, $url);
            $page = curl_exec($ch);

            // A failed transfer returns false; it can still report status 200
            // when the connection breaks mid-body, so check both.
            if ($page === false || (int) curl_getinfo($ch, CURLINFO_HTTP_CODE) !== 200) {
                continue;
            }

            $article = $this->extractArticle($page);
            if ($article === null) {
                // Nothing usable on this page: do not count it and, crucially,
                // do not write a row built from unmatched regex results.
                continue;
            }

            list($title, $content) = $article;
            echo $url.','.$title."\n";
            $this->number++;

            if ($this->pdo && !$this->save($title, $content, $url)) {
                error_log('Fork36kr: failed to save '.$url);
            }
        }
        echo '<<<< Fork Over! Total: '.$this->number.' >>>>';
    }

    /**
     * Pulls the plain-text title and main content out of an article page.
     *
     * @param string $page Raw HTML of the article page.
     * @return array|null  array($title, $content) with tags stripped, or null
     *                     when either part is missing from the page.
     */
    private function extractArticle($page)
    {
        // Lazy quantifier so a second </h1> on the same line is not swallowed.
        $hasTitle = preg_match('#<h1 class="entry-title sep10">.*?</h1>#', $page, $title);
        $hasContent = preg_match('#<p class="mainContent sep-10">.*</p>#Us', $page, $content);
        if (!$hasTitle || !$hasContent) {
            return null;
        }
        return array(strip_tags($title[0]), strip_tags($content[0]));
    }

    /**
     * Inserts one extracted article into the `36kr` table.
     *
     * @param string $title   Article title (plain text).
     * @param string $content Article body (plain text).
     * @param string $url     URL the article was fetched from.
     * @return bool           Result of PDOStatement::execute(); false as well
     *                        when the statement could not be prepared.
     */
    private function save($title,$content,$url)
    {
        $sql = "INSERT INTO `36kr` (`id`,`title`,`content`,`url`) VALUES (null,:title,:content,:url)";
        $stmt = $this->pdo->prepare($sql);
        if ($stmt === false) {
            return false;
        }
        $stmt->bindValue(':title', $title);
        $stmt->bindValue(':content', $content);
        $stmt->bindValue(':url', $url);
        return $stmt->execute();
    }
}
// Driver: crawl article ids 200100-206670 and store the results in the local
// `test` database. charset=utf8 keeps the connection encoding in line with
// the utf8 `36kr` table so non-ASCII article text is stored intact.
// NOTE(review): the credentials below are hard-coded sample values; replace
// them before using this anywhere other than a local test setup.
$dsn = 'mysql:host=localhost;dbname=test;charset=utf8';
$user = 'root';
$password = 'root';
$kr = new Fork36kr(200100, 206670, $dsn, $user, $password);
$kr->fork();
// [File] phpcn.sql
-- phpMyAdmin SQL Dump
-- version 4.0.5
-- http://www.php.cn/
--
-- Host: localhost
-- Generation Time: 2013-10-03 00:36
-- Server version: 5.6.12-log
-- PHP Version: 5.5.3

SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";

/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;

--
-- Database: `test`
--

-- --------------------------------------------------------

--
-- Table structure for table `36kr`
--

CREATE TABLE IF NOT EXISTS `36kr` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(128) NOT NULL,
  `content` text NOT NULL,
  `url` varchar(128) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ;

/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
我愿意把本文归入我的“编程糗事”系列。尽管在正规大学课程中,接触到软件工程、企业级软件架构和数据库设计,但我还是时不时地体会到下述事实带给我的“罪恶”感,当然,都是我的主观感受,并且面向Eclipse: 你是PHP菜鸟,如果你: 1. 不会利用如phpDoc这样的工具来恰当地注释你的代码 2. 对优秀的集成开发环境如Zend Studio或Eclipse PDT视而不见 3
379
立即学习“PHP免费学习笔记(深入)”;
PHP怎么学习?PHP怎么入门?PHP在哪学?PHP怎么学才快?不用担心,这里为大家提供了PHP速学教程(入门到精通),有需要的小伙伴保存下载就能学习啦!
Copyright 2014-2025 https://www.php.cn/ All Rights Reserved | php.cn | 湘ICP备2023035733号