php 采集书并合成txt格式的实现代码
author:一佰互联 2019-05-01   click:180
<?php
/**
* @name 采集书.php
* @date Sun Mar 01 22:48:02 CST 2009
* @copyright 马永占(MyZ)
* @author 马永占(MyZ)
* @link http://blog.csdn.net/mayongzhan/
*/
// Alternative response header for UTF-8 output, currently disabled:
//header("Content-Type:text/html;charset=utf8");
// Declare the response as HTML encoded in GB2312.
header("Content-Type:text/html;charset=gb2312");
// Report every class of PHP error, warning and notice.
error_reporting(E_ALL);
// Make the date/time functions use the Asia/Shanghai timezone.
date_default_timezone_set("Asia/Shanghai");
// Remove the execution time limit; the script below downloads one page per chapter.
set_time_limit(0);
/**
 * Append data to a file, creating the parent directory when it is missing.
 *
 * The file is opened in binary append mode, so existing content is kept and
 * repeated calls accumulate data in call order.
 *
 * @param string $content Bytes to append.
 * @param string $url     Destination path (or stream-wrapper URL).
 * @return bool True when the data was written, false on failure (a warning is raised).
 */
function writer($content,$url)
{
    // Stream-wrapper targets such as "php://output" have no directory to create.
    if (strpos($url, "://") === false) {
        $dir = dirname($url);
        // is_dir() is re-checked after a failed mkdir() in case another process created it first.
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            trigger_error("writer: cannot create directory ".$dir, E_USER_WARNING);
            return false;
        }
    }
    // fopen() returns false on failure; handing that to fwrite()/fclose() is an error.
    $fp = fopen($url, "ab");
    if ($fp === false) {
        trigger_error("writer: cannot open ".$url." for appending", E_USER_WARNING);
        return false;
    }
    $written = fwrite($fp, $content);
    fclose($fp);
    return $written !== false;
}
/**
 * Pull the chapter title and body text out of one chapter page.
 *
 * The title is the content of the first <h1>; the body is the content of the
 * <div id="contTxt" class="contTxt1"> that follows it, with <p> tags replaced
 * by a space and <span ...><a ...>text</a></span> wrappers reduced to "text".
 *
 * @param string $html Raw HTML of a chapter page.
 * @return array|null array(title, text), or null when the expected markup is absent.
 */
function extract_chapter($html)
{
    // "#" delimiters and single quotes are used because the patterns contain
    // both "/" and double quotes, which would otherwise need escaping.
    $layout = '#(<h1>)(.*?)(</h1>)(.*?)(<div id="contTxt" class="contTxt1">)(.*?)(</div>)#s';
    if (preg_match($layout, $html, $arr) !== 1) {
        return null;
    }
    $text = preg_replace('#<p>|</p>#', " ", $arr[6]);
    if ($text === null) {
        return null;
    }
    $text = preg_replace('#(<span[^>]+>.*?<a[^>]+>)(.*?)(</a></span>)#s', '$2', $text);
    if ($text === null) {
        return null;
    }
    return array($arr[2], $text);
}

$folder = "2"; // output folder
$book_base_url = "xxxxxxxxxxxxxxxxxxxxx";
$book_url = "yyyyyyyyyyyyy.html";
$out_dir = "./".$folder;

// The output folder must exist before any chapter file can be written into it.
if (!is_dir($out_dir) && !mkdir($out_dir, 0777, true) && !is_dir($out_dir)) {
    exit("Cannot create output folder ".$out_dir);
}

// Download the table-of-contents page and collect the distinct chapter page names.
$main = file_get_contents($book_base_url.$book_url);
if ($main === false) {
    exit("Cannot download index page ".$book_base_url.$book_url);
}
preg_match_all('/chapter_.*?\.html/', $main, $pages);
$pages = array_unique($pages[0]);

foreach ($pages as $value) {
    $str = file_get_contents($book_base_url.$value);
    if ($str === false) {
        trigger_error("Cannot download chapter ".$value, E_USER_WARNING);
        continue;
    }
    // The page name comes from remote HTML: keep only its last path component
    // so it cannot point outside the output folder.
    writer($str, $out_dir."/".basename($value).".txt");

    // Parse the downloaded HTML directly; the raw file is opened in append mode
    // and would contain duplicated content after a re-run.
    $chapter = extract_chapter($str);
    if ($chapter === null) {
        trigger_error("Unexpected page layout in ".$value, E_USER_WARNING);
        continue;
    }
    // NOTE(review): the separators use spaces where line breaks may have been intended — confirm.
    $result = " ------------------------------------------------ ------------------------------------------------ ------------------------------------------------ ----------------".$chapter[0]." ------------------------------------------------ ------------------------------------------------ ------------------------------------------------ ".$chapter[1];
    writer($result, $out_dir."/new.txt");
}
?>