之前一直在关注句子迷这个网站,在里面可以找到自己喜欢的名人的经典语录。
每一次都去打开,再去翻页查看太烦,有没有什么方法可以把网站中的句子直接抓出来,保存到本地txt文件中?
然后在网上查看了一些资料,自己动手写了一个控制台应用程序。
注:代码还存在一点问题:多次发起请求后,线程会被占用,目前尚未解决,求助中。
具体实现方式:
步骤1:通过HttpWebResponse发送一个请求,将整个HTML页面请求过来,将全部数据读入StreamReader中,下图HTML代码
步骤2:对读入流中的数据进行处理,只取包含句子的文本,这里需要查看页面中的HTML结构。
在取文本的过程中,使用正则表达式,取出想要的数据
///正则表达式
Regex regText = new Regex(@"<div\s+class\=\""views-field-PHPcode-1\"">([\S\s]*?)</div>",RegexOptions.IgnoreCase);
Regex objRegExp = new Regex("<(.|\n)+?>");
具体代码如下:
class Program
{
// Lock acquired in writer mode by Write before it touches the output file.
static ReaderWriterLock writeLock = new ReaderWriterLock();
const int LOCK = 1000; // Timeout in milliseconds passed to AcquireWriterLock
const int SLEEP = 100; // Pause in milliseconds that Write applies via Thread.Sleep
/// <summary>
/// Console entry point. Prints usage help, reads the target folder and a
/// comma-separated list of author names, then starts one download thread
/// per author. The author name is handed to the worker through the
/// thread's <see cref="Thread.Name"/>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.WriteLine("-------------------句子迷文档下载----------");
    Console.WriteLine("操作API:");
    Console.WriteLine("注释1:查询的的作者名,以逗号(英文)隔开,例子如下:");
    Console.WriteLine(" 鲁迅,胡适,顾城");
    Console.WriteLine("注释2:保存的盘符,例子如下:");
    Console.WriteLine(" X:\\\\句子迷 ");
    Console.WriteLine("-------------------文档结束----------");
    Console.WriteLine("请输入需要保存的盘:");
    directoryName1 = Console.ReadLine();
    Console.WriteLine("请输入作者姓名:");

    // ReadLine returns null once redirected input is exhausted; treat that as "no authors".
    string writers = Console.ReadLine() ?? string.Empty;
    string[] strWriter = writers.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (string rawName in strWriter)
    {
        // Ignore blanks such as the middle entry of "a, ,b" and strip padding
        // so the name is usable both in the URL and as a file name.
        string writerName = rawName.Trim();
        if (writerName.Length == 0)
        {
            continue;
        }

        Thread thread = new Thread(DownLoad1);
        thread.Name = writerName;
        thread.Start();
    }

    // Keep the console window open while the worker threads run.
    Console.ReadKey();
}
#region 通过网页获取句子迷中的名言名句
/// <summary>
/// Downloads up to ten search-result pages from juzimi.com for the author
/// whose name is stored in the current thread's <see cref="Thread.Name"/>,
/// extracts the sentences from each page and appends them to that author's
/// text file. Stops early when a page yields no sentences or the server
/// answers 404; other failures are logged and the next page is tried.
/// </summary>
public static void DownLoad1()
{
    string writerName = Thread.CurrentThread.Name;

    for (int pageSize = 0; pageSize < 10; pageSize++)
    {
        string url = pageSize == 0
            ? "http://www.juzimi.com/search/node/" + writerName + "%20type:sentence"
            : "http://www.juzimi.com/search/node/" + writerName + "%20type%3Asentence?page=" + pageSize;

        List<string> list;
        try
        {
            var request = (HttpWebRequest)WebRequest.Create(url);
            string content;

            // The response must be disposed: an unclosed HttpWebResponse keeps its
            // pooled connection, and once the per-host connection limit is reached
            // every further GetResponse call blocks.
            using (var response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(stream))
            {
                content = sr.ReadToEnd();
            }

            list = GetHtmlTextList(content);
        }
        catch (WebException ex)
        {
            // The error response also holds a connection, so close it as well.
            bool notFound;
            using (WebResponse errorResponse = ex.Response)
            {
                HttpWebResponse httpError = errorResponse as HttpWebResponse;
                notFound = httpError != null && httpError.StatusCode == HttpStatusCode.NotFound;
            }

            LogPageStatus(url, " 错误信息:" + ex.Message);
            if (notFound)
            {
                // A 404 means there are no further pages for this author.
                break;
            }
            continue;
        }
        catch (Exception ex)
        {
            LogPageStatus(url, " 错误信息:" + ex.Message);
            continue;
        }

        if (list.Count == 0)
        {
            LogPageStatus(url, ":未找到相关信息;");
            break;
        }

        try
        {
            // Target folder: the user-supplied one, or the built-in default.
            string directoryName = string.IsNullOrEmpty(directoryName1) ? "X:\\句子迷" : directoryName1;
            Write(directoryName, writerName, list);
            LogPageStatus(url, "句子信息下载完成!");
        }
        catch (Exception e)
        {
            // A failed write is logged; the remaining pages are still attempted.
            LogPageStatus(url, " 错误信息:" + e.Message);
        }
    }
}

/// <summary>
/// Writes one progress line containing the current time, the page URL,
/// the given detail text and the managed id of the calling thread.
/// </summary>
/// <param name="url">URL of the page being processed.</param>
/// <param name="detail">Status or error text inserted after the URL.</param>
private static void LogPageStatus(string url, string detail)
{
    Console.WriteLine("时间:" + DateTime.Now + " 当前网址:" + url + detail + "当前线程:" + Thread.CurrentThread.ManagedThreadId);
}
/// <summary>
/// Appends the given sentences to "&lt;path&gt;\&lt;fileName&gt;.txt", creating the
/// folder and the file when they do not exist yet. Each sentence is followed
/// by a blank line. File access is serialized through the shared writer lock.
/// </summary>
/// <param name="path">Folder in which the text file is stored.</param>
/// <param name="fileName">File name without the ".txt" extension.</param>
/// <param name="strBook">Sentences to append.</param>
/// <exception cref="ApplicationException">
/// Thrown by AcquireWriterLock when the lock cannot be obtained within LOCK milliseconds.
/// </exception>
public static void Write(string path,string fileName,List<string> strBook)
{
    writeLock.AcquireWriterLock(LOCK);
    try
    {
        // CreateDirectory does nothing when the folder already exists.
        Directory.CreateDirectory(path);

        string filePalce = Path.Combine(path, fileName + ".txt");

        // FileMode.Append creates the file when it is missing. The earlier
        // File.Create call left an open handle behind, which made the very
        // next open of the same file fail with a sharing violation.
        using (FileStream fs = new FileStream(filePalce, FileMode.Append))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            foreach (var item in strBook)
            {
                sw.Write(item + "\r\n\r\n");
            }
        }
    }
    finally
    {
        // Always release, otherwise one failed write blocks every other thread.
        writeLock.ReleaseWriterLock();
    }

    // Pause after releasing the lock so waiting threads are not pushed
    // towards the acquire timeout while this one sleeps.
    Thread.Sleep(SLEEP);
}
/// <summary>
/// Collects the text of every &lt;div class="views-field-PHPcode-1"&gt; element
/// found in the given HTML, with all markup removed from each match.
/// </summary>
/// <param name="sHtmlText">HTML source of a page.</param>
/// <returns>One entry per matching div, in document order; empty when none match.</returns>
public static List<string> GetHtmlTextList(string sHtmlText)
{
    // Non-greedy match from the opening div to the first closing div.
    MatchCollection hits = Regex.Matches(
        sHtmlText,
        @"<div\s+class\=\""views-field-PHPcode-1\"">([\S\s]*?)</div>",
        RegexOptions.IgnoreCase);

    List<string> sentences = new List<string>();
    for (int index = 0; index < hits.Count; index++)
    {
        // The whole match still contains the div tags; strip them here.
        string plainText = replceHtml(hits[index].Value);
        sentences.Add(plainText);
    }

    return sentences;
}
/// <summary>
/// Removes every HTML tag from the given fragment and returns what is left.
/// A tag is anything from '&lt;' up to the nearest '&gt;', line breaks included.
/// </summary>
/// <param name="strHtml">HTML fragment to clean.</param>
/// <returns>The fragment with all tags deleted.</returns>
public static string replceHtml(string strHtml)
{
    const string tagPattern = "<(.|\n)+?>";
    string withoutTags = Regex.Replace(strHtml, tagPattern, "");
    return withoutTags;
}
#endregion