这部分的技术其实就是简单地使用正则表达式进行匹配。接下来献上获取标题以及存储到 XML 文件的方法：
/// <summary>
/// Writes the hyperlinks extracted from a page to an XML file.
/// </summary>
/// <remarks>
/// The output file is fixed at <c>D:\HyperLinks.xml</c>. Every link is written as one
/// element whose name is the value <c>GetDomain</c> returns for that link and whose
/// text is the link itself.
/// </remarks>
/// <param name="strURL">URL of the page the links came from; used only in the leading XML comment.</param>
/// <param name="alHyperLinks">The links to write, one element per entry.</param>
private static void WriteToXml(string strURL, List<string> alHyperLinks)
{
    // The path needs the backslash: "D:HyperLinks.xml" would be resolved against
    // whatever the current directory on drive D: happens to be.
    // `using` closes (and thereby flushes) the writer even when a write throws.
    using (XmlTextWriter writer = new XmlTextWriter(@"D:\HyperLinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + strURL + "的超链接");

        // Two nested <HyperLinks> elements; the inner one carries the timestamp.
        writer.WriteStartElement("HyperLinks");
        writer.WriteStartElement("HyperLinks", null);
        writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

        foreach (string link in alHyperLinks)
        {
            // Group links by domain suffix: the suffix is the element name.
            string title = GetDomain(link);
            writer.WriteElementString(title, null, link);
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}
/// <summary>
/// Gets the domain suffix of a URL.
/// </summary>
/// <param name="strURL">The URL to inspect.</param>
/// <returns>
/// The leftmost occurrence of <c>com</c>, <c>net</c>, <c>cn</c>, <c>org</c> or <c>gov</c>
/// that appears in the URL as <c>.suffix/</c> (matched case-insensitively, returned in the
/// URL's own casing), or <c>"other"</c> when no such suffix is present.
/// </returns>
private static string GetDomain(string strURL)
{
    // The dots must be escaped: an unescaped '.' matches any character, so
    // "xcom/" would be accepted as a suffix.
    Match m = Regex.Match(
        strURL,
        @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)",
        RegexOptions.IgnoreCase);

    // Strip the leading dot and the trailing slash from the match (".com/" -> "com").
    // With an unescaped '.' this pattern would delete every character and the
    // method would always fall through to "other".
    string retVal = Regex.Replace(m.Value, @"\.|/$", "");

    if (retVal == "")
    {
        retVal = "other";
    }

    return retVal;
}
/// <summary>
/// Extracts a page title from an HTML string.
/// </summary>
/// <param name="html">The HTML source to search.</param>
/// <returns>
/// The text of the first <c>&lt;title&gt;</c> element with tags removed. When the first
/// <c>&lt;h1&gt;</c> element has non-empty text and the title starts with that text, the
/// <c>&lt;h1&gt;</c> text is returned instead. Returns an empty string when there is no
/// <c>&lt;title&gt;</c> element.
/// </returns>
private static string GetTitle(string html)
{
    // [\s\S] (not [sS]) so the title may span line breaks.
    string titleFilter = @"<title>[\s\S]*?</title>";
    string h1Filter = @"<h1.*?>.*?</h1>";
    string clearFilter = @"<.*?>";

    string title = "";
    Match match = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        // Drop the surrounding <title> tags, keeping only the text.
        title = Regex.Replace(match.Groups[0].Value, clearFilter, "");
    }

    // The article heading usually sits in <h1> and is cleaner than <title>,
    // which often has a site name appended.
    match = Regex.Match(html, h1Filter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        string h1 = Regex.Replace(match.Groups[0].Value, clearFilter, "");

        // Only trust the <h1> when the <title> begins with it.
        if (!String.IsNullOrEmpty(h1) && title.StartsWith(h1, StringComparison.Ordinal))
        {
            title = h1;
        }
    }

    return title;
}
这就是所用的全部方法，其中还有很多需要改进之处。大家如果发现不足之处，还请指出，谢谢！










