C#制作多线程处理强化版网络爬虫

2019-12-30 14:07:05 王冬梅
// Regular expression used to match img tags.
// NOTE(review): the statements below are the tail of a method whose header lies outside this
// excerpt; they are kept token-for-token. The pattern appears to have lost its backslashes
// ("imgb", "bsrc" and "[strn]" look like \b and \s\t\r\n escapes) - confirm against the
// original source before relying on it.
Regex regImg = new Regex(@"<imgb[^<>]*?bsrc[strn]*=[strn]*[""']?[strn]*(?<imgUrl>[^strn""'<>]*)[^<>]*?/?[strn]*>", RegexOptions.IgnoreCase);
// Find every match in the page markup.
MatchCollection matches = regImg.Matches(html);
List<string> sUrlList = new List<string>();
// Collect the captured "imgUrl" group of each match.
foreach (Match match in matches)
    sUrlList.Add(match.Groups["imgUrl"].Value);
return sUrlList;
}

/// <summary>
/// Fetches the page at <paramref name="url"/> and extracts the links it contains: first the
/// absolute http(s) URLs found anywhere in the markup, then the href targets of anchor tags.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The distinct links in discovery order, or an empty list when the fetched content is null or empty.
/// </returns>
public static List<string> GetHttpLinks(string url)
{
    // Fetch the page content.
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }

    List<string> links = new List<string>();
    // Mirrors 'links' so the duplicate check is a hash lookup instead of a list scan.
    HashSet<string> seen = new HashSet<string>();

    // Absolute http(s) links anywhere in the markup. The escapes (\w, \.) are required:
    // without them the character classes only match the literal letter 'w'.
    const string absolutePattern = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    foreach (Match match in Regex.Matches(html, absolutePattern, RegexOptions.IgnoreCase))
    {
        string link = match.Value;
        // NOTE(review): links are skipped when CheckUrlIsLegal returns true, which reads as
        // inverted - confirm the helper's meaning; the original condition is kept unchanged.
        if (StringHelper.CheckUrlIsLegal(link) || !StringHelper.IsPureUrl(link) || !seen.Add(link))
        {
            continue;
        }
        links.Add(link);
    }

    // href targets of <a> tags, excluding javascript: and __doPostBack pseudo-links.
    const string hrefPattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__doPostBack)(?<url>[^'""\s*#<>]+)[^>]*>";
    foreach (Match match in Regex.Matches(html, hrefPattern, RegexOptions.IgnoreCase))
    {
        string href = match.Groups["url"].Value;
        // Anything that does not start with "http" is prefixed with the site root. A prefix
        // test is used so a relative link that merely embeds "http" in its query string is
        // still resolved.
        if (!href.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            href = Global.WebUrl + href;
        }
        if (!StringHelper.IsPureUrl(href) || !seen.Add(href))
        {
            continue;
        }
        links.Add(href);
    }

    return links;
}

下载图片时有任务数量限制,上限为 200 条;超过上限时,线程会等待 5 秒。这里的图片下载是通过委托异步调用的。


/// <summary>
/// Downloads the image at <paramref name="url"/> into the folder named by
/// <c>Global.FloderUrl</c>, saving it under a freshly generated GUID file name.
/// </summary>
/// <param name="url">
/// Image address. When it does not start with "http" it is appended to <c>Global.WebUrl</c>.
/// </param>
/// <returns>
/// The saved file name (GUID plus the extension taken from the URL path) on success;
/// otherwise an error message starting with "错误:".
/// </returns>
public string DownLoadimg(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "错误:地址为空";
    }

    try
    {
        if (!url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            url = Global.WebUrl + url;
        }

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 2000;
        // The property holds the header value only; the header name must not be repeated in it.
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        // Follow HTTP redirects (e.g. 302).
        request.AllowAutoRedirect = true;

        // Take the extension from the URL path only, so a query string or a dot in the host
        // name cannot end up in the file name. Path.GetExtension includes the leading dot and
        // yields an empty string when the path has no extension.
        string fileName = Guid.NewGuid().ToString() + Path.GetExtension(request.RequestUri.AbsolutePath);

        // 'using' releases the response and both streams even when the copy fails part-way.
        using (WebResponse response = request.GetResponse())
        using (Stream reader = response.GetResponseStream())
        using (FileStream writer = new FileStream(Global.FloderUrl + fileName, FileMode.Create, FileAccess.Write))
        {
            byte[] buff = new byte[4096];
            // Number of bytes actually read in each pass.
            int c;
            while ((c = reader.Read(buff, 0, buff.Length)) > 0)
            {
                writer.Write(buff, 0, c);
            }
        }

        return fileName;
    }
    catch (Exception)
    {
        // Best-effort contract: callers receive an error string instead of an exception.
        return "错误:地址" + url;
    }
}