C#批量抓取免费代理并验证其有效性的实战教程

2020-01-05 09:12:03刘景俊

使用Timer定时抓取代理ip并检查其有效性,检查通过的代理保存到Redis

C#有三种定时器,这里使用的是System.Threading命名空间下的Timer,它的回调会在线程池线程上执行。因为要抓取三个网站,所以定义了三个Timer对象。每一次抓取都会保存上一次抓取的集合,检查之前先进行对比,只取出新增的(也就是与上一次不重复的)那部分进行检查。免费代理中有效ip的比例比较低,这里没有做统计;如果代码再优化一下,可以加上统计功能。下面看看程序的主入口,最终的实现如下:


/// <summary>
/// Console entry point that periodically scrapes three free-proxy listing sites
/// (ip3366.net, xicidaili.com, 66ip.cn), checks only the proxies that were not
/// present in the previous fetch of the same site, and stores the ones that pass
/// the check in Redis through <c>RedisHelper.InsertSet</c>.
/// </summary>
class Program
{
    // Re-entrancy guards, one per source: 0 = idle, 1 = a run is in progress.
    // System.Threading.Timer callbacks run on thread-pool threads and may overlap,
    // so the guards are claimed atomically with Interlocked instead of a plain
    // check-then-set on a bool.
    static int timer_ip3366_isRunning;
    static int timer_xicidaili_isRunning;
    static int timer_66ip_isRunning;
    static Timer timer_ip3366, timer_xicidaili, timer_66ip;
    // Proxies returned by the previous fetch of each source; the next fetch is
    // compared against them so that only new entries are checked.
    private static List<string> lastListip3366, lastList66ip, lastListxicidaili;

    /// <summary>
    /// Waits for Enter, starts one timer per source, then blocks until Enter is
    /// pressed again and stops the timers.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static async Task Main(string[] args)
    {
        System.Net.ServicePointManager.DefaultConnectionLimit = 2000;
        Console.WriteLine("hellow proxyIp");
        Console.ReadLine();
        lastList66ip = new List<string>();
        lastListip3366 = new List<string>();
        lastListxicidaili = new List<string>();

        // The lambdas below are async void (TimerCallback returns void), so every
        // exception must be handled inside RunSafelyAsync; an exception escaping an
        // async void method would terminate the process.
        timer_ip3366 = new Timer(async (state) =>
        {
            await RunSafelyAsync("ip3366.net", TimerIp3366Async);
        }, "processing timer_ip3366 event", 0, 1000 * 30);
        timer_xicidaili = new Timer(async (state) =>
        {
            await RunSafelyAsync("xicidaili.com", TimerXicidailiAsync);
        }, "processing timer_xicidaili event", 0, 1000 * 60);
        timer_66ip = new Timer(async (state) =>
        {
            await RunSafelyAsync("66ip.cn", Timer66ipAsync);
        }, "processing timer_66ip event", 0, 1000 * 30);

        Console.ReadLine();

        // Stop scheduling further callbacks before the process exits.
        timer_ip3366.Dispose();
        timer_xicidaili.Dispose();
        timer_66ip.Dispose();
    }

    /// <summary>
    /// Runs one timer job and reports any exception on the console instead of
    /// letting it escape from the async void timer callback.
    /// </summary>
    /// <param name="source">Display name of the proxy source, used in the error line.</param>
    /// <param name="job">The scrape-and-check routine to execute.</param>
    private static async Task RunSafelyAsync(string source, Func<Task> job)
    {
        try
        {
            await job();
        }
        catch (Exception ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(source + " 本轮任务出现异常:" + ex.Message);
        }
    }

    /// <summary>Atomically claims a guard; returns false when a run is already in progress.</summary>
    private static bool TryBeginRun(ref int runningFlag)
    {
        return Interlocked.CompareExchange(ref runningFlag, 1, 0) == 0;
    }

    /// <summary>Releases a guard claimed by <see cref="TryBeginRun"/>.</summary>
    private static void EndRun(ref int runningFlag)
    {
        Interlocked.Exchange(ref runningFlag, 0);
    }

    /// <summary>
    /// Returns the entries of <paramref name="current"/> that do not occur in
    /// <paramref name="previous"/>, in their original order. A hash set replaces the
    /// repeated List.Contains scan, with the same result.
    /// </summary>
    private static List<string> SelectNewProxies(List<string> current, List<string> previous)
    {
        var seen = new HashSet<string>(previous);
        var fresh = new List<string>();
        foreach (string proxy in current)
        {
            if (!seen.Contains(proxy))
            {
                fresh.Add(proxy);
            }
        }
        return fresh;
    }

    /// <summary>
    /// One scrape-and-check round for 66ip.cn. Does nothing if the previous round
    /// of this source is still running.
    /// </summary>
    private static async Task Timer66ipAsync()
    {
        if (!TryBeginRun(ref timer_66ip_isRunning))
        {
            return;
        }
        try
        {
            var listProxyIp = ProxyIpHelper.Get66ipProxy();
            if (listProxyIp == null || listProxyIp.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.DarkCyan;
                Console.WriteLine("66ip.cn没有获取到代理ip");
                return;
            }

            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("66ip.cn 抓取到" + listProxyIp.Count + "条记录,正在对比.........");
            List<string> checkList = SelectNewProxies(listProxyIp, lastList66ip);
            lastList66ip = listProxyIp;
            if (checkList.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.DarkCyan;
                Console.WriteLine("66ip.cn没有需要检查的代理ip");
                return;
            }

            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("66ip.cn 需要检查" + checkList.Count + "条记录,正在进行检测是否有效..........");
            for (int i = 0; i < checkList.Count; i++)
            {
                string ipAddress = checkList[i];
                // Copy the index so the callbacks never observe a later value of i.
                int taskNumber = i;
                await ProxyIpHelper.CheckProxyIpAsync(ipAddress, () =>
                {
                    bool insertSuccess = RedisHelper.InsertSet(ipAddress);
                    Console.ForegroundColor = ConsoleColor.White;
                    Console.WriteLine("66ip.cn");
                    if (insertSuccess)
                    {
                        Console.WriteLine("success" + ipAddress + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                    }
                    else
                    {
                        // Only report a duplicate when the insert did not succeed.
                        Console.WriteLine("重复插入" + ipAddress + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                    }
                }, (error) =>
                {
                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.WriteLine("66ip.cn");
                    Console.WriteLine("error:" + ipAddress + error + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                });
            }
            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("66ip.cn" + checkList.Count + "条记录,已经检测完成,正在进行下一次检查");
        }
        finally
        {
            // Always release the guard; otherwise one failed round would stop
            // this source from ever being scraped again.
            EndRun(ref timer_66ip_isRunning);
        }
    }

    /// <summary>
    /// One scrape-and-check round for xicidaili.com (page 1). Does nothing if the
    /// previous round of this source is still running.
    /// </summary>
    private static async Task TimerXicidailiAsync()
    {
        if (!TryBeginRun(ref timer_xicidaili_isRunning))
        {
            return;
        }
        try
        {
            // Only addresses that were absent from the previous fetch are checked:
            // if the first fetch returns 100 records, all 100 are checked; if only 10
            // of the next 100 are new, just those 10 are checked.
            var listProxyIp = ProxyIpHelper.GetXicidailiProxy(1);
            if (listProxyIp == null || listProxyIp.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.DarkCyan;
                Console.WriteLine("xicidaili.com没有获取到代理ip");
                return;
            }

            Console.WriteLine("xicidaili.com 抓取到" + listProxyIp.Count + "条记录,正在对比............");
            List<string> checkList = SelectNewProxies(listProxyIp, lastListxicidaili);
            lastListxicidaili = listProxyIp;
            if (checkList.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.DarkCyan;
                Console.WriteLine("xicidaili.com没有需要检查的代理ip");
                return;
            }

            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("xicidaili.com 需要检查" + checkList.Count + "条记录,正在进行检测是否有效..........");
            for (int i = 0; i < checkList.Count; i++)
            {
                string ipAddress = checkList[i];
                // Copy the index so the callbacks never observe a later value of i.
                int taskNumber = i;
                await ProxyIpHelper.CheckProxyIpAsync(ipAddress, () =>
                {
                    bool insertSuccess = RedisHelper.InsertSet(ipAddress);
                    Console.ForegroundColor = ConsoleColor.White;
                    Console.WriteLine("xicidaili.com");
                    if (insertSuccess)
                    {
                        Console.WriteLine("success" + ipAddress + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                    }
                    else
                    {
                        Console.WriteLine("重复插入" + ipAddress + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                    }
                }, (error) =>
                {
                    Console.WriteLine("xicidaili.com");
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("error:" + ipAddress + error + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                });
            }
            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("xicidaili.com" + checkList.Count + "条记录,已经检测完成,正在进行下一次检查");
        }
        finally
        {
            // Always release the guard, even when fetching or checking throws.
            EndRun(ref timer_xicidaili_isRunning);
        }
    }

    /// <summary>
    /// One scrape-and-check round for ip3366.net (called with argument 4). Does
    /// nothing if the previous round of this source is still running.
    /// </summary>
    private static async Task TimerIp3366Async()
    {
        if (!TryBeginRun(ref timer_ip3366_isRunning))
        {
            return;
        }
        try
        {
            var listProxyIp = ProxyIpHelper.GetIp3366Proxy(4);
            if (listProxyIp == null || listProxyIp.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.DarkCyan;
                Console.WriteLine("ip3366.net没有获取到代理ip");
                return;
            }

            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("ip3366.net 抓取到" + listProxyIp.Count + "条记录,正在进行检测是否有效..........");
            List<string> checkList = SelectNewProxies(listProxyIp, lastListip3366);
            lastListip3366 = listProxyIp;
            if (checkList.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.DarkCyan;
                Console.WriteLine("ip3366.net没有需要检查的代理ip");
                return;
            }

            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("ip3366.net 需要检查" + checkList.Count + "条记录,正在进行检测是否有效..........");
            for (int i = 0; i < checkList.Count; i++)
            {
                string ipAddress = checkList[i];
                // Copy the index so the callbacks never observe a later value of i.
                int taskNumber = i;
                await ProxyIpHelper.CheckProxyIpAsync(ipAddress, () =>
                {
                    bool insertSuccess = RedisHelper.InsertSet(ipAddress);
                    Console.ForegroundColor = ConsoleColor.White;
                    Console.WriteLine("ip3366.net");
                    if (insertSuccess)
                    {
                        Console.WriteLine("success" + ipAddress + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine("重复插入" + ipAddress + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                    }
                }, (error) =>
                {
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine("ip3366.net");
                    // Include the error detail, as the other two sources do.
                    Console.WriteLine("error " + ipAddress + error + "任务编号:" + taskNumber + "当前任务线程:" + Thread.CurrentThread.ManagedThreadId);
                });
            }
            Console.WriteLine("ip3366.net" + checkList.Count + "条记录,已经检测完成,正在进行下一次检查");
        }
        finally
        {
            // Always release the guard, even when fetching or checking throws.
            EndRun(ref timer_ip3366_isRunning);
        }
    }
}