class="hljs-ln-code"> class="hljs-ln-line">using System.Collections.Generic;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="3"> class="hljs-ln-code"> class="hljs-ln-line">using System.Text;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="4"> class="hljs-ln-code"> class="hljs-ln-line">using System.Collections;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="5"> class="hljs-ln-code"> class="hljs-ln-line">using xu_common.log;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="6"> class="hljs-ln-code"> class="hljs-ln-line">
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="7"> class="hljs-ln-code"> class="hljs-ln-line">namespace data_collection.spider_core
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="8"> class="hljs-ln-code"> class="hljs-ln-line">{
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="9"> class="hljs-ln-code"> class="hljs-ln-line"> class UrlSet
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="10"> class="hljs-ln-code"> class="hljs-ln-line"> {
  /// <summary>
  /// Creates an empty URL set backed by three lists: URLs waiting to be crawled,
  /// URLs already crawled, and URLs that failed.
  /// </summary>
  /// <remarks>
  /// The lists are unbounded in-memory collections. Per the original author's note,
  /// this is acceptable for a targeted crawl where the number of URLs stays small;
  /// a whole-web crawl would have to guard against unbounded memory growth.
  /// </remarks>
  public UrlSet()
  {
      this._error_link = new ArrayList();
      this._already_parse = new ArrayList();
      this._going_to_parse = new ArrayList();
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="19"> class="hljs-ln-code"> class="hljs-ln-line">
  // Guards lazy creation of the singleton so that concurrent first callers
  // cannot each construct (and then use) their own UrlSet.
  private static readonly object __instance_lock = new object();

  // volatile: the field is read outside the lock on the fast path below.
  private static volatile UrlSet __instance = null;

  /// <summary>
  /// Gets the shared UrlSet, creating it on first access.
  /// </summary>
  public static UrlSet instance
  {
      get
      {
          if (__instance == null)
          {
              lock (__instance_lock)
              {
                  // Re-check under the lock: another thread may have created
                  // the instance between the first test and lock acquisition.
                  if (__instance == null)
                  {
                      __instance = new UrlSet();
                  }
              }
          }
          return __instance;
      }
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="32"> class="hljs-ln-code"> class="hljs-ln-line">
  // URLs queued for crawling; assigned in the constructor.
  private ArrayList _going_to_parse;

  // URLs that have already been crawled; assigned in the constructor.
  private ArrayList _already_parse;

  // URLs recorded as bad links; assigned in the constructor.
  private ArrayList _error_link;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="36"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Reports whether <paramref name="url"/> is in the already-crawled list and,
  /// when it is not and <paramref name="add"/> is true, appends it to that list.
  /// </summary>
  /// <param name="url">URL to look up.</param>
  /// <param name="add">True to record the URL as crawled if it was not present.</param>
  /// <returns>True if the URL was already in the list before this call; otherwise false.</returns>
  private bool is_url_parsed(string url, bool add)
  {
      lock (_already_parse.SyncRoot)
      {
          if (_already_parse.Contains(url))
          {
              return true;
          }

          if (add)
          {
              _already_parse.Add(url);
          }
          return false;
      }
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="49"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Reports whether <paramref name="url"/> is in the already-crawled list,
  /// without modifying the list.
  /// </summary>
  /// <param name="url">URL to look up.</param>
  /// <returns>True if the URL has been recorded as crawled; otherwise false.</returns>
  private bool is_url_parsed(string url)
  {
      const bool addIfMissing = false;
      return is_url_parsed(url, addIfMissing);
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="55"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Reports whether <paramref name="url"/> is in the pending-crawl list and,
  /// when it is not and <paramref name="add"/> is true, appends it to that list.
  /// </summary>
  /// <param name="url">URL to look up.</param>
  /// <param name="add">True to queue the URL if it was not present.</param>
  /// <returns>True if the URL was already queued before this call; otherwise false.</returns>
  private bool is_url_going_to_parse(string url, bool add)
  {
      lock (_going_to_parse.SyncRoot)
      {
          if (_going_to_parse.Contains(url))
          {
              return true;
          }

          if (add)
          {
              _going_to_parse.Add(url);
          }
          return false;
      }
  }
  /// <summary>
  /// Reports whether <paramref name="url"/> is in the pending-crawl list,
  /// without modifying the list.
  /// </summary>
  /// <param name="url">URL to look up.</param>
  /// <returns>True if the URL is currently queued; otherwise false.</returns>
  private bool is_url_going_to_parse(string url)
  {
      const bool addIfMissing = false;
      return is_url_going_to_parse(url, addIfMissing);
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="73"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Reports whether <paramref name="url"/> is in the bad-link list and,
  /// when it is not and <paramref name="add"/> is true, appends it to that list.
  /// </summary>
  /// <param name="url">URL to look up.</param>
  /// <param name="add">True to record the URL as a bad link if it was not present.</param>
  /// <returns>True if the URL was already in the bad-link list before this call; otherwise false.</returns>
  private bool is_url_error_lnk(string url, bool add)
  {
      bool rv;
      lock (_error_link.SyncRoot)
      {
          rv = _error_link.Contains(url);
          if (!rv && add)
          {
              // Record into the bad-link list itself, i.e. the same list that
              // is queried above and whose SyncRoot is held here.
              _error_link.Add(url);
          }
      }
      return rv;
  }
  /// <summary>
  /// Reports whether <paramref name="url"/> is in the bad-link list,
  /// without modifying the list.
  /// </summary>
  /// <param name="url">URL to look up.</param>
  /// <returns>True if the URL is recorded as a bad link; otherwise false.</returns>
  private bool is_url_error_lnk(string url)
  {
      const bool addIfMissing = false;
      return is_url_error_lnk(url, addIfMissing);
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="90"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Queues a URL for crawling unless it is already known.
  /// The checks run in this order: already crawled, bad link, already queued.
  /// </summary>
  /// <param name="url">URL to queue.</param>
  /// <returns>
  /// -1 if the URL has already been crawled;
  /// -2 if the URL is a recorded bad link;
  /// 1 if the URL was already in the pending list;
  /// 0 if the URL was newly added to the pending list.
  /// Values &gt;= 0 mean OK, values &lt; 0 mean error.
  /// </returns>
  public int add_going_parse_url(string url)
  {
      int status;

      // The pending-list lock is held across all three checks, so the final
      // check-and-add on the pending list happens in the same critical section.
      lock (_going_to_parse.SyncRoot)
      {
          if (is_url_parsed(url))
          {
              status = -1;
          }
          else if (is_url_error_lnk(url))
          {
              status = -2;
          }
          else
          {
              // Adds the URL when absent; reports whether it was already queued.
              bool alreadyQueued = is_url_going_to_parse(url, true);
              status = alreadyQueued ? 1 : 0;
          }
      }

      return status;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="119"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Records a URL as crawled. Any occurrence of the URL in the pending list
  /// or in the bad-link list is removed first.
  /// </summary>
  /// <param name="url">URL that has been crawled.</param>
  /// <returns>
  /// -1 if the URL was already in the crawled list; 0 if it was newly recorded.
  /// </returns>
  public int add_parsed_url(string url)
  {
      // Already crawled, so it must not be crawled again. Remove is a no-op when
      // the URL is absent, so no separate Contains test is needed; holding the
      // list's SyncRoot keeps the mutation consistent with every other writer.
      lock (_going_to_parse.SyncRoot)
      {
          _going_to_parse.Remove(url);
      }

      // A successfully crawled URL is no longer a bad link.
      lock (_error_link.SyncRoot)
      {
          _error_link.Remove(url);
      }

      // Adds the URL to the crawled list when absent; true means it was already there.
      if (is_url_parsed(url, true))
      {
          return -1;
      }
      return 0;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="145"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Reports a URL as a bad link. If the URL is in the pending list it is removed
  /// (it should not be crawled), then is_url_error_lnk is asked to record it.
  /// </summary>
  /// <param name="url">URL that failed.</param>
  /// <returns>
  /// -1 if the URL was already in the bad-link list; 0 otherwise.
  /// </returns>
  public int add_error_url(string url)
  {
      // Remove is a no-op when the URL is absent; holding the list's SyncRoot
      // keeps the mutation consistent with every other writer of the pending list.
      lock (_going_to_parse.SyncRoot)
      {
          _going_to_parse.Remove(url);
      }

      // The crawled list is deliberately not consulted here: per the original
      // author's note, a URL that was already crawled is never handed out for
      // crawling again, so it cannot reach this method.

      if (is_url_error_lnk(url, true))
      {
          return -1;
      }
      return 0;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="170"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Removes the first URL from the pending list and hands it to the caller.
  /// </summary>
  /// <param name="url">
  /// Set to the removed URL on success; set to the empty string when the pending list is empty.
  /// </param>
  /// <returns>True if a URL was taken from the list; false if the list was empty.</returns>
  public bool pop_going_to_parse_url(ref string url)
  {
      // Reset first so the caller never sees a stale value on failure.
      url = "";

      lock (_going_to_parse.SyncRoot)
      {
          if (_going_to_parse.Count <= 0)
          {
              return false;
          }

          object head = _going_to_parse[0];
          url = head.ToString();
          _going_to_parse.RemoveAt(0);
          return true;
      }
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="195"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Returns the number of URLs currently in the pending list.
  /// </summary>
  /// <returns>The pending list's element count, read under the list's SyncRoot.</returns>
  public int going_to_parse_url_num()
  {
      lock (_going_to_parse.SyncRoot)
      {
          return _going_to_parse.Count;
      }
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="205"> class="hljs-ln-code"> class="hljs-ln-line">
  // Keywords consulted by IsNoParse; unset until SetNoParseKeyWord is called.
  private string[] _no_parse_keyword;

  // Filter mode consulted by IsNoParse. IsNoParse only acts on the values 1 and 2;
  // with any other value (including this default) it always returns false.
  private int _no_parse_type = 3;

  /// <summary>
  /// Configures the keyword filter used by IsNoParse.
  /// </summary>
  /// <param name="str">Keyword list as a single delimited string.</param>
  /// <param name="split">Delimiter passed to xu_common.CommonOperator.Split together with <paramref name="str"/>.</param>
  /// <param name="type">Filter mode stored for IsNoParse (it distinguishes 1, 2, and everything else).</param>
  public void SetNoParseKeyWord(string str, string split, int type)
  {
      string[] keywords = xu_common.CommonOperator.Split(str, split);
      _no_parse_keyword = keywords;
      _no_parse_type = type;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="213"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Decides whether a URL should be excluded, based on the configured keywords and mode.
  /// </summary>
  /// <remarks>
  /// Mode 1: the URL is excluded unless it contains at least one keyword.
  /// Mode 2: the URL is excluded if it contains any keyword.
  /// Any other mode: the URL is never excluded and the keywords are not consulted.
  /// Every call writes the URL and the current mode through LogMsg.LogError.
  /// </remarks>
  /// <param name="url">URL to test.</param>
  /// <returns>True if the URL should be excluded; otherwise false.</returns>
  public bool IsNoParse(string url)
  {
      LogMsg.LogError(url + ", no_parse_type="+ _no_parse_type.ToString());

      bool requireKeyword = (_no_parse_type == 1);
      bool forbidKeyword = !requireKeyword && (_no_parse_type == 2);
      if (!requireKeyword && !forbidKeyword)
      {
          return false;
      }

      // Single scan shared by both modes: does the URL contain any keyword?
      bool containsKeyword = false;
      foreach (string keyword in _no_parse_keyword)
      {
          if (url.Contains(keyword))
          {
              containsKeyword = true;
              break;
          }
      }

      if (requireKeyword)
      {
          return !containsKeyword;
      }
      return containsKeyword;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="238"> class="hljs-ln-code"> class="hljs-ln-line">
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="239"> class="hljs-ln-code"> class="hljs-ln-line"> #region write back to file: ToString
  /// <summary>
  /// Serialises the pending-URL list to a single string, one entry per line
  /// separated by CRLF with no trailing line break, and then empties the list.
  /// </summary>
  /// <returns>The joined entries, or an empty string when the list is empty.</returns>
  public string GoingParseToString()
  {
      int count = _going_to_parse.Count;

      // StringBuilder avoids re-copying the whole result on every appended entry.
      StringBuilder ret = new StringBuilder();
      for (int i = 0; i < count; i++)
      {
          if (i > 0)
          {
              ret.Append("\r\n");
          }
          ret.Append(_going_to_parse[i].ToString());
      }

      // The list is drained once its content has been captured.
      _going_to_parse.Clear();

      return ret.ToString();
  }
  /// <summary>
  /// Serialises the already-fetched URL list to a single string, one entry per line
  /// separated by CRLF with no trailing line break, and then empties the list.
  /// </summary>
  /// <returns>The joined entries, or an empty string when the list is empty.</returns>
  public string AlreadyParsedToString()
  {
      int count = _already_parse.Count;

      // StringBuilder avoids re-copying the whole result on every appended entry.
      StringBuilder ret = new StringBuilder();
      for (int i = 0; i < count; i++)
      {
          if (i > 0)
          {
              ret.Append("\r\n");
          }
          ret.Append(_already_parse[i].ToString());
      }

      // The list is drained once its content has been captured.
      _already_parse.Clear();
      return ret.ToString();
  }
  /// <summary>
  /// Serialises the failed-URL list to a single string, one entry per line
  /// separated by CRLF with no trailing line break, and then empties the list.
  /// </summary>
  /// <returns>The joined entries, or an empty string when the list is empty.</returns>
  public string ErrorUrlToString()
  {
      int count = _error_link.Count;

      // StringBuilder avoids re-copying the whole result on every appended entry.
      StringBuilder ret = new StringBuilder();
      for (int i = 0; i < count; i++)
      {
          if (i > 0)
          {
              ret.Append("\r\n");
          }
          ret.Append(_error_link[i].ToString());
      }

      // The list is drained once its content has been captured.
      _error_link.Clear();
      return ret.ToString();
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="283"> class="hljs-ln-code"> class="hljs-ln-line"> #endregion
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="284"> class="hljs-ln-code"> class="hljs-ln-line"> }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="285"> class="hljs-ln-code"> class="hljs-ln-line">}
  • class="hide-preCode-box"> class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">

    二、下载和解析网页

    以下代码是一个线程抓取文件(非网页内容,如图片、文件)的主要代码:设置一个URL,抓取后把URL放到已抓取队列,并保存网页内容到文件中; 抓取失败,则把URL放到错误队列。

    有些注释掉的 MessageBox是博主当时调试用的,大家可忽略。

    namespace data_collection.spider_core
    {
        /// <summary>
        /// Task that downloads one non-HTML resource (for example an image or a file) to disk.
        /// Set the source with <see cref="SetUrl"/> and the target with <see cref="SetFilePath"/>
        /// before the task runs. The download resumes from the current length of the target file.
        /// On success the URL is handed to <c>UrlSet.instance.add_parsed_url</c>; on failure it is
        /// logged and handed to <c>UrlSet.instance.add_error_url</c>.
        /// </summary>
        /// <remarks>
        /// The base class <c>xu_common.thread.Task</c> is a shared helper that is not shown here.
        /// </remarks>
        class DownFileTask : xu_common.thread.Task
        {
            // Number of bytes copied from the response to the file per read.
            private const int BufferSize = 1024;

            /// <summary>
            /// Performs the download. All failures are caught, logged and recorded in the
            /// error list; nothing is rethrown.
            /// </summary>
            public override void Run()
            {
                try
                {
                    // FileMode.Append creates the file when it is missing and otherwise positions
                    // at its end. FileMode is not a flags enum, so it must not be OR-ed.
                    using (FileStream fileStream = new FileStream(_filepath, FileMode.Append, FileAccess.Write))
                    {
                        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_url);

                        // Continue from the bytes that are already on disk.
                        // Convert.ToInt32 throws for files above int.MaxValue bytes; that lands in
                        // the generic catch below and the URL is recorded as failed.
                        // NOTE(review): if the server ignores the Range header and answers 200 with
                        // the full body, the body is appended to the partial file - confirm whether
                        // that case needs handling.
                        request.AddRange(Convert.ToInt32(fileStream.Length));

                        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                        using (Stream inStream = response.GetResponseStream())
                        {
                            byte[] buffer = new byte[BufferSize];
                            int readerLength;
                            while ((readerLength = inStream.Read(buffer, 0, buffer.Length)) > 0)
                            {
                                fileStream.Write(buffer, 0, readerLength);

                                // Kept from the original: push every chunk out of the FileStream
                                // buffer (presumably so an interrupted download keeps its progress).
                                fileStream.Flush();
                            }
                        }
                    }

                    // Download succeeded: move the URL to the already-fetched list.
                    UrlSet.instance.add_parsed_url(_url);
                }
                catch (WebException ex)
                {
                    if (IsRangeNotSatisfiable(ex))
                    {
                        // The requested start offset is at or past the end of the remote file,
                        // i.e. the local file already holds everything. The original tried to
                        // detect this by comparing against the request's ContentLength, which is
                        // -1 for a request without a body, so that check could never succeed.
                        // NOTE(review): assumes the local file is not longer than the remote one.
                        UrlSet.instance.add_parsed_url(_url);
                        return;
                    }

                    RecordFailure(ex);
                }
                catch (Exception ex)
                {
                    RecordFailure(ex);
                }
            }

            // Returns true when the server rejected the request with HTTP 416; closes the error response.
            private static bool IsRangeNotSatisfiable(WebException ex)
            {
                HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
                if (errorResponse == null)
                {
                    return false;
                }

                try
                {
                    return errorResponse.StatusCode == HttpStatusCode.RequestedRangeNotSatisfiable;
                }
                finally
                {
                    errorResponse.Close();
                }
            }

            // Logs the failure and moves the URL to the failed list.
            private void RecordFailure(Exception ex)
            {
                xu_common.log.LogMsg.LogError("down file:" + _url + ", error.msg:" + ex.ToString());
                UrlSet.instance.add_error_url(_url);
            }

            /// <summary>Sets the URL of the resource to download.</summary>
            public void SetUrl(string url) { _url = url; }
            private string _url;

            /// <summary>Sets the path of the local file the resource is written to.</summary>
            public void SetFilePath(string filepath) { _filepath = filepath; }
            private string _filepath;
        }
    }
    class="hide-preCode-box"> class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">

    以上是下载文件和图像的代码。
    ------------------------------------------

    以下代码是下载网页的核心代码。
    1.初始化待抓取的URL,设置浏览器代理(欺骗对应网站)。
    2.下载URL对应的网页内容。
    3.从URL的内容中解析结构化内容(根据规则),同时根据网页内容,解析到很多URL(网页的外链)。
    3.1 如果URL是需要下载的文件或图片,则新起一个线程,用上面的代码下载。
    3.2 如果URL是要继续抓取的网页URL,则放到待爬取链接。

    注(重要):此处,抓取到的内容(保存为_content)的解析,这里博主写了一个规则处理器。所以下面代码一句话带过。规则处理器比较简单,根据下载的网站不同。比如你想下载 http://xx.com/a/1.html,网站的内容,一般是通用的,我们就可以设置一个规则:只要满足URL为http://xx.com/a/*.html的,则按以下规则处理:跳过………………等代码,直到找到内容开始处,这里需要我们实际看一下http://xx.com/a/1.html的源码,一般是

    等特征开始。我们把这些内容跳过即可。

    1. class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="1">
    class="hljs-ln-code"> class="hljs-ln-line">using System;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="2"> class="hljs-ln-code"> class="hljs-ln-line">using System.Collections.Generic;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="3"> class="hljs-ln-code"> class="hljs-ln-line">using System.Text;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="4"> class="hljs-ln-code"> class="hljs-ln-line">using System.Text.RegularExpressions;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="5"> class="hljs-ln-code"> class="hljs-ln-line">using System.Net;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="6"> class="hljs-ln-code"> class="hljs-ln-line">using xu_common.log;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="7"> class="hljs-ln-code"> class="hljs-ln-line">using System.Collections;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="8"> class="hljs-ln-code"> class="hljs-ln-line">using System.IO;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="9"> class="hljs-ln-code"> class="hljs-ln-line">using System.IO.Compression;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="10"> class="hljs-ln-code"> class="hljs-ln-line">using System.Security.Policy;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="11"> class="hljs-ln-code"> class="hljs-ln-line">
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="12"> class="hljs-ln-code"> class="hljs-ln-line">namespace data_collection.spider_core
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="13"> class="hljs-ln-code"> class="hljs-ln-line">{
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="14"> class="hljs-ln-code"> class="hljs-ln-line"> class WebGetHtml
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="15"> class="hljs-ln-code"> class="hljs-ln-line"> {
  /// <summary>
  /// Initialises the instance with the URL of the web page to fetch.
  /// </summary>
  /// <param name="parse_url">Page URL; stored unchanged in <c>_url</c>.</param>
  public WebGetHtml(string parse_url)
  {
      this._url = parse_url;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="21"> class="hljs-ln-code"> class="hljs-ln-line"> string _url = null;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="22"> class="hljs-ln-code"> class="hljs-ln-line">
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="23"> class="hljs-ln-code"> class="hljs-ln-line"> string _tag_base_url = null;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="24"> class="hljs-ln-code"> class="hljs-ln-line"> string _title;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="25"> class="hljs-ln-code"> class="hljs-ln-line">
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="26"> class="hljs-ln-code"> class="hljs-ln-line"> string _base_url = null;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="27"> class="hljs-ln-code"> class="hljs-ln-line"> string _base_top_url = null;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="28"> class="hljs-ln-code"> class="hljs-ln-line">
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="29"> class="hljs-ln-code"> class="hljs-ln-line"> string _content;
  /// <summary>
  /// Computes <c>_base_top_url</c> and <c>_base_url</c> for the current page.
  /// </summary>
  /// <remarks>
  /// When <c>_tag_base_url</c> is set it is used as the base directly. Otherwise the base is the
  /// page URL without its query string and without its last path segment.
  /// In both cases a trailing "/" is removed from <c>_base_url</c>.
  /// <c>_base_top_url</c> is whatever <c>global_var.RegTopUrl</c> matches in the chosen URL
  /// (presumably scheme plus host - the pattern is defined elsewhere).
  /// </remarks>
  private void GenBaseUrl()
  {
      if (_tag_base_url == null)
      {
          _base_top_url = global_var.RegTopUrl.Match(_url).Value;

          // Drop the query string, if any.
          int queryStart = _url.IndexOf('?');
          string pathUrl = queryStart < 0 ? _url : _url.Substring(0, queryStart);

          // Cut everything after the last "/". A slash that belongs to the "://" separator
          // does not count: for "http://host" the original pattern removed the host and
          // left "http:/" as the base.
          int lastSlash = pathUrl.LastIndexOf('/');
          int schemeSeparator = pathUrl.IndexOf("://", StringComparison.Ordinal);
          bool slashIsPartOfScheme = schemeSeparator >= 0 && lastSlash <= schemeSeparator + 2;
          if (lastSlash >= 0 && !slashIsPartOfScheme)
          {
              _base_url = pathUrl.Substring(0, lastSlash + 1);
          }
          else
          {
              _base_url = pathUrl;
          }
      }
      else
      {
          _base_top_url = global_var.RegTopUrl.Match(_tag_base_url).Value;
          _base_url = _tag_base_url;
      }

      // The base is stored without a trailing slash.
      if (_base_url.EndsWith("/"))
      {
          _base_url = _base_url.Substring(0, _base_url.Length - 1);
      }
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="59"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Integer codes used to classify a link found on a page.
  /// Declared as constants so the codes cannot be reassigned at runtime.
  /// </summary>
  public class UrlType
  {
      /// <summary>Image link (presumably; the classification code is not fully visible here).</summary>
      public const int UrlTypeImg = 1;
      /// <summary>Downloadable file such as media, documents or archives.</summary>
      public const int UrlTypeFile = 2;
      /// <summary>HTML page (presumably a page to crawl further).</summary>
      public const int UrlTypeHtml = 3;
      /// <summary>Unusable link, e.g. empty or consisting only of '#'.</summary>
      public const int UrlTypeError = 4;
      /// <summary>Link that points back to the page being processed.</summary>
      public const int UrlTypeSelf = 5;
      /// <summary>Other resource file such as a style sheet.</summary>
      public const int UrlTypeOtherFile = 6;
  };
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="69"> class="hljs-ln-code"> class="hljs-ln-line">
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="70"> class="hljs-ln-code"> class="hljs-ln-line"> // 检查URL类型
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="71"> class="hljs-ln-code"> class="hljs-ln-line"> private int CheckUrl(string UrltoCheck)
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="72"> class="hljs-ln-code"> class="hljs-ln-line"> {
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="73"> class="hljs-ln-code"> class="hljs-ln-line"> if (Regex.IsMatch(UrltoCheck, "^#*$", RegexOptions.IgnoreCase | RegexOptions.Compiled))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="74"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeError;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="75"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck == _url || (UrltoCheck + "/") == _url || UrltoCheck == (_url + "/"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="76"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeSelf;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="77"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".css"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="78"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeOtherFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="79"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".wmv"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="80"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="81"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".asf"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="82"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="83"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".mp3"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="84"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="85"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".avi"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="86"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="87"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".mpg"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="88"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="89"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".mpeg"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="90"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="91"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".rmvb"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="92"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="93"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".rm"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="94"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="95"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".doc"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="96"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="97"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".rar"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="98"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="99"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".zip"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="100"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="101"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".tar"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="102"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="103"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".xls"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="104"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="105"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".pdf"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="106"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeFile;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="107"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".jpg"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="108"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeImg;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="109"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".jpeg"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="110"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeImg;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="111"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".ico"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="112"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeImg;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="113"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".gif"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="114"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeImg;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="115"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".bmp"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="116"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeImg;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="117"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.EndsWith(".png"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="118"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeImg;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="119"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.StartsWith("ftp://"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="120"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeError;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="121"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.StartsWith("telnet://"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="122"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeError;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="123"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.StartsWith("mms://"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="124"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeError;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="125"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.StartsWith("rstp://"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="126"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeError;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="127"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.StartsWith("mailto"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="128"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeError;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="129"> class="hljs-ln-code"> class="hljs-ln-line"> else if (UrltoCheck.StartsWith("javascript"))
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="130"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeError;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="131"> class="hljs-ln-code"> class="hljs-ln-line"> else
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="132"> class="hljs-ln-code"> class="hljs-ln-line"> return UrlType.UrlTypeHtml;
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="133"> class="hljs-ln-code"> class="hljs-ln-line"> }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="134"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Reports whether a URL belongs to the site that is being crawled.
  /// </summary>
  /// <param name="NewUrltoCheck">The URL to test.</param>
  /// <returns>Whatever <c>global_var.Instance.IsInSite</c> answers for the URL.</returns>
  private bool CheckUrlThisSite(string NewUrltoCheck)
  {
      bool belongsToSite = global_var.Instance.IsInSite(NewUrltoCheck);
      return belongsToSite;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="141"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Turns a link found on a page into an absolute URL. Some pages use relative
  /// links instead of links that start with a scheme; those are resolved here.
  /// </summary>
  /// <remarks>
  /// Resolution rules:
  ///   http://... or https://...  returned unchanged.
  ///   /x.aspx                    appended to <c>_base_top_url</c>.
  ///   x.aspx or ./x.aspx         appended to <c>_base_url</c> after a "/".
  ///   ../x.aspx                  appended to <c>_base_url</c> with one trailing
  ///                              "/segment" removed for every leading "../".
  /// If there are more "../" prefixes than removable segments, the result is
  /// built on <c>_base_top_url</c> instead.
  /// </remarks>
  /// <param name="incomeUrl">The link text as it appeared in the page.</param>
  /// <returns>
  /// The resolved URL, or <c>null</c> when the link starts with "." but with
  /// neither "./" nor "../" (for example ".hidden").
  /// </returns>
  private string GenUrl(string incomeUrl)
  {
      // Already absolute: nothing to resolve.
      if (incomeUrl.StartsWith("http://", StringComparison.Ordinal) ||
          incomeUrl.StartsWith("https://", StringComparison.Ordinal))
      {
          return incomeUrl;
      }

      // Root-relative link: hang it off the top-level URL.
      if (incomeUrl.StartsWith("/", StringComparison.Ordinal))
      {
          return _base_top_url + incomeUrl;
      }

      // Peel off leading "./" and "../" prefixes, counting how many levels to climb.
      int parent_depth = 0;
      while (incomeUrl.StartsWith(".", StringComparison.Ordinal))
      {
          if (incomeUrl.StartsWith("../", StringComparison.Ordinal))
          {
              parent_depth += 1;
              // Strings are immutable: Substring returns a new string, so the result
              // must be assigned back or this loop never makes progress.
              incomeUrl = incomeUrl.Substring(3);
          }
          else if (incomeUrl.StartsWith("./", StringComparison.Ordinal))
          {
              incomeUrl = incomeUrl.Substring(2);
          }
          else
          {
              return null;
          }
      }

      // Climb one path segment per "../" by cutting at the last "/".
      string head_str = _base_url;
      for (int i = 0; i < parent_depth; i++)
      {
          int qposition = head_str.LastIndexOf('/');
          if (qposition < 0)
          {
              // No "/" left to cut at: fall back to the top-level URL.
              head_str = _base_top_url;
              break;
          }
          head_str = head_str.Substring(0, qposition);
      }

      // Too many "../" can cut into the "scheme://" prefix itself (leaving e.g.
      // "http:/" or "http:"); in that case fall back to the top-level URL.
      bool cut_into_scheme =
          (head_str.StartsWith("http:", StringComparison.Ordinal) && head_str.Length < "http://".Length) ||
          (head_str.StartsWith("https:", StringComparison.Ordinal) && head_str.Length < "https://".Length);
      if (cut_into_scheme)
      {
          head_str = _base_top_url;
      }

      return head_str + "/" + incomeUrl;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="210"> class="hljs-ln-code"> class="hljs-ln-line">
  /// <summary>
  /// Downloads the page at <c>_url</c> and stores its decoded text in <c>_content</c>.
  /// </summary>
  /// <remarks>
  /// The raw body is first decoded with <see cref="Encoding.Default"/> only to look
  /// for a charset declaration via <c>global_var.RegCharset</c>. If none is found,
  /// the charset reported by the response is used. A missing or unrecognised
  /// charset name falls back to <see cref="Encoding.Default"/>.
  /// Exceptions raised while sending the request or reading the body are not
  /// caught here; the response and its stream are released in every case.
  /// </remarks>
  /// <returns>
  /// <c>true</c> when the page was read and decoded; <c>false</c> when the server
  /// answered with a status other than 200 OK.
  /// </returns>
  private bool WebGetContent()
  {
      HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(_url);
      // Send a browser user-agent string with the request.
      myRequest.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36";

      using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
      {
          if (myResponse.StatusCode != HttpStatusCode.OK)
          {
              LogMsg.LogError("get url:" + _url + ", error. status:" + myResponse.StatusDescription);
              return false;
          }

          // Read the whole body as raw bytes; the text encoding is decided afterwards.
          bool is_gzip = myResponse.ContentEncoding != null &&
              myResponse.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);

          Stream body_stream = myResponse.GetResponseStream();
          if (is_gzip)
          {
              // Disposing the GZipStream also disposes the response stream it wraps.
              body_stream = new GZipStream(body_stream, CompressionMode.Decompress);
          }

          byte[] response_bytes;
          using (body_stream)
          using (MemoryStream collected = new MemoryStream())
          {
              byte[] chunk = new byte[8192];
              int read_count;
              while ((read_count = body_stream.Read(chunk, 0, chunk.Length)) > 0)
              {
                  collected.Write(chunk, 0, read_count);
              }
              response_bytes = collected.ToArray();
          }

          // Provisional decode, used only to search the markup for a charset declaration.
          string tmphtml = Encoding.Default.GetString(response_bytes);

          string encoding_name = "";
          Match charset_match = global_var.RegCharset.Match(tmphtml);
          if (charset_match.Success)
          {
              encoding_name = charset_match.Groups["charset"].Value;
          }
          else if (!string.IsNullOrEmpty(myResponse.CharacterSet))
          {
              encoding_name = myResponse.CharacterSet;
          }

          Encoding encoding = Encoding.Default;
          if (!string.IsNullOrEmpty(encoding_name))
          {
              try
              {
                  encoding = Encoding.GetEncoding(encoding_name);
              }
              catch (ArgumentException)
              {
                  // Unknown charset name: keep the default encoding.
                  encoding = Encoding.Default;
              }
              catch (NotSupportedException)
              {
                  // Charset not available on this platform: keep the default encoding.
                  encoding = Encoding.Default;
              }
          }

          // Final decode of the page content.
          _content = encoding.GetString(response_bytes);
          return true;
      }
  }
  /// <summary>
  /// Fetches the page at <c>_url</c>, offers it to the rule set for saving, then
  /// extracts every link from the page and either queues it for crawling (HTML
  /// pages belonging to this site) or schedules it for download (files/images
  /// accepted by a rule).
  /// </summary>
  /// <remarks>
  /// The method never throws: any exception is logged and <c>_url</c> is recorded
  /// as an error URL. On success <c>_url</c> is recorded as parsed.
  /// </remarks>
  public void GetCode()
  {
      try
      {
          // WebGetContent fills _content and reports failure by returning false.
          if (!WebGetContent())
          {
              UrlSet.instance.add_error_url(_url);
              LogMsg.LogInfo("url: [" + _url + "] error!");
              return;
          }

          // Decode HTML-escaped ampersands so that query strings inside the
          // extracted links are usable as real URLs.
          _content = _content.Replace("&amp;", "&");

          // Run the <base> regex once; remember the tag only when it is present.
          Match baseTag = global_var.RegBaseUrl.Match(_content);
          if (baseTag.Success)
          {
              _tag_base_url = baseTag.Value;
          }

          // Now that we know whether a base tag exists, build the base used to
          // resolve relative links on this page.
          GenBaseUrl();

          _title = global_var.RegTitle.Match(_content).Value;
          LogMsg.LogDebug("url: [" + _url + "] title: [" + _title + "]!");

          // A false result means this URL matches no rule: nothing is saved,
          // but its links are still extracted below.
          bool save = RoleSet.Instance.OSave(_url, _content, _title);

          int num = 0;

          // Pass 1: links matched by RegAnchor. The raw match is classified
          // first and only then resolved through GenUrl.
          foreach (Match anchor in global_var.RegAnchor.Matches(_content))
          {
              string rawUrl = anchor.Value;
              if (rawUrl == "")
              {
                  continue;
              }

              int url_type = CheckUrl(rawUrl);
              if (IsIgnoredLinkType(url_type))
              {
                  continue;
              }

              if (EnqueueOrDownloadLink(GenUrl(rawUrl), url_type))
              {
                  num++;
              }
          }

          // Pass 2: links matched by RegAnchor2. These are prefixed with
          // "http://" and are NOT passed through GenUrl.
          foreach (Match anchor in global_var.RegAnchor2.Matches(_content))
          {
              if (anchor.Value == "")
              {
                  continue;
              }

              string absoluteUrl = "http://" + anchor.Value;
              int url_type = CheckUrl(absoluteUrl);
              if (IsIgnoredLinkType(url_type))
              {
                  continue;
              }

              if (EnqueueOrDownloadLink(absoluteUrl, url_type))
              {
                  num++;
              }
          }

          UrlSet.instance.add_parsed_url(_url);
          LogMsg.LogDebug(_url + ", have:" + num.ToString() + " urls, now going parse url num:" + UrlSet.instance.going_to_parse_url_num() + (save ? "save" : "nosave"));
      }
      catch (Exception ex)
      {
          LogMsg.LogError("error:" + ex.ToString());
          UrlSet.instance.add_error_url(_url);
      }
  }

  /// <summary>
  /// Tells whether a link of the given type is dropped without further processing.
  /// </summary>
  /// <param name="url_type">Type code returned by <c>CheckUrl</c>.</param>
  /// <returns><c>true</c> for error links, "other file" links and self links.</returns>
  private static bool IsIgnoredLinkType(int url_type)
  {
      return url_type == UrlType.UrlTypeError
          || url_type == UrlType.UrlTypeOtherFile
          || url_type == UrlType.UrlTypeSelf;
  }

  /// <summary>
  /// Handles one extracted link: HTML links of this site are queued for
  /// crawling; any other link that matches a rule is scheduled for download.
  /// </summary>
  /// <param name="newUrl">The link to handle, already in its final form.</param>
  /// <param name="url_type">Type code returned by <c>CheckUrl</c> for the link.</param>
  /// <returns>
  /// <c>true</c> only when an HTML link was queued for crawling (this is what
  /// the caller counts); <c>false</c> otherwise, including for downloads.
  /// </returns>
  private bool EnqueueOrDownloadLink(string newUrl, int url_type)
  {
      if (url_type == UrlType.UrlTypeHtml)
      {
          // HTML pages must belong to this site and must not be excluded by
          // UrlSet.IsNoParse before they are queued.
          if (CheckUrlThisSite(newUrl) && !UrlSet.instance.IsNoParse(newUrl))
          {
              UrlSet.instance.add_going_parse_url(newUrl);
              return true;
          }
          return false;
      }

      // Files and images are not checked against the current site, but they
      // must match a rule; if one matches, a download task is started.
      Role role = RoleSet.Instance.OIsMatchRole(newUrl);
      if (role == null)
      {
          return false;
      }

      DownFileTask downfile_task = new DownFileTask();
      downfile_task.SetUrl(newUrl);

      // File name = hex hash of the URL + "_" + last path segment, appended to
      // the rule's save path by plain concatenation.
      // NOTE(review): string.GetHashCode is not guaranteed to be stable across
      // processes/runtimes, so names may differ between runs - confirm this is acceptable.
      string filename = newUrl.GetHashCode().ToString("X") + "_" + Path.GetFileName(newUrl);
      downfile_task.SetFilePath(role.GetSaveAsPath() + filename);

      // NOTE(review): the file URL is also added to the going-to-parse set
      // before the task is handed to ThreadPoll; presumably this records the
      // URL as already seen - confirm against UrlSet.
      UrlSet.instance.add_going_parse_url(newUrl);
      xu_common.thread.ThreadPoll.Execute(downfile_task);
      return false;
  }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="454"> class="hljs-ln-code"> class="hljs-ln-line"> }
  • class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="455"> class="hljs-ln-code"> class="hljs-ln-line">}
  • class="hide-preCode-box"> class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">

    三、管理和调度任务

    这里管理和调度任务不再赘述,如上面下载文件和图片一样,可以使用多线程进行下载。这些管理代码不难,且上面已有示例,下载网页的任务调度类似:待抓取的Url保存在Url队列中,按顺序启动多线程去获取内容。

    四、保存数据

    这块代码比较隐私,博主在上面的代码处也有说明。具体做法是:根据对应的网站,在获取网页源码内容后,自己编写规则进行解析,解析到对应的内容后,直接调用DB操作类写入数据库。

    以上就是本次分享的内容。如果有兴趣的,欢迎关注博主私聊。

    博主其它经典原创:《管理心得--工作目标应该是解决业务问题,而非感动自己》、《管理心得--如何高效进行跨部门合作》、《管理心得--员工最容易犯的错误:以错误去掩盖错误》、《技术心得--如何成为优秀的架构师》、《管理心得--如何成为优秀的架构师》、《管理心理--程序员如何选择职业赛道》。欢迎大家阅读。

    >>
    注:本文转载自blog.csdn.net的借雨醉东风的文章"https://blog.csdn.net/weixin_60437218/article/details/136842621"。版权归原作者所有,此博客不拥有其著作权,亦不承担相应法律责任。如有侵权,请联系我们删除。
    复制链接

    评论记录:

    未查询到任何数据!