class="hljs-ln-code"> class="hljs-ln-line"> def _init_browser(): class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="3"> class="hljs-ln-code"> class="hljs-ln-line"> """ 配置 Selenium 浏览器驱动 """ class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="4"> class="hljs-ln-code"> class="hljs-ln-line"> driver = webdriver.Edge() class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="5"> class="hljs-ln-code"> class="hljs-ln-line"> return driver class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
(三)搜狐搜索网站的url设置
搜狐搜索的网址为'https://search.sohu.com/',经过测试只需要设置keyword参数就可获取到搜索结果,其它的参数可以不用设置,故此,url设置成如下形式即可:
from urllib.parse import quote

kw = '要搜索的关键字'
# Percent-encode the keyword so that spaces, '&', '#', '=' and non-ASCII
# characters are sent as part of the keyword value instead of altering the URL.
url = f'https://search.sohu.com/?keyword={quote(kw, safe="")}'
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
(四)搜狐搜索结果返回形式
搜狐搜索是动态网页,输入关键字点搜索后,会显示10条结果,将浏览器右侧的滑动条往下拖,滑动到页面底部,会刷出新的搜索结果,每拖一次增加10条结果。为了获得更多的结果,需要在浏览器中多拖几次。我在代码中设置了滑动到底部5次,可以获得50条结果;为了防止程序执行过快、结果还没有刷新出来,每执行一次滑动到页面底部的操作后就延迟1.5秒。
另外,初次加载页面也需要等待一段时间,这里使用了wait.until方法,检查id为'new-list-loading'的元素是否出现,以判定页面是否加载完毕。
# Load the search page.
driver.get(url)

# Explicit wait (up to 60 seconds) until the element that follows the
# search results becomes visible.
results_loaded = ec.visibility_of_element_located((By.ID, 'new-list-loading'))
wait = WebDriverWait(driver, 60)
wait.until(results_loaded)

# Scroll to the bottom of the page five times so that as many results as
# possible get loaded, pausing after each scroll.
scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
for _ in range(5):
    driver.execute_script(scroll_to_bottom)
    time.sleep(1.5)
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
(五)数据的抓取
通过网页浏览器的开发人员工具对页面数据进行分析,页面中搜索结果所在的DIV元素都带有'data-index'和'data-spm-data'两个属性,因此通过find_elements方法抓取包含这两个特征的DIV元素,就可以获得所有的搜索结果。
# Select the key nodes that contain the search results: every <div> that
# carries both the data-index and the data-spm-data attribute.
result_selector = 'div[data-index][data-spm-data]'
result_blocks = driver.find_elements(By.CSS_SELECTOR, result_selector)
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
但上述的操作包含了网页源代码中的所有内容,而我们只是需要其中的一些关键数据,如新闻标题、链接、内容摘要、数据来源、发布时间等信息。这就需要对上面操作得到的result_blocks做进一步处理,这里不细说了,可见后面的代码展示。
(六)实现效果
使用selenium来进行爬取的效果如下:

(调用Edge浏览器,自动填入url并执行搜索和爬取操作)

(pycharm中运行本代码的状况)

(结果保存到了xlsx文件中,因省略了部分代码,此展示只抓取了标题和链接信息)
三、代码展示
最后放上完整代码,供参考。
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="1"> class="hljs-ln-code"> class="hljs-ln-line">from selenium import webdriver
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="2"> class="hljs-ln-code"> class="hljs-ln-line">from selenium.webdriver.common.by import By
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="3"> class="hljs-ln-code"> class="hljs-ln-line">from selenium.webdriver.support.ui import WebDriverWait
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="4"> class="hljs-ln-code"> class="hljs-ln-line">from selenium.webdriver.support import expected_conditions as ec
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="5"> class="hljs-ln-code"> class="hljs-ln-line">from queue import Queue
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="6"> class="hljs-ln-code"> class="hljs-ln-line">from datetime import datetime
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="7"> class="hljs-ln-code"> class="hljs-ln-line">import time
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="8"> class="hljs-ln-code"> class="hljs-ln-line">import xlsxwriter
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="9"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="10"> class="hljs-ln-code"> class="hljs-ln-line">
class CrawlSohu:
    """Crawl Sohu search results with a Selenium-driven Edge browser.

    Start/finish log lines are put on the queue passed to the constructor;
    per-result progress is printed to stdout.
    """

    WEBSITE = '搜狐'  # site label used in log lines and in every result row
    URL = 'https://search.sohu.com/'
    DELAY_MIN = 2  # lower bound of a delay range (not referenced in this class)
    DELAY_MAX = 5  # upper bound of a delay range (not referenced in this class)

    def __init__(self, queue: Queue):
        """Store the log queue and zero the counters.

        :param queue: Queue that receives the crawler's log messages.
        """
        self._queue = queue
        self.mOder = 0  # running sequence number of the extracted results
        self.mPage = 0  # page number reported in log/console output

    @staticmethod
    def _init_browser():
        """Create and return the Selenium driver (Edge)."""
        return webdriver.Edge()

    @classmethod
    def _build_url(cls, kw: str) -> str:
        """Return the search URL for *kw* with the keyword percent-encoded.

        Encoding keeps spaces, '&', '#', '=' and non-ASCII characters inside
        the ``keyword`` value instead of letting them alter the URL.
        """
        from urllib.parse import quote

        return f"{cls.URL}?keyword={quote(kw, safe='')}"

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Strip *raw* and remove every space and newline from it."""
        return raw.strip().replace(" ", '').replace("\n", '')

    def crawl_sohu(self, kw: str, scroll_times: int = 5, scroll_pause: float = 1.5):
        """Crawl the search results for one keyword.

        Opens the search page, waits for it to load, scrolls to the bottom
        ``scroll_times`` times so the page loads further results, then parses
        whatever is present. Any exception raised while loading or parsing is
        printed and swallowed (best effort), so the returned list may be empty.
        The browser is always closed.

        :param kw: search keyword
        :param scroll_times: number of scroll-to-bottom operations (default 5)
        :param scroll_pause: seconds to sleep after each scroll (default 1.5)
        :return: list of result rows as produced by _parse_html_by_selenium
        """
        results = []
        driver = self._init_browser()
        url = self._build_url(kw)

        # Counters are per crawl: without resetting mOder a second call would
        # continue the numbering and report a cumulative count in the log.
        self.mPage = 1
        self.mOder = 0
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self._queue.put(f'[{now}]: {self.WEBSITE} 开始爬虫任务\n')

        try:
            driver.get(url)
            # Explicit wait (up to 60 s) for the element that follows the
            # search results; its visibility is used as the "page loaded" signal.
            wait = WebDriverWait(driver, 60)
            wait.until(ec.visibility_of_element_located((By.ID, 'new-list-loading')))

            for _ in range(scroll_times):
                # Scrolling to the bottom makes the page load more results.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Give the page time to render the newly loaded results.
                time.sleep(scroll_pause)

            results.extend(self._parse_html_by_selenium(driver))
        except Exception as e:
            print(f"{self.WEBSITE} 发生错误:", e)
        finally:
            driver.quit()
            print(f"{self.WEBSITE} 采集结束!")

        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self._queue.put(f'[{now}]: {self.WEBSITE} 任务完成,爬取{self.mPage}页,获取结果{self.mOder}条\n')

        return results

    # The annotation is quoted so it is not evaluated when the class is
    # defined; webdriver.Remote is Selenium's base WebDriver class.
    def _parse_html_by_selenium(self, driver: "webdriver.Remote"):
        """Extract the search results from the page currently loaded in *driver*.

        :param driver: Selenium WebDriver showing the search-result page
        :return: list of rows
                 ``[number, website, title, link, content, source, release_time]``;
                 content, source and release_time are always empty strings here
                 because their extraction is omitted in this listing.
        """
        # Result cards are the <div> elements carrying both attributes.
        result_blocks = driver.find_elements(By.CSS_SELECTOR, 'div[data-index][data-spm-data]')
        print(f'{self.WEBSITE} 第 {self.mPage} 页 结果数量为:{len(result_blocks)}')

        data = []
        for block in result_blocks:
            # A card keeps its content in one of two container layouts.
            content_tags = (
                block.find_elements(By.XPATH, './/div[@class="plain-content"]')
                or block.find_elements(By.XPATH, './/div[@class="cards-content-right"]')
            )
            if not content_tags:
                continue

            # The first <a> inside the container supplies title and link.
            anchors = content_tags[0].find_elements(By.TAG_NAME, 'a')
            if not anchors:
                continue
            title = self._clean_title(anchors[0].text)
            link = anchors[0].get_attribute('href')

            # Extraction of these fields is omitted in this listing.
            content = ''
            source = ''
            release_time = ''

            self.mOder += 1
            data.append([self.mOder, self.WEBSITE, title, link, content, source, release_time])
            # Echo the result to the console.
            print(self.mOder, ". ", title)
            print(link)
            print(" ")
        return data
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="124"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="125"> class="hljs-ln-code"> class="hljs-ln-line">
def print_logs(s_queue: Queue):
    """Print every log message currently waiting in the queue, then return.

    Messages are taken with ``get_nowait()`` rather than an ``empty()`` check
    followed by a blocking ``get()``: if another consumer removed the last
    item between those two calls, the blocking ``get()`` would never return.

    :param s_queue: queue holding the crawler's log messages
    :return: None
    """
    from queue import Empty  # local import: the file header only imports Queue

    print('输出日志')
    while True:
        try:
            data = s_queue.get_nowait()
        except Empty:
            # Queue drained: nothing more to print.
            break
        print(f"{data}")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="139"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="140"> class="hljs-ln-code"> class="hljs-ln-line">
def save_data(website, kw, num, datas):
    """Save the search results to an xlsx file inside the 'results' sub-directory.

    The file is named ``{website}-{kw}-数据{num}条.xlsx``. Characters that are not
    allowed in file names are replaced by '_', and the worksheet name is
    sanitised and cut to 31 characters, because XlsxWriter rejects worksheet
    names that are longer or contain any of ``[ ] : * ? / \\``.

    :param website: website label
    :param kw: search keyword
    :param num: number of records (only used in the file name)
    :param datas: list of result rows; the first seven items of each row are
                  written to columns A-G
    :return: None
    """
    import os
    import re

    subdirectory_name = 'results'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(subdirectory_name, exist_ok=True)

    # File name: replace characters that are illegal in Windows/POSIX paths.
    file_stem = re.sub(r'[\\/:*?"<>|]', '_', f"{website}-{kw}-数据{num}条")
    file_path = os.path.join(subdirectory_name, f"{file_stem}.xlsx")

    # Worksheet name: no [ ] : * ? / \, no leading/trailing apostrophe, max 31 chars.
    sheet_name = re.sub(r'[\[\]:*?/\\]', '_', f"{website}-{kw}")[:31].strip("'")

    headers = ("序号", "网站", "标题", "链接", "内容摘要", "信息来源", "时间")

    # The context manager closes (and thereby writes) the workbook even if a
    # row fails to write.
    with xlsxwriter.Workbook(file_path) as workbook:
        worksheet = workbook.add_worksheet(sheet_name)
        worksheet.write_row(0, 0, headers)
        for row, data in enumerate(datas, start=1):
            worksheet.write_row(row, 0, data[:len(headers)])
        # Column widths.
        worksheet.set_column('C:C', 45)  # title
        worksheet.set_column('D:D', 30)  # link
        worksheet.set_column('E:E', 60)  # content summary
        worksheet.set_column('F:F', 15)  # source
        worksheet.set_column('G:G', 15)  # time
    print('数据已保存到xlsx文件中')
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="188"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="189"> class="hljs-ln-code"> class="hljs-ln-line">
if __name__ == '__main__':
    # Search keyword and the queue that carries the crawler's log messages.
    keyword = '美国Tik Tok难民'
    log_queue = Queue()

    # Run the crawl; the result is a list of rows.
    crawler = CrawlSohu(log_queue)
    found = crawler.crawl_sohu(keyword)
    total = len(found)
    print(f'\n搜索结果数量:{total}')

    # Print the collected log messages.
    print_logs(log_queue)

    # Save the crawled results to an xlsx file.
    save_data('搜狐', keyword, total, found)
class="hide-preCode-box">
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
data-report-view="{"mod":"1585297308_001","spm":"1001.2101.3001.6548","dest":"https://blog.csdn.net/bahamutj/article/details/145359543","extend1":"pc","ab":"new"}">>
一、引言
我从24年11月份开始学习网络爬虫应用开发,经过2个来月的努力,于这两天终于完成了开发一款网络爬虫软件的目标。这里对本次软件开发进行一下回顾总结。
在之前的学习中,我是尝试了用requests和BeautifulSoup库来实现爬虫任务,但在测试过程中有部分网站的反爬措施会让爬取任务失败(比如搜*狐),这给我的网络爬虫软件开发造成了很大的麻烦。后来通过不断的学习,发现使用selenium库来进行爬取,使得爬取任务更像是人类的上网浏览行为,能够有效避开这些网站的反爬机制。
之后,我将所有爬虫任务模块代码都重写了一遍,全部改成了用selenium库来实现,今天就用搜*狐作为样板,展示一下学习成果。
二、功能实现
(一)用到的库
本日志中的代码用到了以下几个库:
selenium:是一个用于Web应用程序测试的工具,可以模拟真实用户在浏览器中的操作,广泛应用于自动化测试和数据抓取领域。用于实现数据爬取。
queue:提供了线程安全的队列实现,可以有效地解决多线程编程中数据共享和同步的问题。因爬虫任务放在多线程中执行,爬虫的日志信息通过queue传递给主线程。
datetime:是用于处理日期和时间的强大工具。用于获取日期和时间。
time:也是处理与时间相关的库。用到了time.sleep方法实现延迟。
xlsxwriter:用于将爬取的数据保存到xlsx中。
(二)配置 Selenium 浏览器驱动
Selenium支持多种浏览器,我只尝试了Chrome和Edge,发现使用Chrome时,每次打开浏览器都要很长时间,而使用Edge则快很多,因此,我采用了Edge。
@staticmethod
def _init_browser():
    """Create and return the Selenium browser driver (Microsoft Edge)."""
    return webdriver.Edge()
(三)搜*狐*搜索网站的url设置
搜*狐*搜索的网址为'https://search.sohu.com/',经过测试只需要设置keyword参数就可获取到搜索结果,其它的参数可以不用设置,故此,url设置成如下形式即可:
from urllib.parse import quote

kw = '要搜索的关键字'
# Percent-encode the keyword so that spaces, non-ASCII text and reserved
# characters such as '&' or '#' cannot corrupt the query string.
url = f"https://search.sohu.com/?keyword={quote(kw, safe='')}"
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
(四)搜*狐*搜索结果返回形式
搜*狐*搜索是动态网页,输入关键字点搜索后,会显示10条结果,将浏览器右侧的滑动条往下拖,滑动到页面底部,会刷出新的搜索结果,每拖一次增加10条结果。为了获得更多的结果,需要在浏览器中多拖几次。我在代码中设置了滑动到底部5次,可以获得50条结果;为了防止程序执行过快、结果还没有刷新出来,每执行一次滑动到页面底部的操作后都会延迟1.5秒。
另外,初次加载页面也需要等待一段时间,这里使用了wait.until方法,检查id为'new-list-loading'的元素是否出现,以判定页面是否加载完毕。
# Open the search page.
driver.get(url)
# Explicit wait (up to 60 s) until the element that follows the result
# list becomes visible.
wait = WebDriverWait(driver, 60)
loading_marker = (By.ID, 'new-list-loading')
wait.until(ec.visibility_of_element_located(loading_marker))

# Scroll to the bottom of the page several times so that as many search
# results as possible get loaded; pause after each scroll.
for _ in range(5):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1.5)
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
(五)数据的抓取
通过网页浏览器的开发人员工具对页面数据进行分析,页面中搜索结果所在的DIV元素都带有'data-index'和'data-spm-data'两个属性,因此通过find_elements方法抓取同时包含这两个特征的DIV元素,就可以获得所有的搜索结果。
# Collect every DIV that carries both attributes marking a search result.
result_selector = 'div[data-index][data-spm-data]'
result_blocks = driver.find_elements(By.CSS_SELECTOR, result_selector)
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
但上述的操作包含了网页源代码中的所有内容,而我们只是需要其中的一些关键数据,如新闻标题、链接、内容摘要、数据来源、发布时间等信息。这就需要对上面操作得到的result_blocks做进一步处理,这里不细说了,可见后面的代码展示。
(六)实现效果
使用selenium来进行爬取的效果如下:

(调用Edge浏览器,自动填入url并执行搜索和爬取操作)

(pycharm中运行本代码的状况)

(结果保存到了xlsx文件中,因省略了部分代码,此展示只抓取了标题和链接信息)
三、代码展示
最后放上完整代码,供参考。
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="1"> class="hljs-ln-code"> class="hljs-ln-line">from selenium import webdriver
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="2"> class="hljs-ln-code"> class="hljs-ln-line">from selenium.webdriver.common.by import By
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="3"> class="hljs-ln-code"> class="hljs-ln-line">from selenium.webdriver.support.ui import WebDriverWait
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="4"> class="hljs-ln-code"> class="hljs-ln-line">from selenium.webdriver.support import expected_conditions as ec
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="5"> class="hljs-ln-code"> class="hljs-ln-line">from queue import Queue
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="6"> class="hljs-ln-code"> class="hljs-ln-line">from datetime import datetime
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="7"> class="hljs-ln-code"> class="hljs-ln-line">import time
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="8"> class="hljs-ln-code"> class="hljs-ln-line">import xlsxwriter
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="9"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="10"> class="hljs-ln-code"> class="hljs-ln-line">
class CrawlSohu:
    """Crawl Sohu search results with a Selenium-driven Edge browser.

    Start/finish log messages are pushed onto the queue handed to the
    constructor; progress and errors are also printed to stdout.
    """
    WEBSITE = '搜狐'  # site label used in log messages and result rows
    URL = 'https://search.sohu.com/'
    DELAY_MIN = 2  # lower bound for delays
    DELAY_MAX = 5  # upper bound for delays
    SCROLL_TIMES = 5  # number of scroll-to-bottom operations per crawl
    SCROLL_PAUSE = 1.5  # seconds to sleep after each scroll

    def __init__(self, queue: Queue):
        self._queue = queue  # Queue object that receives log messages
        self.mOder = 0  # sequence number of the last collected result
        self.mPage = 0  # page number

    @staticmethod
    def _init_browser():
        """Create and return the Selenium browser driver (Microsoft Edge)."""
        return webdriver.Edge()

    def crawl_sohu(self, kw: str):
        """
        Crawl the search results for one keyword.

        :param kw: search keyword
        :return results: list of rows, each being
            [order, website, title, link, content, source, release_time]
        """
        # Local import: the file's import block does not provide ``quote``.
        from urllib.parse import quote

        results = []  # all collected search results
        driver = self._init_browser()
        # Percent-encode the keyword so spaces, non-ASCII text and reserved
        # characters ('&', '#', '%', ...) cannot corrupt the query string.
        url = f"{self.URL}?keyword={quote(kw, safe='')}"

        # Reset the per-crawl counters so that reusing this instance does not
        # carry the numbering (and the reported total) over from a prior run.
        self.mPage = 1
        self.mOder = 0
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self._queue.put(f'[{now}]: {self.WEBSITE} 开始爬虫任务\n')

        try:
            # Open the search page.
            driver.get(url)
            # Explicit wait (up to 60 s) until the element that follows the
            # result list becomes visible.
            wait = WebDriverWait(driver, 60)
            wait.until(ec.visibility_of_element_located((By.ID, 'new-list-loading')))

            # Scroll to the bottom repeatedly so that as many search results
            # as possible get loaded, pausing after each scroll.
            for _ in range(self.SCROLL_TIMES):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(self.SCROLL_PAUSE)

            # Parse the results out of the loaded page.
            results.extend(self._parse_html_by_selenium(driver))
        except Exception as e:
            # Best effort: report the problem and return what was collected.
            print(f"{self.WEBSITE} 发生错误:", e)
        finally:
            driver.quit()  # always close the browser
            print(f"{self.WEBSITE} 采集结束!")

        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self._queue.put(f'[{now}]: {self.WEBSITE} 任务完成,爬取{self.mPage}页,获取结果{self.mOder}条\n')

        return results

    def _parse_html_by_selenium(self, driver: webdriver.Edge):
        """
        Extract the search results from the page currently loaded in the browser.

        :param driver: WebDriver object whose current page holds the results
        :return data: list of rows, each being
            [order, website, title, link, content, source, release_time]
        """
        # Every search result sits in a DIV carrying both marker attributes.
        result_blocks = driver.find_elements(By.CSS_SELECTOR, 'div[data-index][data-spm-data]')
        print(f'{self.WEBSITE} 第 {self.mPage} 页 结果数量为:{len(result_blocks)}')

        data = []
        for block in result_blocks:
            # The content container comes in one of two layouts; the second
            # is only looked up when the first is absent.
            content_tags = (
                block.find_elements(By.XPATH, './/div[@class="plain-content"]')
                or block.find_elements(By.XPATH, './/div[@class="cards-content-right"]')
            )
            if not content_tags:
                continue
            content_tag = content_tags[0]

            # The first anchor inside the container provides title and link.
            title_link = content_tag.find_elements(By.TAG_NAME, 'a')
            if not title_link:
                continue
            title = title_link[0].text.strip()
            link = title_link[0].get_attribute('href')

            # Summary extraction is omitted in the published code.
            content = ''
            # Source and release-time extraction are omitted as well.
            source = ''
            release_time = ''

            # Clean-up: remove spaces and line breaks from the title.
            title = title.replace(" ", '').replace("\n", '')

            self.mOder += 1
            data.append([self.mOder, self.WEBSITE, title, link, content, source, release_time])
            # Echo the result on screen.
            print(self.mOder, ". ", title)
            print(link)
            print(" ")
        return data
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="124"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="125"> class="hljs-ln-code"> class="hljs-ln-line">
def print_logs(s_queue: Queue):
    """
    Print every log message currently waiting in the queue.

    Messages are removed from the queue as they are printed. The function
    returns as soon as the queue is found empty and never blocks.

    :param s_queue: queue holding the crawler's log messages
    :return: None
    """
    # Local import: only ``Queue`` is imported at the top of the file.
    from queue import Empty

    print('输出日志')
    while True:
        try:
            # get_nowait() closes the gap between an empty() check and a
            # blocking get(): if another consumer drained the queue in
            # between, get() would wait forever.
            data = s_queue.get_nowait()
        except Empty:
            break
        print(f"{data}")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="139"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="140"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="141"> class="hljs-ln-code"> class="hljs-ln-line">def save_data(website, kw, num, datas):
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="142"> class="hljs-ln-code"> class="hljs-ln-line"> """
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="143"> class="hljs-ln-code"> class="hljs-ln-line"> 将数据保存到xlsx文件中
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="144"> class="hljs-ln-code"> class="hljs-ln-line"> :param website: 网站标识
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="145"> class="hljs-ln-code"> class="hljs-ln-line"> :param kw: 关键字
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="146"> class="hljs-ln-code"> class="hljs-ln-line"> :param num: 记录数量
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="147"> class="hljs-ln-code"> class="hljs-ln-line"> :param datas: 搜索结果列表
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="148"> class="hljs-ln-code"> class="hljs-ln-line"> :return:
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="149"> class="hljs-ln-code"> class="hljs-ln-line"> """
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="150"> class="hljs-ln-code"> class="hljs-ln-line"> import os
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="151"> class="hljs-ln-code"> class="hljs-ln-line"> subdirectory_name = 'results'
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="152"> class="hljs-ln-code"> class="hljs-ln-line"> # 检查子目录是否存在,不存在就创建
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="153"> class="hljs-ln-code"> class="hljs-ln-line"> if not os.path.exists(subdirectory_name):
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="154"> class="hljs-ln-code"> class="hljs-ln-line"> os.makedirs(subdirectory_name)
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="155"> class="hljs-ln-code"> class="hljs-ln-line"> # 设置文件名
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="156"> class="hljs-ln-code"> class="hljs-ln-line"> file_name = f"{website}-{kw}-数据{num}条.xlsx"
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="157"> class="hljs-ln-code"> class="hljs-ln-line"> file_path = os.path.join(subdirectory_name, file_name)
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="158"> class="hljs-ln-code"> class="hljs-ln-line"> workbook = xlsxwriter.Workbook(file_path) # 创建excel
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="159"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet = workbook.add_worksheet(f"{website}-{kw}")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="160"> class="hljs-ln-code"> class="hljs-ln-line"> # 创建表头
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="161"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(0, 0, "序号")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="162"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(0, 1, "网站")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="163"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(0, 2, "标题")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="164"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(0, 3, "链接")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="165"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(0, 4, "内容摘要")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="166"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(0, 5, "信息来源")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="167"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(0, 6, "时间")
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="168"> class="hljs-ln-code"> class="hljs-ln-line"> # 写入数据
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="169"> class="hljs-ln-code"> class="hljs-ln-line"> row = 1
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="170"> class="hljs-ln-code"> class="hljs-ln-line"> for data in datas:
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="171"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(row, 0, data[0])
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="172"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(row, 1, data[1])
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="173"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(row, 2, data[2])
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="174"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(row, 3, data[3])
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="175"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(row, 4, data[4])
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="176"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(row, 5, data[5])
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="177"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.write(row, 6, data[6])
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="178"> class="hljs-ln-code"> class="hljs-ln-line"> row += 1
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="179"> class="hljs-ln-code"> class="hljs-ln-line"> # 设置列宽
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="180"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.set_column('C:C', 45) # 标题列
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="181"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.set_column('D:D', 30) # 链接列
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="182"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.set_column('E:E', 60) # 内容摘要列
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="183"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.set_column('F:F', 15) # 来源列
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="184"> class="hljs-ln-code"> class="hljs-ln-line"> worksheet.set_column('G:G', 15) # 时间列
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="185"> class="hljs-ln-code"> class="hljs-ln-line"> # 关闭工作簿
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="186"> class="hljs-ln-code"> class="hljs-ln-line"> workbook.close()
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="187"> class="hljs-ln-code"> class="hljs-ln-line"> print('数据已保存到xlsx文件中')
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="188"> class="hljs-ln-code"> class="hljs-ln-line">
- class="hljs-ln-numbers"> class="hljs-ln-line hljs-ln-n" data-line-number="189"> class="hljs-ln-code"> class="hljs-ln-line">
if __name__ == '__main__':
    # Queue used to pass log messages out of the crawler.
    m_queue = Queue()
    # Keyword to search for.
    m_kw = '美国Tik Tok难民'

    # Create the crawler task object around the log queue, then run the
    # crawl; the result comes back as a list.
    c_sohu = CrawlSohu(m_queue)
    m_results = c_sohu.crawl_sohu(m_kw)

    # Report how many search results were collected.
    count = len(m_results)
    print(f'\n搜索结果数量:{count}')

    # Print the logs held in the queue.
    print_logs(m_queue)

    # Save the crawled results to an xlsx file.
    save_data('搜狐', m_kw, count, m_results)
class="hide-preCode-box">
class="hljs-button signin active" data-title="登录复制" data-report-click="{"spm":"1001.2101.3001.4334"}" onclick="hljs.signin(event)">
data-report-view="{"mod":"1585297308_001","spm":"1001.2101.3001.6548","dest":"https://blog.csdn.net/bahamutj/article/details/145359543","extend1":"pc","ab":"new"}">>
评论记录:
回复评论: