Python Selenium爬虫入门到进阶指南

本质

Python 第三方库 selenium 负责控制浏览器驱动(WebDriver),
浏览器驱动再去控制浏览器,即:selenium → 浏览器驱动 → 浏览器

  • 推荐 edge 浏览器驱动(不容易遇到版本或者兼容性的问题)
  • 驱动下载网址:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ (需下载与本机 Edge 浏览器版本一致的驱动)
  • 1、实战1

    (1)安装 selenium 库

    pip install selenium
    

    (2)将驱动文件exe放在py文件同级目录下

    (3)初步体验驱动器控制浏览器

    # webdriver is the entry point used to start and drive a browser
    from selenium import webdriver
    # By enumerates the locator strategies (ID, CSS selector, ...)
    from selenium.webdriver.common.by import By
    # Keys provides constants for simulated key presses (not needed in this first demo)
    from selenium.webdriver.common.keys import Keys
    # Service tells Selenium where the driver executable lives
    from selenium.webdriver.edge.service import Service
    import time

    # Selenium 4 expects the driver path to be wrapped in a Service object;
    # the first positional argument of webdriver.Edge is `options`, so passing
    # the path string positionally fails on current releases.
    driver = webdriver.Edge(service=Service(executable_path="./msedgedriver.exe"))
    try:
        # get() blocks until the page has loaded before the script continues
        driver.get("http://www.baidu.com/")

        # id="kw" is Baidu's search input; type the query into it
        driver.find_element(By.CSS_SELECTOR, "#kw").send_keys("长城")

        # id="su" is Baidu's search button; click() simulates a mouse click
        driver.find_element(By.CSS_SELECTOR, "#su").click()

        # Pause 5 seconds so the results can be inspected by eye
        time.sleep(5)
    finally:
        # Always shut the browser and the driver process down, even on errors
        driver.quit()
    

    (4)元素定位

    1.获取单个元素——元素不存在会报错

    # Each call below looks up a single element; per the heading above, a
    # lookup that matches nothing raises an error.
    # NOTE(review): assumes `driver` is an already-started WebDriver and that
    # `By` has been imported as in the previous example.

    # Locate by the element's id attribute
    element_by_id = driver.find_element(By.ID, "inputOriginal")
    
    # Locate with a CSS selector ("#" selects by id)
    element_by_css = driver.find_element(By.CSS_SELECTOR, "#inputOriginal")
    
    # Locate by tag name
    element_by_tag = driver.find_element(By.TAG_NAME, "div")
    
    # Locate by the element's name attribute
    element_by_name = driver.find_element(By.NAME, "username")
    
    # Locate a link by its visible text
    element_by_link_text = driver.find_element(By.LINK_TEXT, "下一页")
    

    2.获取多个元素——返回列表(元素不存在返回空)

    # Each call below returns a list of matching elements; per the heading
    # above, the list is empty when nothing matches.
    # NOTE(review): assumes `driver` is an already-started WebDriver and that
    # `By` has been imported as in the previous example.

    # Locate all elements with the given id attribute
    elements_by_id = driver.find_elements(By.ID, "inputOriginal")
    
    # Locate all elements matching a CSS selector
    elements_by_css = driver.find_elements(By.CSS_SELECTOR, "#inputOriginal")
    
    # Locate all elements with the given tag name
    elements_by_tag = driver.find_elements(By.TAG_NAME, "div")
    
    # Locate all elements with the given name attribute
    elements_by_name = driver.find_elements(By.NAME, "username")
    
    # Locate all links with the given visible text
    elements_by_link_text = driver.find_elements(By.LINK_TEXT, "下一页")
    
    # The returned list can be processed like any other list, e.g. iterated
    # to print the text of every match
    for element in elements_by_id:
        print(element.text)
    

    2、实战2:访问有道翻译,获取翻译后的内容

    # webdriver is the entry point used to start and drive a browser
    from selenium import webdriver
    # By enumerates the locator strategies (ID, CSS selector, ...)
    from selenium.webdriver.common.by import By
    # Keys provides constants for simulated key presses
    from selenium.webdriver.common.keys import Keys
    # Service tells Selenium where the driver executable lives
    from selenium.webdriver.edge.service import Service
    import time

    # Selenium 4 takes the driver path through a Service object; the first
    # positional argument of webdriver.Edge is `options`, not the path.
    driver = webdriver.Edge(service=Service(executable_path="./msedgedriver.exe"))
    try:
        # Open the Youdao text-translation page
        driver.get("https://fanyi.youdao.com/#/TextTranslate")

        # Give the page time to render its input widgets
        time.sleep(2)

        # Locate the source-text input box and type the text to translate
        input_box = driver.find_element(By.ID, "js_fanyi_input")
        input_box.send_keys("hello")

        # Give the page time to produce the translation
        time.sleep(2)

        # Read the translated text from the output element
        transTarget = driver.find_element(By.ID, "js_fanyi_output_resultOutput")
        print(transTarget.text)

        # Pause 5 seconds so the result can be inspected by eye
        time.sleep(5)
    finally:
        # Always shut the browser and the driver process down, even on errors
        driver.quit()
    

    3、实战3:爬取当当网站商品信息

    (1)内容获取

    (2)窗口操作

    (3)实战

    # webdriver is the entry point used to start and drive a browser
    from selenium import webdriver
    # Base class of every error Selenium raises (missing element, stale element, ...)
    from selenium.common.exceptions import WebDriverException
    # By enumerates the locator strategies (ID, CSS selector, ...)
    from selenium.webdriver.common.by import By
    # Keys provides constants for simulated key presses
    from selenium.webdriver.common.keys import Keys
    # Service tells Selenium where the driver executable lives
    from selenium.webdriver.edge.service import Service
    import time

    # Selenium 4 takes the driver path through a Service object; the first
    # positional argument of webdriver.Edge is `options`, not the path.
    driver = webdriver.Edge(service=Service(executable_path="./msedgedriver.exe"))
    try:
        # Open the Dangdang home page and give it time to load
        driver.get("https://www.dangdang.com/")
        time.sleep(2)

        # Type the keyword into the search input
        key = driver.find_element(By.ID, "key_S")
        key.send_keys("科幻")

        # Click the search button inside the search form
        search = driver.find_element(By.CSS_SELECTOR, "#form_search_new .button")
        search.click()

        # Give the result page time to load
        time.sleep(3)

        # Print title and price of every product, for at most 5 result pages
        for _ in range(5):
            # Every <li> under the .bigimg list is one product
            shoplist = driver.find_elements(By.CSS_SELECTOR, ".bigimg li")
            for li in shoplist:
                # Title and price are read independently so that a product
                # missing one of them still reports the other.
                try:
                    title = li.find_element(By.CSS_SELECTOR, "a").get_attribute("title")
                    print(title)
                except WebDriverException as e:
                    print(f"获取商品标题时出错: {e}")

                try:
                    price = li.find_element(By.CSS_SELECTOR, ".search_now_price").text
                    print(price)
                except WebDriverException as e:
                    print(f"获取商品价格时出错: {e}")

            # Move on to the next page; stop when there is no usable link
            try:
                next_page = driver.find_element(By.LINK_TEXT, "下一页")
                next_page.click()
                # Give the next page time to load
                time.sleep(3)
            except WebDriverException as e:
                print(f"点击下一页时出错: {e}")
                break
    finally:
        # quit() (unlike close(), which only closes the current window) also
        # ends the driver process, so nothing is left running afterwards.
        driver.quit()
    




    (4)css选择器基本规则

    (5)等待——显式/隐式

    1.隐式等待:全局生效——此后每次查找元素时,若元素暂未出现,最多等待设定的时长(自定义,下例为 30 秒),超时仍未找到才报错

    from selenium import webdriver
    from selenium.webdriver.edge.service import Service

    # Selenium 4 takes the driver path through a Service object; the first
    # positional argument of webdriver.Edge is `options`, not the path.
    driver = webdriver.Edge(service=Service(executable_path="./msedgedriver.exe"))
    # From now on every element lookup on this driver waits up to 30 seconds
    # for the element to appear before failing.
    driver.implicitly_wait(30)
    

    2.显式等待:只针对特定条件进行等待,写法为 WebDriverWait(...).until(判断条件)

    # Names needed by the explicit-wait examples below
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    # Poll every 0.5 s, for at most 15 s, until the page title contains
    # "百度一下"; until() raises TimeoutException if that never happens.
    try:
        WebDriverWait(driver, 15, 0.5).until(EC.title_contains("百度一下"))
        print("页面标题包含 '百度一下'")
    except TimeoutException as e:
        print(f"等待页面标题时出现异常: {e}")

    # Placeholder CSS selector for the element to wait for; adapt it to the
    # page actually being automated.
    element_selector = ".example-element"

    # Poll every 0.5 s, for at most 15 s, until the element matched by the
    # selector is visible; until() raises TimeoutException otherwise.
    try:
        WebDriverWait(driver, 15, 0.5).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, element_selector))
        )
        print(f"元素 {element_selector} 已可见")
    except TimeoutException as e:
        print(f"等待元素可见时出现异常: {e}")

    # Shut the browser and the driver process down
    driver.quit()
    

    4、实战4:鼠标及键盘操作(动作链)


    # ActionChains queues low-level mouse/keyboard actions and runs them on perform()
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By

    driver.get("https://www.baidu.com/")
    # The "更多" (more) entry and the link named tj_fanyi
    more = driver.find_element(By.LINK_TEXT, "更多")
    link_element = driver.find_element(By.CSS_SELECTOR, 'a[name="tj_fanyi"]')
    # One chained gesture: move the mouse onto "更多", then onto the
    # tj_fanyi link, and click at that position.
    ActionChains(driver).move_to_element(more).move_to_element(link_element).click().perform()
    

    其他:滚动条,窗口截图


    5、实战5:爬取知乎数据(应对反爬、滑动验证)

    (1)方法一——opencv轮廓检测,由面积和周长确定起始和终止位置

    # webdriver is the entry point used to start and drive a browser
    from selenium import webdriver
    # By enumerates the locator strategies (ID, CSS selector, ...)
    from selenium.webdriver.common.by import By
    # ActionChains queues mouse/keyboard actions and runs them on perform()
    from selenium.webdriver import ActionChains
    # Service tells Selenium where the driver executable lives
    from selenium.webdriver.edge.service import Service
    # Explicit-wait helper
    from selenium.webdriver.support.wait import WebDriverWait
    # Ready-made conditions for explicit waits
    from selenium.webdriver.support import expected_conditions as EC
    # Used to download the captcha image
    from urllib import request
    # OpenCV, used for image analysis
    import cv2
    # Used to read the account credentials from the environment
    import os
    # Used to randomise the drag steps (anti-bot countermeasure)
    import random
    # Used for pauses
    import time
    # ------------------------------------------
    # Credentials are read from the environment instead of being written into
    # the script, so they are never published together with the code.  A
    # missing variable raises KeyError before the browser is even started.
    zhihu_username = os.environ["ZHIHU_USERNAME"]
    zhihu_password = os.environ["ZHIHU_PASSWORD"]

    # Shared prefixes of the long CSS selectors used below
    LOGIN_FORM = "#root > div > main > div > div > div > div > div.signQr-rightContainer > div > div.SignContainer-content > div > div:nth-child(1) > form"
    CAPTCHA_BG = "body > div.yidun_popup--light.yidun_popup.yidun_popup--size-small > div.yidun_modal__wrap > div > div > div.yidun_modal__body > div > div.yidun_panel > div > div.yidun_bgimg"

    # 1. Create the driver.  Selenium 4 takes the driver path through a
    # Service object; the first positional argument of Edge is `options`.
    driver = webdriver.Edge(service=Service(executable_path="./msedgedriver.exe"))
    driver.get("https://www.zhihu.com/")
    driver.maximize_window()
    # 2. Switch to the second login tab, then fill in account and password
    password_tab = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > div.SignFlow-tabs > div:nth-child(2)")
    ActionChains(driver).move_to_element(password_tab).click().perform()
    account_input = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > div.SignFlow-account > div > label > input")
    account_input.send_keys(zhihu_username)
    password_input = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > div.SignFlow-password > div > label > input")
    password_input.send_keys(zhihu_password)
    login = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > button")
    ActionChains(driver).move_to_element(login).click().perform()
    # 3. Explicitly wait (up to 10 s) for the slider-captcha image area to become visible
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, CAPTCHA_BG)))
    pic = driver.find_element(By.CSS_SELECTOR, f"{CAPTCHA_BG} > img.yidun_bg-img")
    imgsrc = pic.get_attribute("src")
    # 4. Download the captcha background image and save it as img.png
    request.urlretrieve(imgsrc, 'img.png')
    # 5、定义函数,获取轮廓位置
    def get_pos(imageSrc, area_range=(5025, 7225), perimeter_range=(300, 380)):
        """Return the x coordinate of the captcha gap in an image file.

        The image is blurred, reduced to Canny edges, and its contours are
        scanned for the first one whose area and perimeter both fall strictly
        inside the given ranges.  That contour's bounding box is drawn on the
        image, the annotated image is saved as "111.jpg", and the left edge
        of the box is returned.

        :param imageSrc: path of the captcha background image
        :param area_range: (low, high) exclusive bounds for the contour area
        :param perimeter_range: (low, high) exclusive bounds for the contour perimeter
        :return: x of the matching contour's bounding box, or 0 when the
                 image cannot be read or no contour matches
        """
        image = cv2.imread(imageSrc)  # returns None when the file cannot be read
        if image is None:
            print(f"Cannot read image: {imageSrc}")
            return 0

        blurred = cv2.GaussianBlur(image, (5, 5), 0, 0)
        canny = cv2.Canny(blurred, 0, 100)
        contours, hierarchy = cv2.findContours(canny, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        print(len(contours))

        min_area, max_area = area_range
        min_perimeter, max_perimeter = perimeter_range
        for contour in contours:
            area = cv2.contourArea(contour)
            zhouchang = cv2.arcLength(contour, True)  # perimeter of the closed contour
            if min_area < area < max_area and min_perimeter < zhouchang < max_perimeter:
                x, y, w, h = cv2.boundingRect(contour)
                cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 2)
                cv2.imwrite("111.jpg", image)
                return x
        # Only give up after every contour has been examined (returning
        # inside the loop would stop after the very first contour).
        return 0
    
    # x position of the gap inside the downloaded captcha image
    dis = get_pos('img.png')
    smallImage = driver.find_element(By.CSS_SELECTOR,'body > div.yidun_popup--light.yidun_popup.yidun_popup--size-small > div.yidun_modal__wrap > div > div > div.yidun_modal__body > div > div.yidun_panel > div > div.yidun_bgimg > img.yidun_jigsaw')

    # Convert the image coordinate into a drag distance.
    # NOTE(review): 340 / 672 is presumably (displayed width / downloaded
    # image width) — confirm against the actual captcha before relying on it.
    dis = int(dis * 340 / 672 - smallImage.location['x'])
    # Press and hold the jigsaw piece.  No implicit wait is configured here:
    # implicitly_wait() takes seconds and applies to every later lookup, so a
    # large value would stall the script whenever an element is missing.
    ActionChains(driver).click_and_hold(smallImage).perform()
    i = 0
    moved = 0
    while moved < dis:
        # Random 3-10 px steps look less mechanical than one straight move;
        # the last step is clamped so the piece stops exactly at `dis`.
        x = min(random.randint(3, 10), dis - moved)
        moved += x
        ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()
        print("第{}次移动后, 位置为{}".format(i, smallImage.location['x']))
        i += 1
    ActionChains(driver).release().perform()
    # Pause 20 seconds so the outcome can be inspected by eye
    time.sleep(20)


    # quit() closes every window and also ends the driver process
    driver.quit()
    

    说明:canny = cv2.Canny(blurred, 低阈值, 高阈值) —— 梯度高于"高阈值"的像素判定为边缘,低于"低阈值"的直接丢弃,介于两者之间的只有与强边缘相连时才保留

    (2)方法二——opencv 灰度化 + 边缘检测 + 模板匹配,确定滑块需要滑动到的位置

    def calculate_slide_distance(full_image, slider_image):
        """Estimate how far the slider piece must travel.

        Both images are converted from BGR to grayscale and reduced to Canny
        edge maps (thresholds 50/150); the slider's edge map is then
        template-matched against the background's edge map with
        TM_CCOEFF_NORMED, and the x coordinate of the best match is returned.

        :param full_image: full background image (or None)
        :param slider_image: slider piece image (or None)
        :return: x coordinate of the best match; 0 when either image is None
        """
        if any(picture is None for picture in (full_image, slider_image)):
            print("图片数据为空,无法计算滑动距离")
            return 0

        def edge_map(bgr_image):
            # Grayscale first, then edge detection
            gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
            return cv2.Canny(gray, 50, 150)

        # Slide the slider's edges over the background's edges and score every position
        scores = cv2.matchTemplate(edge_map(full_image), edge_map(slider_image), cv2.TM_CCOEFF_NORMED)
        _, best_score, _, best_location = cv2.minMaxLoc(scores)
        print(f"匹配的相似度值: {best_score}")  # similarity of the best match
        offset_x = best_location[0]
        print(f"计算得到的滑块滑动距离: {offset_x}")  # resulting distance
        return offset_x
    
    
    # Load the background and slider images, then compute the slide distance
    full_image, slider_image = (cv2.imread(name) for name in ('img.png', 'img2.png'))
    distance = calculate_slide_distance(full_image, slider_image)
    

    (3)最终结果

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver import ActionChains
    from selenium.webdriver.edge.service import Service
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from urllib import request
    import cv2
    import random
    import time
    import csv
    import os


    # Credentials are read from the environment instead of being written into
    # the script, so they are never published together with the code.  A
    # missing variable raises KeyError before the browser is even started.
    zhihu_username = os.environ["ZHIHU_USERNAME"]
    zhihu_password = os.environ["ZHIHU_PASSWORD"]

    # Shared prefixes of the long CSS selectors used below
    LOGIN_FORM = "#root > div > main > div > div > div > div > div.signQr-rightContainer > div > div.SignContainer-content > div > div:nth-child(1) > form"
    CAPTCHA_BG = "body > div.yidun_popup--light.yidun_popup.yidun_popup--size-small > div.yidun_modal__wrap > div > div > div.yidun_modal__body > div > div.yidun_panel > div > div.yidun_bgimg"

    # 1. Create the driver.  Selenium 4 takes the driver path through a
    # Service object; the first positional argument of Edge is `options`.
    driver = webdriver.Edge(service=Service(executable_path="./msedgedriver.exe"))
    driver.get("https://www.zhihu.com/")
    driver.maximize_window()

    # 2. Switch to the second login tab, then fill in account and password
    password_tab = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > div.SignFlow-tabs > div:nth-child(2)")
    ActionChains(driver).move_to_element(password_tab).click().perform()
    account_input = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > div.SignFlow-account > div > label > input")
    account_input.send_keys(zhihu_username)
    password_input = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > div.SignFlow-password > div > label > input")
    password_input.send_keys(zhihu_password)
    login = driver.find_element(By.CSS_SELECTOR, f"{LOGIN_FORM} > button")
    ActionChains(driver).move_to_element(login).click().perform()

    # 3. Explicitly wait (up to 10 s) for the slider-captcha image area to
    # become visible, then download both captcha images.
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, CAPTCHA_BG)))
    pic = driver.find_element(By.CSS_SELECTOR, f"{CAPTCHA_BG} > img.yidun_bg-img")
    imgsrc = pic.get_attribute("src")  # URL of the background image
    request.urlretrieve(imgsrc, 'img1.png')  # saved as img1.png
    pic2 = driver.find_element(By.CSS_SELECTOR, f"{CAPTCHA_BG} > img.yidun_jigsaw")
    imgsrc2 = pic2.get_attribute("src")  # URL of the jigsaw (slider) piece
    request.urlretrieve(imgsrc2, 'img2.png')  # saved as img2.png
    # ----------------------------------------------------------
    # 4.1、法一:灰度检测
    def calculate_slide_distance(full_image, slider_image):
        """Estimate how far the slider piece must travel.

        Both images are converted from BGR to grayscale and reduced to Canny
        edge maps (thresholds 50/150); the slider's edge map is then
        template-matched against the background's edge map with
        TM_CCOEFF_NORMED, and the x coordinate of the best match is returned.

        :param full_image: full background image (or None)
        :param slider_image: slider piece image (or None)
        :return: x coordinate of the best match; 0 when either image is None
        """
        if any(picture is None for picture in (full_image, slider_image)):
            print("图片数据为空,无法计算滑动距离")
            return 0

        def edge_map(bgr_image):
            # Grayscale first, then edge detection
            gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
            return cv2.Canny(gray, 50, 150)

        # Slide the slider's edges over the background's edges and score every position
        scores = cv2.matchTemplate(edge_map(full_image), edge_map(slider_image), cv2.TM_CCOEFF_NORMED)
        _, best_score, _, best_location = cv2.minMaxLoc(scores)
        print(f"匹配的相似度值: {best_score}")  # similarity of the best match
        offset_x = best_location[0]
        print(f"计算得到的滑块滑动距离: {offset_x}")  # resulting distance
        return offset_x
    # 4.1.1 Load both downloaded images and compute the slide distance
    full_image, slider_image = (cv2.imread(name) for name in ('img1.png', 'img2.png'))
    distance = calculate_slide_distance(full_image, slider_image)
    # --------------------------------------
    # 4.2、法2:轮廓边界
    def get_pos(imageSrc, area_range=(5025, 7225), perimeter_range=(300, 380)):
        """Return the x coordinate of the captcha gap in an image file.

        The image is blurred, reduced to Canny edges, and its contours are
        scanned for the first one whose area and perimeter both fall strictly
        inside the given ranges.  That contour's bounding box is drawn on the
        image, the annotated image is saved as "111.jpg", and the left edge
        of the box is returned.

        :param imageSrc: path of the captcha background image
        :param area_range: (low, high) exclusive bounds for the contour area
        :param perimeter_range: (low, high) exclusive bounds for the contour perimeter
        :return: x of the matching contour's bounding box, or 0 when the
                 image cannot be read or no contour matches
        """
        image = cv2.imread(imageSrc)  # returns None when the file cannot be read
        if image is None:
            print(f"Cannot read image: {imageSrc}")
            return 0

        blurred = cv2.GaussianBlur(image, (5, 5), 0, 0)
        canny = cv2.Canny(blurred, 0, 100)
        contours, hierarchy = cv2.findContours(canny, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        print(len(contours))

        min_area, max_area = area_range
        min_perimeter, max_perimeter = perimeter_range
        for contour in contours:
            area = cv2.contourArea(contour)
            zhouchang = cv2.arcLength(contour, True)  # perimeter of the closed contour
            if min_area < area < max_area and min_perimeter < zhouchang < max_perimeter:
                x, y, w, h = cv2.boundingRect(contour)
                cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 2)
                cv2.imwrite("111.jpg", image)
                return x
        # Only give up after every contour has been examined (returning
        # inside the loop would stop after the very first contour).
        return 0
    # 4.2.1 Method 2: find the gap in the background image.  This script
    # saves the background as img1.png, so that is the file to analyse.
    dis = get_pos('img1.png')
    # Convert the image coordinate into a drag distance.
    # NOTE(review): 340 / 672 is presumably (displayed width / downloaded
    # image width) — confirm against the actual captcha before relying on it.
    dis = int(dis * 340 / 672 - pic2.location['x'])
    # Press and hold the jigsaw piece.  No implicit wait is configured here:
    # implicitly_wait() takes seconds and applies to every later lookup, so a
    # large value would stall each later lookup of a missing element and
    # defeat the timeouts of the explicit waits that follow.
    ActionChains(driver).click_and_hold(pic2).perform()
    # 4.2.2 Anti-bot measure: drag in small random steps
    i = 0
    moved = 0
    while moved < dis:
        # Random 3-10 px steps look less mechanical than one straight move;
        # the last step is clamped so the piece stops exactly at `dis`.
        x = min(random.randint(3, 10), dis - moved)
        moved += x
        ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()
        print("第{}次移动后, 位置为{}".format(i, pic2.location['x']))
        i += 1
    ActionChains(driver).release().perform()
        
    # 5. Wait (up to 20 s) for the page header to be present after login.
    # ".AppHeader-inner css-11p8nt5" would be a descendant selector looking
    # for a <css-11p8nt5> tag and could never match; the element is selected
    # by its AppHeader-inner class only.
    # NOTE(review): css-11p8nt5 looks like a generated class name that may
    # change between site builds, so it is deliberately not part of the selector.
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".AppHeader-inner")))
    
    # 6. Path of the CSV file that receives the scraped rows
    csv_file_path = "zhihu_data3.csv"

    # 7. Row writer
    def write_to_csv(data):
        """Append one row (a sequence of field values) to the CSV file."""
        with open(csv_file_path, "a", newline="", encoding="utf-8") as handle:
            csv.writer(handle).writerow(data)

    # 8. Create the file with its header row the first time the script runs
    if not os.path.exists(csv_file_path):
        with open(csv_file_path, "w", newline="", encoding="utf-8") as handle:
            csv.writer(handle).writerow(
                ["author_name", "title", "item_id", "has_image", "upvote_num"]
            )
    # 9、滚动加载更多内容
    def scroll_to_load_more(max_scrolls=10):
        """Scroll to the bottom of the page repeatedly to trigger lazy loading.

        After each scroll the function sleeps 2 seconds and re-reads the
        document height; it stops when the height no longer changes or after
        ``max_scrolls`` height-changing scrolls.

        :param max_scrolls: upper bound on the number of scrolls
        """
        height_query = "return document.body.scrollHeight"
        previous_height = driver.execute_script(height_query)
        completed = 0
        while completed < max_scrolls:
            # Jump to the bottom and give new content time to load
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            current_height = driver.execute_script(height_query)
            if current_height == previous_height:
                # Height unchanged: nothing new was loaded, so stop
                return
            previous_height, completed = current_height, completed + 1

    scroll_to_load_more(max_scrolls=30)
    # 10. Extract data from every loaded article/answer item
    import json
    from selenium.common.exceptions import WebDriverException


    def _parse_zop(raw):
        """Decode the JSON text of a data-zop attribute; {} when absent or malformed."""
        if not raw:
            return {}
        try:
            # json.loads instead of eval: the text comes from the web page,
            # and eval would execute whatever expression it contains.
            parsed = json.loads(raw)
        except ValueError:
            return {}
        return parsed if isinstance(parsed, dict) else {}


    def _parse_upvotes(text):
        """Convert vote-button text such as "赞同 123" or "赞同 1.2 万" to an int (0 if unparseable)."""
        cleaned = text.replace("赞同", "").replace(",", "").strip()
        try:
            if cleaned.endswith("万"):
                # "万" means ten thousand
                return int(round(float(cleaned[:-1]) * 10000))
            return int(cleaned)
        except ValueError:
            return 0


    articles = driver.find_elements(By.CSS_SELECTOR, ".ContentItem.ArticleItem, .ContentItem.AnswerItem")
    for article in articles:
        try:
            # Author name and item id both live in the data-zop attribute
            zop = _parse_zop(article.get_attribute("data-zop"))
            author_name = zop.get("authorName", "未知作者")
            item_id = zop.get("itemId", "未知ID")

            # Title text
            title = article.find_element(By.CSS_SELECTOR, "h2.ContentItem-title a").text

            # find_elements returns an empty list instead of raising, so
            # "has at least one image" is simply the list's truthiness.
            has_image = bool(article.find_elements(By.CSS_SELECTOR, ".RichContent-inner img"))

            # Upvote count; 0 when there is no vote button
            upvote_buttons = article.find_elements(By.CSS_SELECTOR, ".VoteButton--up")
            upvote_num = _parse_upvotes(upvote_buttons[0].text) if upvote_buttons else 0
        except WebDriverException as e:
            # Skip items that cannot be read (e.g. no title link) and keep going
            print(f"提取数据时出错: {e}")
            continue

        # Print the extracted fields
        print(f"作者: {author_name}")
        print(f"标题: {title}")
        print(f"文章ID: {item_id}")
        print(f"是否有图片: {has_image}")
        print(f"点赞数: {upvote_num}")
        print("-" * 50)

        # Append the row to the CSV file
        data = [author_name, title, item_id, has_image, upvote_num]
        write_to_csv(data)
    # Short pause before shutting down (time.sleep takes seconds)
    time.sleep(2)
    # Shut the browser and the driver process down
    driver.quit()
    
    

    作者:༺ Dorothy ༻

    物联沃分享整理
    物联沃-IOTWORD物联网 » Python Selenium爬虫入门到进阶指南

    发表回复