Approach
1. Open the site and press F12 (or use the browser's tools menu) to open the developer console.
   Filter by Fetch/XHR to see what asynchronous requests the page makes.
   Use the element-picker tool in the top-left corner to inspect the page layout, especially the <div>, <img> and <a> elements in the HTML.
2. Once you understand the logic, just let GPT write the code for you and then tweak it yourself; essentially only two or three libraries are involved:
# HTTP client
import requests
# HTML parser
from bs4 import BeautifulSoup
import json
# use this for multi-threading
import concurrent.futures
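If requests and BeautifulSoup are not installed yet, pip install requests beautifulsoup4 takes care of them; json and concurrent.futures ship with Python.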
3. Modify the code
- Fetch the page with response = requests.get(url) (or requests.post(url)).
- Turn it into searchable HTML with soup = BeautifulSoup(response.text, 'html.parser').
- Use soup.find('div', id='xxx', class_='xxx') and similar calls to quickly narrow the search down to the part of the page you care about.
- What find returns is itself a soup object, so you can keep chaining find / find_all with different conditions until you reach the content you want.
Example: find every <a href="http://xxx.com" alt="a_name"> tag under <div id="shop_list">:
a_list = soup.find('div', id='shop_list').find_all('a', href=True)
for each_a in a_list:
    src = each_a['href']    # URL of the <a> tag
    name = each_a['alt']    # name stored in the <a> tag's alt attribute
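Putting those pieces together, a minimal end-to-end sketch (the URL and the shop_list id here are made up for illustration, and a timeout plus a .get() lookup are added defensively):

import requests
from bs4 import BeautifulSoup

url = "https://example.com/shops"           # hypothetical listing page
response = requests.get(url, timeout=10)    # timeout so a dead server doesn't hang the script
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', id='shop_list')    # narrow down to the block you inspected in DevTools
    if container:
        for each_a in container.find_all('a', href=True):
            print(each_a['href'], each_a.get('alt', ''))    # .get avoids a KeyError when alt is missing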
Below is a complete product scraper.
Rough flow: extract_brand_links collects every brand link from the homepage, fetch_brand_links_from_pages walks each brand's pagination and turns every listing into a Product, and main() glues the two together and dumps the result to products.json.
# Full code (GPT generated most of it)
import requests
from bs4 import BeautifulSoup
import json
import concurrent.futures
# Data structure
class Product:
    def __init__(self, id, name, brand, img_url, base_url, brand_url, model):
        self.id = id
        self.name = name
        self.brand = brand
        self.img_url = img_url
        self.base_url = base_url
        self.brand_url = brand_url
        self.model = model

    def to_dict(self):
        """Converts the object into a dictionary."""
        return {
            'id': self.id,
            'name': self.name,
            'brand': self.brand,
            'img_url': self.img_url,
            'base_url': self.base_url,
            'brand_url': self.brand_url,
            'model': self.model
        }

    @classmethod
    def from_dict(cls, data):
        """Create an instance of Product from a dictionary."""
        return cls(
            data['id'],
            data['name'],
            data['brand'],
            data['img_url'],
            data['base_url'],
            data['brand_url'],
            data['model']
        )

    def __str__(self):
        return (f"Product(id={self.id}, name='{self.name}', brand='{self.brand}', "
                f"img_url='{self.img_url}', base_url='{self.base_url}', "
                f"brand_url='{self.brand_url}', model={self.model})")
# Returns a list of [url, name] pairs, one per brand
def extract_brand_links(url):
    base_all_shop = []
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Locate the ul tag that holds the brand list
            brand_list = soup.find('ul', id='brand_li')
            if brand_list:
                # Grab every a tag under that ul
                brand_links = brand_list.find_all('a')
                for link in brand_links:
                    href = link.get('href')
                    if href:  # make sure the link exists
                        brand_url = url + href
                        brand_name = link.find('img')['alt']
                        each_brand = [brand_url, brand_name]
                        base_all_shop.append(each_brand)
        # Save the extracted links
        with open("brand_links.txt", "w", encoding="utf-8") as file:
            for link_brand in base_all_shop:
                file.write(link_brand[0] + " " + link_brand[1] + "\n")  # one "url name" pair per line
    except Exception as e:
        print(f"An error occurred: {e}")
    return base_all_shop
# Walk a brand's pagination and aggregate its products; base_brand[0] is the brand's URL, base_brand[1] its name
def fetch_brand_links_from_pages(base_brand, base_url):
    all_product = []
    # Hit the brand's first page to work out the total number of pages
    home_response = requests.get(base_brand[0])
    home_soup = BeautifulSoup(home_response.text, 'html.parser')
    pagination = home_soup.find('ul', class_='pages')
    if pagination:
        pages = pagination.find_all('li')
        # On this site the highest page number equals (number of li items - 2)
        max_pages = len(pages) - 2
        for page_each in range(max_pages):
            # Build the URL of the current page
            page_url = f"{base_brand[0]}?page={(page_each + 1)}"
            page_response = requests.get(page_url)
            if page_response.status_code == 200:
                page_soup = BeautifulSoup(page_response.text, 'html.parser')
                product_list = page_soup.find('ul', id='lay_goods')
                if product_list:
                    product_links = product_list.find_all('a', href=True)
                    for link in product_links:
                        product_id = link['href'].split('id-')[-1].split('.html')[0]  # product id taken from the href
                        product_link = base_url + link['href']  # full product link
                        product_name = link.find('div').find('img')['alt']  # product name
                        product_img_url = link.find('div').find('img')['lay-src']  # image link (lazy-load attribute)
                        product = Product(product_id, product_name, base_brand[1], product_img_url, product_link, base_brand[0], False)
                        print(product)
                        all_product.append(product)
    return all_product
def main():
    # Base URL (no trailing "/")
    base_url = "https://www.pytestweb.com"
    # Collect every brand link first
    brand_links = extract_brand_links(base_url)
    total_products = []
    for brand_link in brand_links:
        each_products = fetch_brand_links_from_pages(brand_link, base_url)
        total_products.extend(each_products)
    products_dicts = [product.to_dict() for product in total_products]
    with open("products.json", "w", encoding="utf-8") as file:
        json.dump(products_dicts, file, ensure_ascii=False, indent=4)
    print("Product links have been saved to products.json")

if __name__ == "__main__":
    main()