Approach
1. Open the site and press F12 (or use the browser's tools menu) to open the developer console.
   Filter by Fetch/XHR to see what asynchronous requests the page makes.
   Use the element-picker tool in the top-left corner to inspect the page layout, especially the <div>, <img> and <a> elements in the HTML.
2. Once you understand the logic, just let GPT write the code for you and then tweak it yourself; essentially only two or three libraries are involved:
# HTTP client
import requests
# HTML parser
from bs4 import BeautifulSoup
import json
# use this for multi-threading
import concurrent.futures
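If requests and BeautifulSoup are not installed yet, pip install requests beautifulsoup4 takes care of them; json and concurrent.futures ship with Python.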
3. Modify the code
- Fetch the page with response = requests.get(url) (or requests.post(url)).
- Turn it into searchable HTML with soup = BeautifulSoup(response.text, 'html.parser').
- Use soup.find('div', id='xxx', class_='xxx') and similar calls to quickly narrow the search down to the part of the page you care about.
- What find returns is itself a soup object, so you can keep chaining find / find_all with different conditions until you reach the content you want.
Example: find every <a href="http://xxx.com" alt="a_name"> tag under <div id="shop_list">:
a_list = soup.find('div', id='shop_list').find_all('a', href=True)
for each_a in a_list:
    src = each_a['href']    # URL of the <a> tag
    name = each_a['alt']    # name stored in the <a> tag's alt attribute
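Putting those pieces together, a minimal end-to-end sketch (the URL and the shop_list id here are made up for illustration, and a timeout plus a .get() lookup are added defensively):

import requests
from bs4 import BeautifulSoup

url = "https://example.com/shops"           # hypothetical listing page
response = requests.get(url, timeout=10)    # timeout so a dead server doesn't hang the script
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', id='shop_list')    # narrow down to the block you inspected in DevTools
    if container:
        for each_a in container.find_all('a', href=True):
            print(each_a['href'], each_a.get('alt', ''))    # .get avoids a KeyError when alt is missing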
Below is a complete product scraper.
Rough flow: extract_brand_links collects every brand link from the homepage, fetch_brand_links_from_pages walks each brand's pagination and turns every listing into a Product, and main() glues the two together and dumps the result to products.json.
# Full code (GPT generated most of it)
import requests
from bs4 import BeautifulSoup
import json
import concurrent.futures
# Data structure
class Product:
    def __init__(self, id, name, brand, img_url, base_url, brand_url, model):
        self.id = id
        self.name = name
        self.brand = brand
        self.img_url = img_url
        self.base_url = base_url
        self.brand_url = brand_url
        self.model = model

    def to_dict(self):
        """Converts the object into a dictionary."""
        return {
            'id': self.id,
            'name': self.name,
            'brand': self.brand,
            'img_url': self.img_url,
            'base_url': self.base_url,
            'brand_url': self.brand_url,
            'model': self.model
        }

    @classmethod
    def from_dict(cls, data):
        """Create an instance of Product from a dictionary."""
        return cls(
            data['id'],
            data['name'],
            data['brand'],
            data['img_url'],
            data['base_url'],
            data['brand_url'],
            data['model']
        )

    def __str__(self):
        return (f"Product(id={self.id}, name='{self.name}', brand='{self.brand}', "
                f"img_url='{self.img_url}', base_url='{self.base_url}', "
                f"brand_url='{self.brand_url}', model={self.model})")
# Returns a list of [url, name] pairs, one per brand
def extract_brand_links(url):
    base_all_shop = []
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Locate the ul tag that holds the brand list
            brand_list = soup.find('ul', id='brand_li')
            if brand_list:
                # Grab every a tag under that ul
                brand_links = brand_list.find_all('a')
                for link in brand_links:
                    href = link.get('href')
                    if href:  # make sure the link exists
                        brand_url = url + href
                        brand_name = link.find('img')['alt']
                        each_brand = [brand_url, brand_name]
                        base_all_shop.append(each_brand)
        # Save the extracted links
        with open("brand_links.txt", "w", encoding="utf-8") as file:
            for link_brand in base_all_shop:
                file.write(link_brand[0] + " " + link_brand[1] + "\n")  # one "url name" pair per line
    except Exception as e:
        print(f"An error occurred: {e}")
    return base_all_shop
# Walk a brand's pagination and aggregate its products; base_brand[0] is the brand's URL, base_brand[1] its name
def fetch_brand_links_from_pages(base_brand, base_url):
    all_product = []
    # Hit the brand's first page to work out the total number of pages
    home_response = requests.get(base_brand[0])
    home_soup = BeautifulSoup(home_response.text, 'html.parser')
    pagination = home_soup.find('ul', class_='pages')
    if pagination:
        pages = pagination.find_all('li')
        # On this site the highest page number equals (number of li items - 2)
        max_pages = len(pages) - 2
        for page_each in range(max_pages):
            # Build the URL of the current page
            page_url = f"{base_brand[0]}?page={(page_each + 1)}"
            page_response = requests.get(page_url)
            if page_response.status_code == 200:
                page_soup = BeautifulSoup(page_response.text, 'html.parser')
                product_list = page_soup.find('ul', id='lay_goods')
                if product_list:
                    product_links = product_list.find_all('a', href=True)
                    for link in product_links:
                        product_id = link['href'].split('id-')[-1].split('.html')[0]  # product id taken from the href
                        product_link = base_url + link['href']  # full product link
                        product_name = link.find('div').find('img')['alt']  # product name
                        product_img_url = link.find('div').find('img')['lay-src']  # image link (lazy-load attribute)
                        product = Product(product_id, product_name, base_brand[1], product_img_url, product_link, base_brand[0], False)
                        print(product)
                        all_product.append(product)
    return all_product
def main():
    # Base URL (no trailing "/")
    base_url = "https://www.pytestweb.com"
    # Collect every brand link first
    brand_links = extract_brand_links(base_url)
    total_products = []
    for brand_link in brand_links:
        each_products = fetch_brand_links_from_pages(brand_link, base_url)
        total_products.extend(each_products)
    products_dicts = [product.to_dict() for product in total_products]
    with open("products.json", "w", encoding="utf-8") as file:
        json.dump(products_dicts, file, ensure_ascii=False, indent=4)
    print("Product links have been saved to products.json")

if __name__ == "__main__":
    main()