爬虫实战——客路网商品内容爬取

本次目标是爬取客路网(Klook)的商品详情数据,并将其存储到 MongoDB 中。

源代码

import requests
import pymongo
import re
import json
import pandas as pd
import time
import random


def getheaders():
    """Return one User-Agent string picked at random from a fixed pool.

    Despite the name, the return value is a single User-Agent string,
    not a headers dictionary.
    """
    agents = (
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    )
    return random.choice(agents)


# Request headers sent with every page fetch in getHtml().
#
# NOTE(review): the cookie value looks like it was copied from a browser
# session (it carries JSESSIONID / CSRF-Token / _px3 entries), so it will
# presumably expire and need refreshing -- confirm before relying on it.
#
# The cookie used to be a single string literal that was broken across two
# physical lines, which is a SyntaxError. It is now written as two adjacent
# literals split at exactly that point; Python concatenates them, so the
# resulting value is the original text with the stray line break removed.
#
# "user-agent" is chosen once, when this dict is built, so the same
# User-Agent is reused for every request made during one run.
headers = {
    "cookie": (
        "abtest_revamp=1563875646033; device_id_new=ryEGX8eZpJ00300000000000005B8Gc9qXKS00314982965WpYWiKzBGKGAWkn1DGD5S16Goh5Mk004Kht7irbDUr00000YVxEr0000041IK5K68xk78dzoTmemq:40::f9905f43e6590003; _gcl_au=1.1.31383824.1563875650; tag_fok=1563875648000; _ga=GA1.2.1742545335.1563875651; _pxvid=e628e4c7-ad2f-11e9-bcd0-0242ac12000e; klk_lang=zh-CN; __stripe_mid=beb6214f-fedc-4ec7-bd1e-8a336755b064; _gcl_aw=GCL.1563934680.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; _gac_UA-54803406-1=1.1563934680.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; _gac_UA-86696233-1=1.1563934685.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; klk_currency=CNY; _gid=GA1.2.389333073.1565751780; px-abgroup=A; px-abper=100; webp_support=1; retina_support=0; JSESSIONID=280B2DDE101FA68E5B8D0A2BA2695BC0; CSRF-Token=MTU2NTkzNTA5NXxOREFWd2tQeldtTERCWXZaTW9ucjdJTXJXR05Xc1drQ3w47gIhUzoBKj9lGBxwUkIl6sSj0_z_cw8tPAMy6kA9bw==; CSRF-Token-Valid=valid; mp_c2ca8b423fd75a10792debf44cd6b51a_mixpanel=%7B%22distinct_id%22%3A%20%2216c1e409f9f2cc-08e5cb5145a7c4-37607c04-13c680-16c1e409fa031c%22%2C%22%24device_id%22%3A%20%2216c1e409f9f2cc-08e5cb5145a7c4-37607c04-13c680-16c1e409fa031c%22%2C%22%24search_engine%22%3A%20%22google%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fwww.google.com%2F%22%2C%22%24initial_referring_domain%22%3A%20%22www.google.com%22%2C%22Language%22%3A%20%22zh-CN%22%2C%22Platform%22%3A%20%22Web%22%2C%22Backend%20User%20Country%22%3A%20%22CN%22%2C%22Test-WS2199%22%3A%20%22variant%22%2C%22Page%20Type%22%3A%20%22Destination%20Page%22%2C%22__timers%22%3A%20%7B%7D%2C%22Login%20Status%22%3A%20false%2C%22Test-3%22%3A%20%22variant-10%22%2C%22Test-14%22%3A%20%22variant-55%22%2C%22Test-WS2196%22%3A%20%22control%22%2C%22Test-WS2350%22%3A%20%22variant%22%2C%22'Test-BB1%22%3A%20%22control%22%2C%22Test-12%22%3A%20%22control%22%2C%22Test-23%22%3A%20%22variant-70%22%2C%22Test-AAAAA%22%3A%20%22variant%22%2C%22Test-24%22%3A%20%22control%22%2C%22Test-25%22%3A%20%22control%2"
        "2%2C%22Test-26%22%3A%20%22variant-75%22%2C%22WS-2515%22%3A%20%22WS-2515-variant1%22%2C%22WS-2351%22%3A%20%22WS-2351-variant1%22%7D; wcs_bt=s_2cb388a4aa34:1565935116; _px3=51c25ddccc41460714b0c77f9086094ebca4547fe6aff217bada6cd0a71b9cda:1K7R1eyox+K6FywON0Wjpr/BvHj0YRXaQx9pH45gDDO4QEcYa7eI+hSsgvjvtAdRFfNFo/12w1i3MBbgQsHVhA==:1000:4duS7MSxB3gm7SQSqY7aj6Hnnyzqw2hPcZl8z6X6Ee56B7pT4yuuroAOE6n43zXK+D22dsZWIFh4kp3252pn2sm9khCmkHbsNckMqPyDeKKSVWjo/8QOfv+t2pDd0D6nVliwyxyI5OVY9hhoBdkkKJS41SwORVvfALvpEDnEnBg="
    ),
    "user-agent": getheaders(),
    "Sec-Fetch-Mode": "cors",
}


def get_proxy():
    """Request one proxy from the local proxy-pool service.

    Returns:
        The JSON-decoded body of the service's ``/get/`` response.
    """
    response = requests.get("http://127.0.0.1:5055/get/")
    return response.json()


def delete_proxy(proxy):
    """Ask the local proxy-pool service to drop ``proxy``.

    Args:
        proxy: Value interpolated into the ``proxy`` query parameter of the
            service's ``/delete/`` endpoint.
    """
    delete_url = "http://127.0.0.1:5055/delete/?proxy={}".format(proxy)
    requests.get(delete_url)


def getHtml(product_id, timeout=10):
    """Download the Klook activity page for ``product_id`` via a pooled proxy.

    One proxy is taken from the proxy pool and the request is attempted up
    to five times with it. If every attempt fails, the proxy is removed from
    the pool.

    Args:
        product_id: Activity id; it is converted with ``str()`` and appended
            to the activity URL.
        timeout: Seconds to wait for the server before an attempt counts as
            failed. Without a timeout a dead proxy would block forever.

    Returns:
        The response body as text, or ``None`` when all attempts failed.
    """
    retry_count = 5
    proxy = get_proxy().get("proxy")
    print(proxy)

    url = 'https://www.klook.com/zh-CN/activity/' + str(product_id)
    if proxy:
        proxy_url = "http://{}".format(proxy)
        # requests selects a proxy by the scheme of the target URL. The
        # target is https, so an "https" entry is required; with only an
        # "http" entry the proxy is silently bypassed.
        proxies = {"http": proxy_url, "https": proxy_url}
    else:
        # The pool returned no proxy: fall back to a direct connection
        # instead of building a bogus "http://None" proxy URL.
        proxies = None

    while retry_count > 0:
        try:
            html = requests.get(url, proxies=proxies, headers=headers, timeout=timeout)
            print(html)
            return html.text
        except requests.RequestException:
            retry_count -= 1

    # All five attempts failed: remove the proxy from the pool.
    if proxy:
        delete_proxy(proxy)
    return None



def save_to_Mongo(result):
    """Insert ``result`` into the ``MONGO_TABLE`` collection of ``db``.

    Storage is best-effort: any error is caught and reported on stdout
    together with the exception, and nothing is re-raised.

    Args:
        result: A document (mapping) to insert, or a list of documents.
    """
    collection = db[MONGO_TABLE]
    try:
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one()/insert_many() are its replacements.
        if isinstance(result, list):
            collection.insert_many(result)
        else:
            collection.insert_one(result)
    except Exception as exc:
        # Include the exception so failures (for example a duplicate _id)
        # can be told apart.
        print('存储到MongoDb失败', result, exc)
    else:
        print('存储到MongoDB成功', result)


def get_product(product_id):
    """Fetch one activity page, extract its ``activityInfo`` JSON and store it.

    The page text is searched for a span that starts at ``activityInfo`` and
    ends at ``"noindex":false``; if there is none, ``"noindex":true`` is
    tried. The matched text is turned into a JSON object, its ``id`` key is
    renamed to ``_id`` and the document is handed to ``save_to_Mongo``.

    A product is skipped, with a message on stdout, when the page could not
    be downloaded, when neither pattern matches, or when the extracted text
    is not valid JSON.

    Args:
        product_id: Activity id passed through to ``getHtml``.
    """
    print("商品", str(product_id))
    res = getHtml(product_id)
    if res is None:
        print("get_product: no HTML returned, skipping", product_id)
        return

    # Same precedence as before: try the "false" variant on the whole page
    # first, and only then the "true" variant.
    match = None
    for noindex_value in ("false", "true"):
        match = re.search(r'activityInfo.*?"noindex":' + noindex_value, res)
        if match is not None:
            break
    if match is None:
        print("get_product: activityInfo not found, skipping", product_id)
        return

    # Drop the first 15 characters of the match and close the object.
    # NOTE(review): presumably those 15 characters are the "activityInfo"
    # label plus the separator before the opening brace -- confirm against
    # a live page.
    product_data = match.group(0)[15:] + '}'
    try:
        product_json = json.loads(product_data)
    except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
        print("get_product: invalid activityInfo JSON, skipping", product_id, exc)
        return

    product_json['_id'] = product_json.pop("id")
    save_to_Mongo(product_json)


def get_product_id():
    """Load every ``_id`` from the ``klook.products_id`` collection.

    Returns:
        A pandas Series named ``_id`` with a default integer index, holding
        the ``_id`` of each document in the order the cursor yields them.
        The Series is empty when the collection has no documents.
    """
    # Use the client as a context manager so the connection is closed once
    # the ids are read; a client is created on every call.
    with pymongo.MongoClient('localhost', 27017) as client:
        table = client['klook']['products_id']
        # Only _id is needed, so project it rather than loading whole documents.
        ids = [doc['_id'] for doc in table.find({}, {'_id': 1})]
    # An explicit dtype for the empty case avoids pandas' empty-Series
    # dtype warning; otherwise the dtype is inferred from the values.
    dtype = None if ids else object
    return pd.Series(ids, name='_id', dtype=dtype)


# MongoDB connection used by save_to_Mongo(): scraped documents are written
# to collection MONGO_TABLE of database MONGO_DB.
MONGO_URl = 'localhost:27017'
MONGO_DB = 'klook'
client = pymongo.MongoClient(MONGO_URl)
db = client[MONGO_DB]
MONGO_TABLE = 'products'

# Position in the id list at which crawling starts.
# NOTE(review): presumably a resume offset left over from an interrupted
# run -- set to 0 to crawl everything.
START_INDEX = 235

# Read the id list once. Calling get_product_id() in the loop header and
# again in the loop body re-read the whole collection on every iteration.
product_ids = get_product_id()
for i in range(START_INDEX, len(product_ids)):
    # Wait 5 seconds before each request.
    time.sleep(5)
    get_product(product_ids[i])
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 212,383评论 6 493
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 90,522评论 3 385
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 157,852评论 0 348
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 56,621评论 1 284
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 65,741评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 49,929评论 1 290
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,076评论 3 410
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 37,803评论 0 268
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,265评论 1 303
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 36,582评论 2 327
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 38,716评论 1 341
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,395评论 4 333
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,039评论 3 316
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 30,798评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,027评论 1 266
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 46,488评论 2 361
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 43,612评论 2 350

推荐阅读更多精彩内容