Python爬虫爬取基金数据

1 程序结构介绍

代码结构图:

代码结构图

代码目录结构:

代码目录结构

结果图:

结果图

2 数据源网站

https://www.dayfund.cn/incrank.html

3 GitHub地址

https://github.com/crazyjums/crawl_fund_data

4 文件代码:

main.py

from get_fund_code import *
from get_fund_data import *
from MysqlDB import MysqlFundCode
import time

def main():
    """Crawl detail data for every fund listed in the "指数型" table and time the run.

    Loads the (fund code, detail-table name) pairs through ``MysqlFundCode``,
    hands them to ``save_to_mysql`` and prints the elapsed wall-clock seconds.
    """
    started_at = time.time()
    print("程序正在运行....")
    code_pairs = MysqlFundCode().get_code_and_name_and_type("指数型")
    save_to_mysql(fund_code_lists=code_pairs)

    elapsed = time.time() - started_at
    print("一共运行了{}秒".format(elapsed))

# Run the crawl only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

get_fund_data.py

import logging
import sys
from bs4 import BeautifulSoup as bs
from MysqlDB import MysqlFundDetailData
from toolkit import LOG_FORMAT,DATE_FORMAT,get_year_mon_day,get_class_name,get_HTML_content

def get_refer_fund_detail_data(fund_code,start_date="2019-09-28",end_date="2019-12-28"):
    """Scrape the daily net-value history of one fund from dayfund.cn.

    :param fund_code: fund code placed in the page URL
    :param start_date: value sent as the ``sdate`` query parameter
    :param end_date: value sent as the ``edate`` query parameter
    :return: list of dicts with the keys ``date``, ``latest_nvalue_pu``,
        ``latest_sum_nvalue``, ``last_nvalue_pu``, ``last_sum_nvalue``,
        ``daily_growth`` and ``daily_growth_rate``. The first parsed row is
        dropped (presumably the table header -- TODO confirm against the page).
        An empty list is returned when the download or the parsing fails; the
        error is logged.
    """
    # (result key, index into the children of a <tr>). Only odd indexes are
    # read -- presumably the even ones are whitespace text nodes between the
    # cells; verify against the live page.
    cell_positions = (
        ("date", 1),
        ("latest_nvalue_pu", 7),
        ("latest_sum_nvalue", 9),
        ("last_nvalue_pu", 11),
        ("last_sum_nvalue", 13),
        ("daily_growth", 15),
        ("daily_growth_rate", 17),
    )
    try:
        url = "https://www.dayfund.cn/fundvalue/{}.html?sdate={}&edate={}".format(fund_code, start_date, end_date)
        resp = get_HTML_content(url)
        soup = bs(resp,"lxml")
        table = soup.find_all("table",attrs={"class":"mt1 clear"})[0]
        fund_lists = []
        for row in table.find_all("tr"):
            cells = list(row)
            # Index 17 is read below, so a row needs at least 18 children.
            if len(cells) < 18:
                continue
            fund_lists.append({key: cells[pos].string for key, pos in cell_positions})
        return fund_lists[1:]
    except Exception as e:
        logging.error("{} | {}".format(e,sys._getframe().f_code.co_name))
        # Callers iterate the result, so hand back an empty list rather than None.
        return []


def save_to_mysql(start_time=None,end_time=None, fund_code_lists=None):
    """Scrape the detail data of each fund and store it in its own MySQL table.

    :param start_time: first date to request; defaults to ``get_year_mon_day(y=1)``
    :param end_time: last date to request; defaults to ``get_year_mon_day()``
    :param fund_code_lists: sequence of records whose first item is the fund
        code and whose last item is the destination table name. ``None`` or
        an empty sequence only logs a message.
    :return: None
    """
    func_name = sys._getframe().f_code.co_name
    if not fund_code_lists:
        logging.info("列表为空,没有爬取到数据。| {}".format(func_name))
        return

    if start_time is None:
        start_time = get_year_mon_day(y=1)
    if end_time is None:
        end_time = get_year_mon_day()

    mysql = MysqlFundDetailData()
    for info in fund_code_lists:
        fund_code = info[0]
        table_name = info[-1]
        # A failed scrape may yield None; treat it as "no rows" so one bad
        # fund does not abort the whole run.
        fund_lists = get_refer_fund_detail_data(fund_code,start_time,end_time) or []
        if mysql.check_table_if_exist(table_name):
            logging.info("表已存在,正在将数据写入{}中... | {}".format(table_name, func_name))
        else:
            logging.info("{}表没有创建,正在创建... | {}".format(table_name, func_name))
            mysql.create_table(table_name)
            logging.info("创建成功!正在将数据写入{}中... | {}".format(table_name, func_name))
        for row in fund_lists:
            mysql.insert_into_table(table_name, row)
        logging.info("{}。写入成功。 | {}".format(table_name, func_name))



def get_name_data():
    """Print one ``temp_dict["<name>"] = <expr>`` line per assignment in an embedded snippet.

    Scratch code generator: every snippet line that contains exactly one
    ``=`` is turned into a dictionary assignment; other lines are ignored.
    """
    snippet = '''
        fund_type = i[2]
            date = i[3]
            nvalue_pu = i[4]
            day_growth_rate = i[5]
            a_week_rate = i[6]
            a_month_rate = i[7]
            _3_month_rate = i[8]
            _6_month_rate = i[9]
            a_year_rate = i[10]
            _2_year_rate = i[11]
            _3_year_rate = i[12]
            from_this_year = i[13]
            from_found_year = i[14]
            poundage = i[-2]
            purchase_money = i[-5]
        '''
    for raw_line in snippet.split("\n"):
        pieces = raw_line.strip().split("=")
        if len(pieces) != 2:
            continue
        target = pieces[0].strip()
        expr = pieces[-1].strip()
        print("temp_dict[\"{}\"] = {}".format(target, expr))

get_fund_code.py

import demjson,re
import logging,sys
import os,time
from MysqlDB import MysqlFundCode
from toolkit import LOG_FORMAT,DATE_FORMAT,get_year_mon_day,get_class_name,get_HTML_content

def get_fund_code_lists_by_page(page):
    """Download one page of the eastmoney fund ranking and parse it into dicts.

    Example request (100 rows per page, page number in ``pi``):
    https://fundapi.eastmoney.com/fundtradenew.aspx?ft=zs&sc=1n&st=desc&pi=1&pn=100&cp=&ct=&cd=&ms=&fr=&plevel=&fst=&ftype=&fr1=&fl=0&isab=1

    :param page: page number placed in the ``pi`` query parameter (callers in
        this file start at 1)
    :return: list with one dict per fund; an empty list when the download or
        the parsing fails (the error is logged)
    """
    # (result key, index into the split record), in the original key order.
    field_positions = (
        ("fund_code", 0),
        ("fund_name", 1),
        ("fund_type", 2),
        ("date", 3),
        ("nvalue_pu", 4),
        ("day_growth_rate", 5),
        ("a_week_rate", 6),
        ("a_month_rate", 7),
        ("_3_month_rate", 8),
        ("_6_month_rate", 9),
        ("a_year_rate", 10),
        ("_2_year_rate", 11),
        ("_3_year_rate", 12),
        ("from_this_year", 13),
        ("from_found_year", 14),
        ("poundage", -2),
        ("purchase_money", -5),
    )
    try:
        url = "https://fundapi.eastmoney.com/fundtradenew.aspx?ft=zs&sc=1n&st=desc&pi={}&pn=100&cp=&ct=&cd=&ms=&fr=&plevel=&fst=&ftype=&fr1=&fl=0&isab=1".format(page)
        content = get_HTML_content(url)
        # The first 15 characters and the last one are cut off before decoding
        # -- presumably the JavaScript assignment wrapped around the object
        # literal; TODO confirm against a live response.
        # "|" separates the fields of a record; it is swapped for two spaces
        # here and the record is split on two spaces below.
        payload = content[15:-1].replace("|", "  ")
        d = demjson.decode(payload)
        fund_info_lists = []
        for record in d["datas"]:
            fields = record.split("  ")
            fund_info_lists.append({key: fields[pos] for key, pos in field_positions})
        logging.info("{} | {}".format("第 {} 页数据抓取完成。".format(page),sys._getframe().f_code.co_name))
        return fund_info_lists
    except Exception as e:
        logging.error("{} | {}".format(e, sys._getframe().f_code.co_name))
        # Keep the return type stable so callers can iterate the result.
        return []



def get_total_page_num():
    """Return the number of result pages reported by the eastmoney ranking API.

    Requests page 1 and reads the ``allPages`` field of the decoded response.

    :return: page count as an int; 0 when the request or the parsing fails
        (the error is logged), so ``range(1, n + 1)`` in callers stays valid
    """
    try:
        url = "https://fundapi.eastmoney.com/fundtradenew.aspx?ft=zs&sc=1n&st=desc&pi=1&pn=100&cp=&ct=&cd=&ms=&fr=&plevel=&fst=&ftype=&fr1=&fl=0&isab=1"
        content = get_HTML_content(url)
        # Same unwrapping as in get_fund_code_lists_by_page: cut 15 leading
        # characters and 1 trailing character, then replace "|" by two spaces.
        payload = content[15:-1].replace("|", "  ")
        d = demjson.decode(payload)
        total_page = int(d["allPages"])
        logging.info("{} pages | {}".format(total_page, sys._getframe().f_code.co_name))
        return total_page
    except Exception as e:
        logging.error("{} | {}".format(e, sys._getframe().f_code.co_name))
        return 0


def get_all_fund_lists():
    """Collect the parsed fund rows of every result page.

    :return: list of per-page lists as produced by
        ``get_fund_code_lists_by_page``. Pages that could not be fetched are
        skipped, and an unknown page count yields an empty list.
    """
    # The page-count helper may report failure with a falsy value (None/0).
    total_pages = get_total_page_num() or 0
    all_fund_lists = []
    for page in range(1, total_pages + 1):
        page_rows = get_fund_code_lists_by_page(page)
        if page_rows is None:
            # The failure was already logged by the page fetcher; skipping
            # keeps None out of the result that callers iterate.
            continue
        all_fund_lists.append(page_rows)

    return all_fund_lists


def write_all_fund_lists_into_file(filename="all_fund_lists.txt"):
    """Dump every page of fund rows into a text file, one page per line.

    If the file already exists the data is appended between a timestamped
    header and a dashed footer; otherwise the file is created and only the
    data lines are written.

    :param filename: path of the output file
    """
    func_name = sys._getframe().f_code.co_name
    appending = os.path.exists(filename)
    mode = "a+" if appending else "w"
    with open(filename, mode, encoding="utf-8") as out:
        if appending:
            logging.info("{} 文件存在,正在追加... | {}".format(filename, func_name))
            out.write("\n\n")
            out.write("-"*20 + "这是新加的数据,时间:{}".format(time.ctime()) + "\n\n")
        else:
            logging.info("{} 文件不存在,正在创建并写数据... | {}".format(filename, func_name))
        # The pages are fetched only after the file has been opened, as before.
        for page_rows in get_all_fund_lists():
            out.write(str(page_rows))
            out.write("\n")
        if appending:
            out.write("\n" + "-"*20)


def get_name_data():
    """Print SQL/format boilerplate derived from an embedded ``fund_dict`` snippet.

    Scratch code generator. Four lines are printed, in this order:
    the ``insert into`` template, the quoted value placeholders, the
    ``name=each_data["name"]`` keyword list and the column list. Every list
    keeps its trailing comma.
    """
    import re
    s = '''fund_dict["date"] = date
            fund_dict["latest_nvalue_pu"] = latest_nvalue_pu
            fund_dict["latest_sum_nvalue"] = latest_sum_nvalue
            fund_dict["last_nvalue_pu"] = last_nvalue_pu
            fund_dict["last_sum_nvalue"] = last_sum_nvalue
            fund_dict["daily_growth"] = daily_growth
            fund_dict["daily_growth_rate"] = daily_growth_rate'''
    names = []
    for line in s.split("\n"):
        target = line.split("=")[0].strip()
        # Raw strings: "\[" and "\]" are invalid escapes in a normal literal.
        target = re.sub(r'fund_dict\["', "", target)
        target = re.sub(r'"\]', "", target)
        names.append(target)

    tt = "".join(name + "," for name in names)
    data = "".join("{}=each_data[\"{}\"],".format(name, name) for name in names)
    values = "".join(r"\'{" + name + r"}\'" + "," for name in names)
    sql = r"insert into {table_name} " + "({}) values({})".format(tt,values)
    print(sql)
    print(values)
    print(data)
    print(tt)

def get_sql():
    """Print one backtick-quoted ``VARCHAR(50)  NULL,`` column line per detail field.

    Scratch helper for writing the CREATE TABLE statement by hand.
    """
    column_names = "date,fund_name,latest_nvalue_pu,latest_sum_nvalue,last_nvalue_pu,last_sum_nvalue,daily_growth,daily_growth_rate".split(",")
    ddl_lines = "".join(
        "`" + column + "`" + "VARCHAR(50)  NULL," + "\n"
        for column in column_names
    )
    print(ddl_lines)

def save_to_mysql():
    """Store every fund row of the ranking in a table named after the fund type.

    The table name is the ``fund_type`` of the second row on page 1. The table
    is created when it does not exist yet, then every row of every page is
    inserted. Nothing is written when page 1 cannot be fetched or has fewer
    than two rows.
    """
    func_name = sys._getframe().f_code.co_name
    first_page = get_fund_code_lists_by_page(1)
    if not first_page or len(first_page) < 2:
        # Previously this surfaced as an unhandled TypeError/IndexError.
        logging.error("cannot determine fund type from page 1, nothing written | {}".format(func_name))
        return

    mysql = MysqlFundCode()
    table_name = first_page[1]["fund_type"]
    all_fund_lists = get_all_fund_lists()
    if mysql.check_table_if_exist(table_name=table_name):
        logging.info("表已存在,正在将数据写入{}中... | {}".format(table_name, func_name))
    else:
        logging.info("{}表没有创建,正在创建... | {}".format(table_name, func_name))
        mysql.create_table(table_name)
        logging.info("创建成功!正在将数据写入{}中... | {}".format(table_name, func_name))
    for fund_list in all_fund_lists:
        # A page that failed to download may be None; skip it.
        for each_data in fund_list or []:
            mysql.insert_into_table(table_name,each_data)
    logging.info("{}。写入成功。 | {}".format(table_name, func_name))

MysqlDB.py

import pymysql,logging,sys
from toolkit import LOG_FORMAT,DATE_FORMAT,get_year_mon_day,get_class_name


class MysqlFundCode():
    """Data-access helper for the fund-list tables of the ``fund_data`` MySQL database.

    Every method opens its own connection through :meth:`DB` and closes it
    before returning. ``count`` holds the number of rows inserted through
    this instance.
    """

    # Columns written by insert_into_table, in INSERT order; each one is also
    # the key read from the record dict.
    _COLUMNS = (
        "fund_code", "fund_name", "fund_type", "date", "nvalue_pu",
        "day_growth_rate", "a_week_rate", "a_month_rate", "_3_month_rate",
        "_6_month_rate", "a_year_rate", "_2_year_rate", "_3_year_rate",
        "from_this_year", "from_found_year", "poundage", "purchase_money",
    )

    def __init__(self):
        self.host = "127.0.0.1"
        self.user = "root"
        self.password = "root"
        self.database = "fund_data"
        self.charset = "utf8mb4"
        self.port = 3306
        self.count = 0

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` wrapped in backticks, with embedded backticks doubled."""
        return "`{}`".format(str(name).replace("`", "``"))

    def DB(self):
        """Open and return a new pymysql connection built from the instance settings."""
        # Keyword arguments: positional connect() arguments are rejected by
        # PyMySQL 1.0 and later.
        return pymysql.connect(host=self.host, user=self.user, password=self.password,
                               database=self.database, port=self.port, charset=self.charset)

    def insert_into_table(self, table_name,each_data):
        """Insert one fund record into ``table_name``.

        :param table_name: destination table
        :param each_data: dict holding a value for every name in ``_COLUMNS``
        :raises KeyError: when ``each_data`` lacks one of the columns

        Values are passed as query parameters, so quotes inside a value
        cannot break the statement and ``None`` is stored as NULL. Database
        errors are logged and swallowed; ``count`` grows by one on success.
        """
        values = [each_data[column] for column in self._COLUMNS]
        sql = "INSERT INTO {table} ({columns}) VALUES ({marks})".format(
            # pymysql applies %-formatting to the statement when parameters
            # are given, so a literal "%" in the table name must be doubled.
            table=self._quote_identifier(table_name).replace("%", "%%"),
            columns=", ".join("`{}`".format(column) for column in self._COLUMNS),
            marks=", ".join(["%s"] * len(self._COLUMNS)),
        )
        mysqlDB = self.DB()
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute(sql, values)
                mysqlDB.commit()
                if cursor.rowcount >= 1:
                    self.count += 1
        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
        finally:
            mysqlDB.close()

    def create_table(self,table_name):
        """Create the fund-list table ``table_name`` if it does not exist.

        The table has an auto-increment ``id`` plus one VARCHAR column per
        entry of ``_COLUMNS``.

        :return: True on success, False when the statement failed (logged)
        """
        # NOTE(review): the table uses CHARSET=utf8 while the connection uses
        # utf8mb4 -- confirm whether 4-byte characters can occur in the data.
        sql = '''
            CREATE TABLE IF NOT EXISTS {table_name}(
               `id`  bigint NOT NULL AUTO_INCREMENT ,
               `fund_code` VARCHAR(40)  NULL,
               `fund_name` VARCHAR(100)  NULL,
               `fund_type` VARCHAR(40) NULL,
               `date` VARCHAR(40) NULL,
               `nvalue_pu`  VARCHAR(40) NULL,
               `day_growth_rate` VARCHAR(40) NULL,
               `a_week_rate` VARCHAR(40) NULL,
               `a_month_rate` VARCHAR(40) NULL,
               `_3_month_rate` VARCHAR(40) NULL,
               `_6_month_rate` VARCHAR(40) NULL,
               `a_year_rate` VARCHAR(40) NULL,
               `_2_year_rate` VARCHAR(40) NULL,
               `_3_year_rate` VARCHAR(40) NULL,
               `from_this_year` VARCHAR(40) NULL,
               `from_found_year` VARCHAR(40) NULL,
               `poundage` VARCHAR(40)  NULL,
               `purchase_money` VARCHAR(40)  NULL,
               PRIMARY KEY ( `id` )
            )ENGINE=InnoDB DEFAULT CHARSET=utf8;
        '''.format(table_name=self._quote_identifier(table_name))
        mysqlDB = self.DB()
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute(sql)
                return True
        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
            return False
        finally:
            mysqlDB.close()

    def check_table_if_exist(self,table_name):
        """Return True when a table named exactly ``table_name`` exists.

        The comparison is an exact match: a substring test would report the
        table "指数型" as existing whenever any "<code>_<name>_指数型" detail
        table exists.

        :return: True/False; False as well when the query fails (logged)
        """
        mysqlDB = self.DB()
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute("show tables")
                return any(row[0] == table_name for row in cursor.fetchall())
        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
            return False
        finally:
            mysqlDB.close()

    def get_code_and_name_and_type(self,table_name):
        """Read every fund of ``table_name`` and derive its detail-table name.

        :return: list of ``[fund_code, "<fund_code>_<fund_name>_<fund_type>"]``
            pairs; an empty list when the query fails (logged)
        """
        sql = "SELECT fund_code,fund_name,fund_type FROM {}".format(self._quote_identifier(table_name))
        mysqlDB = self.DB()
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute(sql)
                # fetchall() yields one tuple per row: (code, name, type).
                return [
                    [row[0], "{}_{}_{}".format(row[0], row[1], row[2])]
                    for row in cursor.fetchall()
                ]
        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
            return []
        finally:
            mysqlDB.close()

    def show_data_rows(self):
        """Print the total number of rows stored across all tables of the database."""
        mysqlDB = self.DB()
        total_count = 0
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute("show tables")
                table_lists = [row[0] for row in cursor.fetchall()]
                for table in table_lists:
                    # Quoted: table names may contain characters that are not
                    # valid in a bare identifier.
                    cursor.execute("select count(*) from {}".format(self._quote_identifier(table)))
                    total_count += cursor.fetchall()[0][0]

                print("_"*20)
                print("from now on,there are {} lines data in database.".format(self.good_to_show(total_count)))
                print("_" * 20)

        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
        finally:
            mysqlDB.close()

    def show_insert_rows(self):
        """Print how many rows were inserted through this instance."""
        print("_" * 20)
        print("there total insert {} lines data in database.".format(self.good_to_show(self.count)))
        print("_" * 20)

    @staticmethod
    def good_to_show(num):
        """Abbreviate ``num`` with a Chinese magnitude unit and one decimal digit.

        Numbers with fewer than 5 digits are returned unchanged. 5-6 digits
        use 万 (10**4), 7 digits 百万 (10**6), 8 digits 千万 (10**7) and 9 or
        more digits 亿 (10**8). The decimal digit is truncated, not rounded.

        :param num: value to format; ``str(num)`` is what gets sliced
        :return: the formatted string
        """
        digits = str(num)
        length = len(digits)
        if length < 5:
            return digits
        if length <= 6:
            cut, unit = 4, "万"
        elif length == 7:
            cut, unit = 6, "百万"
        elif length == 8:
            cut, unit = 7, "千万"
        else:
            cut, unit = 8, "亿"
        # digits[-cut] is the first digit after the unit boundary; taking the
        # second character of the whole number is only right while the
        # integer part has a single digit.
        return digits[:-cut] + "." + digits[-cut] + unit



class MysqlFundDetailData():
    """Data-access helper for the per-fund detail tables of the ``fund_data`` database.

    Every method opens its own connection through :meth:`DB` and closes it
    before returning. ``count`` holds the number of rows inserted through
    this instance.
    """

    # Columns written by insert_into_table, in INSERT order; each one is also
    # the key read from the record dict.
    _COLUMNS = (
        "date", "latest_nvalue_pu", "latest_sum_nvalue", "last_nvalue_pu",
        "last_sum_nvalue", "daily_growth", "daily_growth_rate",
    )

    def __init__(self):
        self.host = "127.0.0.1"
        self.user = "root"
        self.password = "root"
        self.database = "fund_data"
        self.charset = "utf8mb4"
        self.port = 3306
        self.count = 0

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` wrapped in backticks, with embedded backticks doubled."""
        return "`{}`".format(str(name).replace("`", "``"))

    def DB(self):
        """Open and return a new pymysql connection built from the instance settings."""
        # Keyword arguments: positional connect() arguments are rejected by
        # PyMySQL 1.0 and later.
        return pymysql.connect(host=self.host, user=self.user, password=self.password,
                               database=self.database, port=self.port, charset=self.charset)

    def insert_into_table(self, table_name,each_data):
        """Insert one daily net-value record into ``table_name``.

        :param table_name: destination table
        :param each_data: dict holding a value for every name in ``_COLUMNS``
        :raises KeyError: when ``each_data`` lacks one of the columns

        Values are passed as query parameters, so quotes inside a value
        cannot break the statement and ``None`` is stored as NULL. Database
        errors are logged and swallowed; ``count`` grows by one on success.
        """
        values = [each_data[column] for column in self._COLUMNS]
        sql = "INSERT INTO {table} ({columns}) VALUES ({marks})".format(
            # pymysql applies %-formatting to the statement when parameters
            # are given, so a literal "%" in the table name must be doubled.
            table=self._quote_identifier(table_name).replace("%", "%%"),
            columns=", ".join("`{}`".format(column) for column in self._COLUMNS),
            marks=", ".join(["%s"] * len(self._COLUMNS)),
        )
        mysqlDB = self.DB()
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute(sql, values)
                mysqlDB.commit()
                if cursor.rowcount >= 1:
                    self.count += 1
        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
        finally:
            mysqlDB.close()

    def create_table(self,table_name):
        """Create the detail table ``table_name`` if it does not exist.

        The table has an auto-increment ``id`` plus one VARCHAR(50) column
        per entry of ``_COLUMNS``.

        :return: True on success, False when the statement failed (logged)
        """
        # NOTE(review): the table uses CHARSET=utf8 while the connection uses
        # utf8mb4 -- confirm whether 4-byte characters can occur in the data.
        sql = '''
            CREATE TABLE IF NOT EXISTS {table_name}(
               `id`  bigint NOT NULL AUTO_INCREMENT ,
               `date`VARCHAR(50)  NULL,
                `latest_nvalue_pu`VARCHAR(50)  NULL,
                `latest_sum_nvalue`VARCHAR(50)  NULL,
                `last_nvalue_pu`VARCHAR(50)  NULL,
                `last_sum_nvalue`VARCHAR(50)  NULL,
                `daily_growth`VARCHAR(50)  NULL,
                `daily_growth_rate`VARCHAR(50)  NULL,
               PRIMARY KEY ( `id` )
            )ENGINE=InnoDB DEFAULT CHARSET=utf8;
        '''.format(table_name=self._quote_identifier(table_name))
        mysqlDB = self.DB()
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute(sql)
                return True
        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
            return False
        finally:
            mysqlDB.close()

    def check_table_if_exist(self,table_name):
        """Return True when a table named exactly ``table_name`` exists.

        The comparison is an exact match; a substring test would also accept
        any longer table name that merely contains ``table_name``.

        :return: True/False; False as well when the query fails (logged)
        """
        mysqlDB = self.DB()
        try:
            with mysqlDB.cursor() as cursor:
                cursor.execute("show tables")
                return any(row[0] == table_name for row in cursor.fetchall())
        except Exception as e:
            logging.error("{} | {} | {}".format(e, get_class_name(self), sys._getframe().f_code.co_name))
            return False
        finally:
            mysqlDB.close()

toolkit.py

import logging
import os
import sys
import time
from random import randint

import requests

LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"



def get_year_mon_day(y = 0,m = 0,d = 0):
    """Return the local date shifted back by ``y`` years, ``m`` months and ``d`` days.

    Years and months are subtracted first; if the resulting month is shorter
    than today's day of month, the day is clamped to the end of that month.
    The days are subtracted afterwards. The result is always a real calendar
    date, whereas plain field subtraction could produce month 0 or day -3.

    :param y: years to go back
    :param m: months to go back
    :param d: days to go back
    :return: ``"YEAR-MONTH-DAY"`` without zero padding, e.g. ``"2019-9-5"``
    """
    import calendar
    from datetime import date, timedelta

    today = date.today()
    # Count months from year 0 so that borrowing across years is one divmod.
    month_index = today.year * 12 + (today.month - 1) - (y * 12 + m)
    year, month_zero_based = divmod(month_index, 12)
    month = month_zero_based + 1
    day = min(today.day, calendar.monthrange(year, month)[1])
    shifted = date(year, month, day) - timedelta(days=d)
    _time = "{}-{}-{}".format(shifted.year, shifted.month, shifted.day)
    return _time

# One log file per calendar day under ./log. The directory is created first
# because logging.basicConfig opens the file immediately and raises
# FileNotFoundError when the directory is missing.
filename = "log/mylog_{}.log".format(get_year_mon_day())
os.makedirs(os.path.dirname(filename), exist_ok=True)
logging.basicConfig(filename=filename, level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)


def get_class_name(self):
    """Return the name of the class of the object passed as ``self``."""
    cls = self.__class__
    return cls.__name__



def getUser_Agent():
    """Return a request-header dict holding one randomly chosen User-Agent.

    :return: ``{'User-Agent': <string>}``, with the string picked uniformly
        from the desktop and mobile browser identifiers listed below
        (duplicate entries are kept, so they are picked more often)
    """
    user_agents = [
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',   # Safari 5.1 - Mac
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',  # Safari 5.1 - Windows
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',   # IE 9.0 (closing parenthesis restored)
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',   # IE 8.0
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',   # IE 7.0
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',   # IE 6.0
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',   # Firefox 4.0.1 - Mac
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',     # Firefox 4.0.1 - Windows
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',     # Opera 11.11 - Mac
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',  # Opera 11.11 - Windows
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',    # Chrome 17.0 - Mac
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',  # Maxthon
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',  # Tencent Traveler
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',  # The World 2.x
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',  # The World 3.x (leading "M" restored)
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',  # Sogou browser 1.x
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',  # 360 browser
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',     # Avant
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',     # Green Browser
        # Mobile user agents
        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',     # Safari iOS 4.33 - iPhone
        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',     # Safari iOS 4.33 - iPod Touch
        'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',     # Safari iOS 4.33 - iPad
        'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',  # Android N1
        'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',  # QQ Browser for Android
        'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',  # Android Opera Mobile
        'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+',  # BlackBerry 9800
        'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0',  # WebOS HP Touchpad
        'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',  # Nokia N97
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',  # Windows Phone Mango
        'UCWEB7.0.2.37/28/999',  # UC (plain)
        'Openwave/ UCWEB7.0.2.37/28/999',  # UC Openwave
        'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',  # UC Opera
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",

    ]
    # randint is inclusive on both ends, hence len - 1.
    return {'User-Agent':user_agents[randint(0,len(user_agents)-1)]}




# One random User-Agent is picked at import time; get_HTML_content sends this
# same header dict with every request.
headers = getUser_Agent()

def get_HTML_content(url, timeout=10):
    """Download *url* and return the response body as UTF-8 decoded text.

    The request is sent with the module-level ``headers`` dict.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server before giving up.
        Without a timeout ``requests.get`` can block forever on a stalled
        connection, which would hang the whole crawl.
    :return: the page content as ``str``.
    :raises requests.exceptions.RequestException: on connection failure or
        when the timeout expires.
    :raises UnicodeDecodeError: if the body is not valid UTF-8.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Decode the raw bytes explicitly instead of relying on the encoding
    # guessed by requests.
    return response.content.decode("utf-8")
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 215,539评论 6 497
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 91,911评论 3 391
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 161,337评论 0 351
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 57,723评论 1 290
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 66,795评论 6 388
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 50,762评论 1 294
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,742评论 3 416
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,508评论 0 271
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,954评论 1 308
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,247评论 2 331
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,404评论 1 345
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,104评论 5 340
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,736评论 3 324
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,352评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,557评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,371评论 2 368
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,292评论 2 352

推荐阅读更多精彩内容

  • 之前就爬过拉勾网,但是遇到一些错误一直没有办法解决,果断放弃了,今天又重新试着写写看,对于一个菜鸟来说,真的都是处...
    小小佐阅读 1,367评论 0 0
  • 前言 爬虫就是请求网站并提取数据的自动化程序,其中请求,提取,自动化是爬虫的关键。Python作为一款出色的胶水语...
    王奥OX阅读 3,358评论 1 8
  • 这个春天真是赏了不少的花。 朋友说我成天和花花约会,我说不能辜负,要和春花谈场恋爱。 朋友...
    暮雨潇潇X阅读 206评论 0 1
  • 说到 香园 其实我不是第一次接触它了 但每次来到香园 就有种新的看法 在我印象中香园一直在不断完善 给所有慕名前来...
    甲雍幸饶阅读 1,070评论 0 0
  • 野花的香 蜜蜂喜嗅 人不曾闻到 野草的碧 大地欣赏 人不曾观看 花草虽小 亦在争天命 不为被注意 只为活自己
    狂野云豹阅读 239评论 -1 2