代码保存

# -*- coding: utf-8 -*-

"""
Created on Thu Jan 9 11:36:00 2020

@author: QIAOQICHAO258
"""

import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import codecs
import csv
import os
from math import radians, cos, sin, asin, sqrt
import math
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import mean_absolute_error

# Mapping from raw bazhuayu (octopus scraper) column names to the Chinese
# display names used throughout this script.
cols_dict = \
    {'bazhuayu_case.case_type': '案例类型',
     'bazhuayu_case.case_source': '案例来源',
     'bazhuayu_case.district': '区域',
     'bazhuayu_case.community_sources': '案例源小区名称',
     'bazhuayu_case.flr_total_ind': '总楼层',
     'bazhuayu_case.checked_time': '看房次数',
     'bazhuayu_case.attention_time': '关注量',
     'bazhuayu_case.browse_time': '浏览次数',
     'bazhuayu_case.list_time': '挂牌时间',
     'bazhuayu_case.transaction_time': '成交时间',
     'bazhuayu_case.list_totalprice': '挂牌总价(万元)',
     'bazhuayu_case.list_unitprice': '挂牌单价(元/㎡)',
     'bazhuayu_case.transaction_price': '成交总价(万元)',
     'bazhuayu_case.transaction_avg_price': '成交单价(元/㎡)',
     'bazhuayu_case.community_price': '案例源小区均价(元/㎡)',
     'bazhuayu_case.transaction_cycle': '成交周期',
     'bazhuayu_case.price_adjustment_times': '调价次数',
     'bazhuayu_case.longitude': '经度',
     'bazhuayu_case.latitude': '纬度'}

pi = math.pi
# Earth radius in kilometers (name keeps the original "REDIUS" spelling
# because other code in this file references it).
EARTH_REDIUS = 6378.137

# Approximate longitude degrees corresponding to 1 meter.
LONGITUDE_COEF = 0.000011

# Approximate latitude degrees corresponding to 1 meter.
LATITUDE_COEF = 0.000009

###################################### 读取数据 ###################################################

# Pipeline configuration sheet.
set_parm = pd.read_excel('小区画像配置/小区画像配置.xlsx')

# Target city from the configuration.
city = set_parm['xh_city'][0]

# NOTE(review): hard-coded override of the configured city — looks like a
# leftover debug line; confirm whether it should be removed.
city = '深圳市'

# POI (point-of-interest) data exported from Gaode/AMap.
path_poi = '高德POI/%s.csv' % city
with open(path_poi, 'r', encoding='utf-8') as f:
    poi = pd.read_csv(f)

# Community (residential compound) master list, restricted to the city.
path_community = '小区清单/八爪鱼小区清单20191230.csv'
community = pd.read_csv(path_community, engine='python', encoding='utf-8')
community = community[community['城市'] == city]

# Top-100 developer / property-management company lists.
# NOTE: `property` shadows the builtin, but downstream code references it.
developer = pd.read_excel('百强物业开发商/百强开发商.xlsx')
property = pd.read_excel('百强物业开发商/百强物业.xlsx')

# Community-name mapping table for this city.
yingshe = pd.read_excel('映射表/%s.xlsx' % city)

# Scraped listing/transaction cases for this city.
# NOTE(review): `error_bad_lines` was removed in pandas 2.0
# (use on_bad_lines='skip'); confirm the pinned pandas version.
txt_name = os.listdir('城市八爪鱼数据/%s' % city)[0]
with open('城市八爪鱼数据/%s/%s' % (city, txt_name), 'r', encoding='utf-8') as f:
    case_data = pd.read_csv(f, sep='$', error_bad_lines=False,
                            usecols=cols_dict.keys(), low_memory=True)

# Judicial-auction (foreclosure) records.
fapai = pd.read_csv('法拍数据/北京重庆大连哈尔滨_京东阿里法拍.csv', engine='python', encoding='utf8')

####################################################################################################
def mape(estimatory, X, y):
    """Mean absolute percentage error of ``estimatory`` on (X, y).

    Lower is better — callers that *maximize* a score (e.g. GridSearchCV)
    must negate this value.
    """
    y_ = estimatory.predict(X)
    return (np.abs(y - y_) / y).mean()

def rad(d):
    """Convert an angle from degrees to radians.

    Uses ``math.pi`` directly instead of the module-level ``pi`` alias so
    the helper is self-contained.
    """
    return d * math.pi / 180.0

# Distance between two latitude/longitude points.
def getDistance1(lat1, lng1, lat2, lng2):
    """Haversine distance in meters between two points given in degrees.

    Note the (lat, lng) argument order — the sibling ``getDistance`` takes
    (lon, lat, ...) instead. Relies on module-level ``rad`` and
    ``EARTH_REDIUS`` (km).
    """
    radLat1 = rad(lat1)
    radLat2 = rad(lat2)
    a = radLat1 - radLat2
    b = rad(lng1) - rad(lng2)
    s = 2 * math.asin(math.sqrt(math.pow(math.sin(a / 2), 2) +
                                math.cos(radLat1) * math.cos(radLat2) * math.pow(math.sin(b / 2), 2)))
    s = s * EARTH_REDIUS
    # Convert km to meters.
    return s * 1000

def getDistance(lon1, lat1, lon2, lat2):
    """Great-circle (haversine) distance in meters between two points
    given as decimal-degree longitude/latitude pairs."""
    # Degrees -> radians.
    lam1, phi1, lam2, phi2 = (radians(v) for v in (lon1, lat1, lon2, lat2))
    # Haversine formula.
    dlam = lam2 - lam1
    dphi = phi2 - phi1
    half_chord = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlam / 2) ** 2
    central_angle = 2 * asin(sqrt(half_chord))
    earth_radius_km = 6378.137  # mean Earth radius, kilometers
    return central_angle * earth_radius_km * 1000

def select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, len_coef):
    """Pre-filter POI rows to a square bounding box around a community.

    ``len_coef`` is the half-width of the box in meters; the module-level
    per-meter degree coefficients convert it to degrees.
    """
    # BUGFIX: the latitude bounds previously used LONGITUDE_COEF, making
    # the box taller than intended; use the latitude coefficient instead.
    lon_delta = LONGITUDE_COEF * len_coef
    lat_delta = LATITUDE_COEF * len_coef
    poi_data_spilt = poi_data[(temp_comm_longitude + lon_delta > poi_data['longitude']) &
                              (poi_data['longitude'] > temp_comm_longitude - lon_delta) &
                              (temp_comm_latitude + lat_delta > poi_data['latitude']) &
                              (poi_data['latitude'] > temp_comm_latitude - lat_delta)]
    return poi_data_spilt

# Nearest-distance features (e.g. closest gym) per community.
def get_turth_distance(poi_data_all, read_community_data):
    """For each POI category below, append a column to the community table
    holding each community's distance (meters) to the closest matching POI;
    0 when the city has no POI of that category.
    """
    dict_temp = {
        '体育馆': ["['综合体育馆']"],
        '幼儿园': ["['幼儿园']"],
        '小学': ["['小学']", "['中学', '小学']", "['小学', '中学']"],
        '购物中心': ["['购物中心']"],
        '三级甲等医院': ["['三级甲等医院']"],
        '政府机关': ["['区县级政府及事业单位']", "['省直辖市级政府及事业单位']"],
        '火车站': ["['火车站']"],
        '景区': ["['国家级景点']", "['省级景点']"],
        '公园': ["['公园']"],
        '高等院校': ["['高等院校']"],
        '地铁站': ["['地铁站']"],
        '飞机场': ["['飞机场']"],
        '城市中心': ["['城市中心']"],
        '城市广场': ["['城市广场']"],
    }
    data1 = dict_temp.items()
    community_data = read_community_data[['省份', '楼盘名称', '高德经度', '高德纬度']].values
    list_distance_min = []
    for key, value in data1:
        poi_data = poi_data_all.copy()
        # Categories needing multi-value or name-based filtering.
        if key == '政府机关':
            poi_data = poi_data[((poi_data['sub_category'] == "['区县级政府及事业单位']") |
                                 (poi_data['sub_category'] == "['省直辖市级政府及事业单位']")) &
                                (poi_data['name'].str.endswith('人民政府'))]
        elif key == '小学':
            poi_data = poi_data[((poi_data['sub_category'] == "['小学']") |
                                 (poi_data['sub_category'] == "['中学', '小学']") |
                                 (poi_data['sub_category'] == "['小学', '中学']"))]
        elif key == '景区':
            poi_data = poi_data[((poi_data['sub_category'] == "['国家级景点']") |
                                 (poi_data['sub_category'] == "['省级景点']"))]
        else:
            print(value)
            poi_data = poi_data[(poi_data['sub_category'] == value[0])]
        # Reset per category: one minimum distance per community.
        list_distance_min = []
        print(value, 'poi数据:', len(poi_data), '小区清单列表:', len(read_community_data))

        # Iterate over the community list.
        for temp_comm in tqdm(community_data):
            temp_comm_longitude = float(temp_comm[2])  # longitude
            temp_comm_latitude = float(temp_comm[3])   # latitude
            list_distance_temp = []
            temp_poi_data = poi_data[['mid_category', 'sub_category', 'longitude', 'latitude', 'name']].values

            for temp_poi in temp_poi_data:
                temp_poi_longitude = float(temp_poi[2])
                temp_poi_latitude = float(temp_poi[3])
                distance = getDistance(temp_comm_longitude, temp_comm_latitude,
                                       temp_poi_longitude, temp_poi_latitude)
                list_distance_temp.append(distance)
            distance_min = min(list_distance_temp) if len(list_distance_temp) != 0 else 0
            list_distance_min.append(distance_min)
        read_community_data['%s' % (key)] = list_distance_min
    return read_community_data

# Count-based features: POIs of each category within 1 km.
def get_turth_number(poi_data_all, read_community_data):
    """For each POI category below, append a column to the community table
    holding the number of matching POIs within 1000 m of each community."""
    dict_temp = {
        '商务写字楼': ["['商务写字楼']"],
        '丧葬设施': ["['丧葬设施']"],
        '商场': ["['商场']"],
        '路口名': ["['路口名']"],
        '公交车站相关': ["['公交车站相关']"],
        '工厂': ["['工厂']"],
    }
    data1 = dict_temp.items()
    community_data = read_community_data[['省份', '楼盘名称', '高德经度', '高德纬度']].values
    for key, value in data1:
        poi_data = poi_data_all.copy()
        print(value)
        poi_data = poi_data[(poi_data['sub_category'] == value[0])]
        # One count per community.
        list_number = []
        print(value, 'poi数据:', len(poi_data), '小区清单列表:', len(read_community_data))

        # Iterate over the community list.
        for temp_comm in tqdm(community_data):
            temp_comm_longitude = float(temp_comm[2])  # longitude
            temp_comm_latitude = float(temp_comm[3])   # latitude
            list_number_temp = []
            # Coarse bounding-box pre-filter before exact distances.
            poi_data_spilt = select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, 2000)
            temp_poi_data = poi_data_spilt[['mid_category', 'sub_category', 'longitude', 'latitude', 'name']].values

            for temp_poi in temp_poi_data:
                temp_poi_longitude = float(temp_poi[2])
                temp_poi_latitude = float(temp_poi[3])
                distance = getDistance(temp_comm_longitude, temp_comm_latitude,
                                       temp_poi_longitude, temp_poi_latitude)
                if distance <= 1000:
                    list_number_temp.append(distance)
            number = len(list_number_temp)
            list_number.append(number)
        read_community_data['%s' % (key)] = list_number
    return read_community_data

class CaseValue():
    """Aggregate scraped listing/transaction cases into per-community
    features (deal/listing counts, median floor count, median price,
    price-trend rate, price variance).

    Relies on module-level globals ``case_data``, ``yingshe`` and
    ``community`` loaded at import time.
    """

    def __init__(self, city_name):
        self.city_name = city_name
        self.df = self.read_and_process_data()
        self.data = self.get_info(self.df)
        self.result = self.match_c()
        self.result = self.fill_nan(self.result)

    def read_and_process_data(self):
        """Clean raw case rows: rename columns, map community IDs, derive a
        usable (year, month) time and a unit price per case, drop dups."""
        year, month = 2019, 1
        cols_dict = \
            {'bazhuayu_case.case_type': '案例类型',
             'bazhuayu_case.case_source': '案例来源',
             'bazhuayu_case.district': '区域',
             'bazhuayu_case.community_sources': '案例源小区名称',
             'bazhuayu_case.flr_total_ind': '总楼层',
             'bazhuayu_case.checked_time': '看房次数',
             'bazhuayu_case.attention_time': '关注量',
             'bazhuayu_case.browse_time': '浏览次数',
             'bazhuayu_case.list_time': '挂牌时间',
             'bazhuayu_case.transaction_time': '成交时间',
             'bazhuayu_case.list_totalprice': '挂牌总价(万元)',
             'bazhuayu_case.list_unitprice': '挂牌单价(元/㎡)',
             'bazhuayu_case.transaction_price': '成交总价(万元)',
             'bazhuayu_case.transaction_avg_price': '成交单价(元/㎡)',
             'bazhuayu_case.community_price': '案例源小区均价(元/㎡)',
             'bazhuayu_case.transaction_cycle': '成交周期',
             'bazhuayu_case.price_adjustment_times': '调价次数',
             'bazhuayu_case.longitude': '经度',
             'bazhuayu_case.latitude': '纬度'}

        # Rename scraped columns to display names.
        df = case_data.copy()
        df.columns = df.columns.map(cols_dict)

        # Strip commas so community names match the mapping table.
        df['案例源小区名称'] = df['案例源小区名称'].astype(str)
        df['案例源小区名称'] = df['案例源小区名称'].str.replace(',', '')

        # Map (district + community name) to the internal community ID.
        df_community = yingshe
        df_community['映射字段'] = df_community['district'] + df_community['community_sources']

        d = dict(zip(df_community['映射字段'], df_community['man_community_id']))
        df['映射字段'] = df['区域'] + df['案例源小区名称']
        df['匹配ID'] = df['映射字段'].map(d)
        n = df.shape[0]
        n_nan = df[df['匹配ID'].isna()].shape[0]
        print('总数据量:%s, 可映射数据%s, 不可映射数据%s' % (n, n - n_nan, n_nan))
        df = df[df['匹配ID'].notna()]

        def func(x):
            # Prefer the listing time; fall back to the transaction time.
            if pd.isna(x['挂牌时间']):
                if pd.isna(x['成交时间']):
                    return np.nan
                else:
                    return x['成交时间']
            else:
                return x['挂牌时间']

        def func2(x):
            # Use the transaction unit price; otherwise approximate it by
            # discounting the listing unit price by 5%.
            if pd.isna(x['成交单价(元/㎡)']):
                return x['挂牌单价(元/㎡)'] * 0.95
            else:
                return x['成交单价(元/㎡)']

        df['时间'] = df.apply(func, axis=1)
        # Strings longer than 10 chars are not plain dates — drop them.
        df['时间'] = df['时间'].map(lambda x: np.nan if len(str(x)) > 10 else x)
        df['时间'] = pd.to_datetime(df['时间'], errors='coerce')
        df = df[df['时间'].notna()]
        df = df[df['时间'] > pd.to_datetime('%s-%s-01' % (year, month))]
        df['时间'] = df['时间'].map(lambda x: (x.year, x.month))

        cols = ['案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
                '成交单价(元/㎡)', '总楼层']
        for col in cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df['价格'] = df[['挂牌单价(元/㎡)', '成交单价(元/㎡)']].apply(func2, axis=1)
        df = df[['匹配ID', '案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
                 '成交单价(元/㎡)', '总楼层', '时间', '价格']]
        df.drop_duplicates(subset=['匹配ID', '价格'], inplace=True)
        return df

    def get_info(self, df):
        """Per community: deal/listing counts, median floor count, a price
        (case-source average preferred, <1000 treated as invalid), the mean
        month-over-month price change rate, and the price variance."""
        data = []
        for c_id, group in tqdm(df.groupby(['匹配ID']), desc='挂牌数据计算中'):

            n_chengjiao = group[group['成交单价(元/㎡)'].notna()].shape[0]
            n_guapai = group[group['挂牌单价(元/㎡)'].notna()].shape[0]

            n_floor = group['总楼层'].median()
            avg_price = group['价格'].median()
            case_price = group['案例源小区均价(元/㎡)'].median()
            var_price = group['案例源小区均价(元/㎡)'].var()
            if pd.isna(case_price):
                price = avg_price
                if price < 1000:
                    price = np.nan
            else:
                price = case_price
                if price < 1000:
                    price = np.nan

            # Mean relative month-over-month price change.
            a = group.groupby('时间').mean()
            rate = (a['价格'].diff() / a['价格']).mean()
            data.append([c_id, n_chengjiao, n_guapai, n_floor, price, rate, var_price])
        data = pd.DataFrame(data, columns=['楼盘ID', '成交数量', '挂牌数量', '总楼层', '小区均价', '抗跌率', '价格方差'])
        return data

    def match_c(self):
        """Left-join the per-community features onto the community list."""
        result = pd.merge(community[['楼盘ID']], self.data,
                          on='楼盘ID', how='left')
        return result

    def fill_nan(self, result):
        """Fill missing counts with 0 and floor/rate with the 0.4 quantile."""
        result['成交数量'].fillna(0, inplace=True)
        result['挂牌数量'].fillna(0, inplace=True)
        result['总楼层'].fillna(result['总楼层'].quantile(0.4), inplace=True)
        result['抗跌率'].fillna(result['抗跌率'].quantile(0.4), inplace=True)
        return result

# Fill missing values in the merged community table.
def community_data(df):
    """Impute missing community attributes and derive the parking ratio.

    Green/plot ratios are filled with the group mean by housing type; other
    fields fall back to column-level mode/mean; parking counts default to 0.
    Returns the imputed frame with an added '车位比' column.
    """
    # Fill missing housing type with the modal value.
    # BUGFIX: the original stringified the whole mode() Series and then
    # discarded the fillna() result (no-op); fill the column in place.
    housing_mode = df['房屋类型(小类)'].mode()
    if not housing_mode.empty:
        df['房屋类型(小类)'] = df['房屋类型(小类)'].fillna(housing_mode.iloc[0])

    # Fill greening rate / plot ratio with the mean of the same housing type.
    cols = [col for col in df.columns if col in ['绿化率', '容积率']]
    gp_col = '房屋类型(小类)'
    for col in cols:
        df[col] = df[col].fillna(df.groupby(gp_col)[col].transform('mean'))

    # Coerce numeric fields that were scraped as text.
    df['占地面积'] = pd.to_numeric(df['占地面积'], errors='coerce')
    df['地上车位'] = pd.to_numeric(df['地上车位'], errors='coerce')
    df['地下车位'] = pd.to_numeric(df['地下车位'], errors='coerce')

    # Column-level fallbacks.
    # BUGFIX: take the first modal value instead of int()/str() on a Series.
    fill_values = {'占地面积': df['占地面积'].mean(),
                   '总户数': df['总户数'].mean(),
                   '绿化率': df['绿化率'].mean(),
                   '容积率': df['容积率'].mean(),
                   '地上车位': 0,
                   '地下车位': 0}
    build_mode = df['建成年份'].mode()
    if not build_mode.empty:
        fill_values['建成年份'] = int(build_mode.iloc[0])
    manage_mode = df['管理形式'].mode()
    if not manage_mode.empty:
        fill_values['管理形式'] = manage_mode.iloc[0]

    column_list = df.fillna(fill_values)

    # Parking spaces (above + below ground) per household.
    column_list['车位比'] = (column_list['地上车位'] + column_list['地下车位']) / column_list['总户数']
    return column_list

def value2score(data):
    """Turn raw feature columns into 1-5 quantile scores and accumulate them
    into the six category columns configured in ``set_parm``.

    For each scored column, ``usecols`` stores (t, category) where t == 1
    means smaller raw values score higher and t == 0 means larger values
    score higher.
    """
    # Build feature -> category mapping from the configuration sheet.
    category_config = {'宜居程度': 'xh_livable', '生活配套': 'xh_life',
                       '区位状况': 'xh_location', '楼盘品质': 'xh_house',
                       '活跃程度': 'xh_active', '不利因素': 'xh_disadvantage'}
    dict_parm = {}
    for category, config_col in category_config.items():
        for parm in set_parm[config_col].dropna().values:
            dict_parm[parm] = category

    usecols = {
        '地铁站': (1, dict_parm['地铁站']),
        '建成年份': (0, dict_parm['建成年份']),
        '绿化率': (0, dict_parm['绿化率']),
        '容积率': (0, dict_parm['容积率']),
        '体育馆': (1, dict_parm['体育馆']),
        '幼儿园': (1, dict_parm['幼儿园']),
        '总户数': (0, dict_parm['总户数']),
        '占地面积': (0, '楼盘品质'),
        '小学': (1, dict_parm['小学']),
        '购物中心': (1, dict_parm['购物中心']),
        '三级甲等医院': (1, dict_parm['三级甲等医院']),
        '政府机关': (0, dict_parm['政府机关']),
        '景区': (1, dict_parm['景区']),
        '公园': (1, dict_parm['公园']),
        '高等院校': (1, dict_parm['高等院校']),
        '城市中心': (1, '区位状况'),
        '城市广场': (0, dict_parm['城市广场']),
        '商务写字楼': (0, dict_parm['商务写字楼']),
        '丧葬设施': (1, dict_parm['丧葬设施']),
        '商场': (0, dict_parm['商场']),
        '路口名': (0, dict_parm['路口名']),
        '公交车站相关': (0, dict_parm['公交车站相关']),
        '工厂': (0, dict_parm['工厂']),
        '成交数量': (0, dict_parm['成交数量']),
        '挂牌数量': (0, dict_parm['挂牌数量']),
        '总楼层': (0, dict_parm['总楼层']),
        '抗跌率': (0, '活跃程度'),
        '车位比': (0, dict_parm['车位比']),
        '加油站': (0, dict_parm['加油站']),
        '火车站': (0, dict_parm['火车站']),
        '机场': (0, dict_parm['机场']),
        '行政区分数': (0, dict_parm['行政区分数']),
    }
    data['建成年份'] = pd.to_numeric(data['建成年份'], errors='coerce')
    data['建成年份'] = data['建成年份'].fillna(data['建成年份'].quantile(0.5))
    # District score = mean community price within each district.
    distruct_value = data[['小区均价', '行政区']].groupby('行政区').mean().fillna(0)
    dv_dict = dict(zip(distruct_value.index, distruct_value['小区均价']))
    data['行政区分数'] = data['行政区'].map(dv_dict)

    # Score each configured column by quantile bands, then accumulate into
    # its category column. Iterate over a snapshot of the columns because
    # new '*分数' columns are added inside the loop.
    # (Replaces the deprecated DataFrame.iteritems() and removes the dead
    # `false = False; if false: ...` scaffolding of the original.)
    for col_name in list(data.columns):
        if col_name not in usecols:
            continue

        t, cat = usecols[col_name]
        score_col = col_name + '分数'
        q75 = data[col_name].quantile(0.75)
        q50 = data[col_name].quantile(0.50)
        q30 = data[col_name].quantile(0.30)
        q15 = data[col_name].quantile(0.15)
        # t == 0 -> 5/4/3/2/1 from high to low quantiles; t == 1 inverted.
        data.loc[data[col_name] >= q75, score_col] = t * -4 + 5
        data.loc[(data[col_name] < q75) & (data[col_name] >= q50), score_col] = t * -2 + 4
        data.loc[(data[col_name] < q50) & (data[col_name] >= q30), score_col] = 3
        data.loc[(data[col_name] < q30) & (data[col_name] >= q15), score_col] = t * 2 + 2
        data.loc[data[col_name] < q15, score_col] = t * 4 + 1

        if cat in data.columns:
            data[cat] = pd.to_numeric(data[cat], errors='coerce')
            data[cat] += data[score_col]
        else:
            data[cat] = data[score_col]
    return data

def get_label(data):
    """Attach descriptive tag columns (foreclosure, plot/green ratio, size,
    parking, building age, proximity to facilities, activity, price
    stability, ...) and join the configured tags into '展示标签'.

    Relies on module-level globals: ``developer``, ``property``,
    ``set_parm`` and ``fapai``. Thresholds come from the config sheet.
    """
    developers = developer['公司名称'].values
    propertys = property['公司名称'].values

    data_temp = data
    # --- thresholds from the configuration sheet ---
    xh_viewpoint = set_parm['xh_viewpoint'][0]    # scenic spot distance
    xh_park = set_parm['xh_park'][0]              # park distance
    xh_hospital = set_parm['xh_hospital'][0]      # hospital distance
    xh_business = set_parm['xh_business'][0]      # business-district count
    xh_subway = set_parm['xh_subway'][0]          # subway distance
    xh_government = set_parm['xh_government'][0]  # government distance
    xh_shopping = set_parm['xh_shopping'][0]      # shopping-center distance
    xh_number = set_parm['xh_number'][0]          # household count
    xh_car_first = set_parm['xh_car'][0]          # parking-ratio bounds
    xh_car_second = set_parm['xh_car'][1]
    xh_plot_first = set_parm['xh_plot'][0]        # plot-ratio bounds
    xh_plot_second = set_parm['xh_plot'][1]
    xh_green_first = set_parm['xh_green'][0]      # green-ratio bounds
    xh_green_second = set_parm['xh_green'][1]
    xh_trian = set_parm['xh_trian'][0]            # train-station distance
    xh_airport = set_parm['xh_airport'][0]        # airport distance
    xh_funeal = set_parm['xh_funeal'][0]          # funeral-facility count
    xh_factory = set_parm['xh_factory'][0]        # factory count
    xh_vendue_first = set_parm['xh_vendue'][0]    # foreclosure-count bounds
    xh_vendue_second = set_parm['xh_vendue'][1]

    # Label columns concatenated into '展示标签' at the end.
    # BUGFIX: the original later reassigned `show_cols = xh_label`, an
    # undefined name (NameError). Use the configured list from set_parm.
    # NOTE(review): an intermediate (overwritten) line selected all '*标签'
    # columns instead — confirm which set is intended.
    show_cols = list(set_parm['xh_label'].values)

    # Count distinct foreclosure records per community.
    dffb = fapai
    data_temp['法拍数量'] = [len(dffb[dffb['匹配ID'] == ID].drop_duplicates(subset=['匹配ID', 'title']))
                         for ID in data_temp['楼盘ID'].values]

    data_temp.loc[(data_temp['法拍数量'] > 0) & (data_temp['法拍数量'] <= xh_vendue_first), '法拍标签'] = '含法拍'
    data_temp.loc[(data_temp['法拍数量'] > xh_vendue_first) & (data_temp['法拍数量'] < xh_vendue_second), '法拍标签'] = '法拍数量多'
    data_temp.loc[data_temp['法拍数量'] >= xh_vendue_second, '法拍标签'] = '法拍数量极多'

    # Plot ratio (a duplicated assignment line in the original was removed).
    data_temp['容积率'] = pd.to_numeric(data_temp['容积率'], errors='coerce')
    data_temp.loc[data_temp['容积率'] <= xh_plot_first, '容积率标签'] = '容积率低'
    data_temp.loc[(data_temp['容积率'] > xh_plot_first) & (data_temp['容积率'] < xh_plot_second), '容积率标签'] = '容积率中'
    data_temp.loc[data_temp['容积率'] >= xh_plot_second, '容积率标签'] = '容积率高'

    # Greening ratio.
    data_temp['绿化率'] = pd.to_numeric(data_temp['绿化率'], errors='coerce')
    data_temp.loc[data_temp['绿化率'] <= xh_green_first, '绿化率标签'] = '绿化率低'
    data_temp.loc[(data_temp['绿化率'] > xh_green_first) & (data_temp['绿化率'] < xh_green_second), '绿化率标签'] = '绿化率中'
    data_temp.loc[data_temp['绿化率'] >= xh_green_second, '绿化率标签'] = '绿化率高'

    # Community size.
    data_temp['总户数'] = pd.to_numeric(data_temp['总户数'], errors='coerce')
    data_temp.loc[(data_temp['总户数'] >= data_temp['总户数'].quantile(0.85)) & (data_temp['总户数'] > xh_number), '小区规模标签'] = '大型社区'
    data_temp.loc[data_temp['总户数'] <= data_temp['总户数'].quantile(0.15), '小区规模标签'] = '小型社区'

    # Parking availability.
    data_temp.loc[(data_temp['车位比'] <= xh_car_first), '停车位标签'] = '停车位紧缺'
    data_temp.loc[(data_temp['车位比'] > xh_car_first) & (data_temp['车位比'] < xh_car_second), '停车位标签'] = '停车位一般'
    data_temp.loc[data_temp['车位比'] >= xh_car_second, '停车位标签'] = '停车位充裕'

    # Building age.
    data_temp['建成年份'] = pd.to_numeric(data_temp['建成年份'], errors='coerce')
    data_temp.loc[data_temp['建成年份'] >= 2015, '楼龄标签'] = '次新房'
    data_temp.loc[(data_temp['建成年份'] >= 2010) & (data_temp['建成年份'] < 2015), '楼龄标签'] = '6-10年楼龄'
    data_temp.loc[(data_temp['建成年份'] >= 2005) & (data_temp['建成年份'] < 2010), '楼龄标签'] = '10-15年楼龄'
    data_temp.loc[(data_temp['建成年份'] >= 2000) & (data_temp['建成年份'] < 2005), '楼龄标签'] = '15-20年楼龄'
    data_temp.loc[data_temp['建成年份'] < 2000, '楼龄标签'] = '老旧小区'

    # Top-100 developer / property manager.
    data_temp.loc[data_temp['开发商'].isin(developers), '百强开发商标签'] = '百强开发商'
    data_temp.loc[data_temp['物业公司'].isin(propertys), '百强物业标签'] = '百强物业'

    # Listing activity.
    data_temp.loc[data_temp['挂牌数量'] >= data_temp['挂牌数量'].quantile(0.85), '活跃度标签'] = '挂盘活跃'
    # BUGFIX: the low-activity tag previously overwrote '停车位标签'.
    data_temp.loc[data_temp['挂牌数量'] <= data_temp['挂牌数量'].quantile(0.15), '活跃度标签'] = '挂盘不活跃'

    # Proximity labels (distance in meters unless noted).
    data_temp.loc[data_temp['购物中心'] < xh_shopping, '购物中心标签'] = '近购物中心'
    data_temp.loc[data_temp['三级甲等医院'] < xh_hospital, '三级甲等医院标签'] = '近三甲医院'
    data_temp.loc[data_temp['政府机关'] < xh_government, '政府机关标签'] = '近政府机关'
    data_temp.loc[data_temp['火车站'] < xh_trian, '火车站标签'] = '近火车站'
    data_temp.loc[data_temp['景区'] < xh_viewpoint, '景区标签'] = '近景区'
    data_temp.loc[data_temp['公园'] < xh_park, '公园标签'] = '近公园'
    data_temp.loc[data_temp['地铁站'] < xh_subway, '地铁标签'] = '近地铁站'
    data_temp.loc[data_temp['飞机场'] < xh_airport, '机场标签'] = '距离机场过近'
    # NOTE(review): '丧葬设施'/'工厂' here are within-1km counts, and higher
    # counts are tagged as "too close" — confirm thresholds are counts too.
    data_temp.loc[data_temp['丧葬设施'] > xh_funeal, '丧葬设施标签'] = '距离丧葬设施过近'
    data_temp.loc[data_temp['工厂'] > xh_factory, '工厂标签'] = '距离工厂过近'
    data_temp.loc[(data_temp['商务写字楼'] >= data_temp['商务写字楼'].quantile(0.85)) & (data_temp['商务写字楼'] > xh_business), '商务区标签'] = '商务区'

    # Supporting facilities.
    data_temp['配套设施分数'] = data_temp['公交车站相关分数'] + data_temp['三级甲等医院分数'] + data_temp['商场分数'] + data_temp['体育馆分数']
    data_temp.loc[(data_temp['配套设施分数'] >= data_temp['配套设施分数'].quantile(0.85)), '配套设施标签'] = '配套设施完善'

    # Transaction activity.
    data_temp['交易活跃度分数'] = data_temp['成交数量分数'] + data_temp['挂牌数量分数']
    transa_temp = data_temp[data_temp['交易活跃度分数'] > 0]['交易活跃度分数']
    transa_first = transa_temp.quantile(0.8)
    transa_second = transa_temp.quantile(0.3)
    data_temp.loc[(data_temp['交易活跃度分数'] >= transa_first), '交易活跃度标签'] = '交易活跃'
    data_temp.loc[(data_temp['交易活跃度分数'] >= transa_second) & (data_temp['交易活跃度分数'] < transa_first), '交易活跃度标签'] = '交易一般活跃'
    data_temp.loc[(data_temp['交易活跃度分数'] < transa_second), '交易活跃度标签'] = '交易不活跃'
    data_temp.loc[(data_temp['交易活跃度分数'] == 0), '交易活跃度标签'] = '无交易'

    # Price volatility.
    data_temp.loc[abs(data_temp['抗跌率']) >= 0.1, '价格波动率标签'] = '价格波动大'
    data_temp.loc[(abs(data_temp['抗跌率']) > 0.05) & (abs(data_temp['抗跌率']) < 0.1), '价格波动率标签'] = '价格波动一般'
    data_temp.loc[abs(data_temp['抗跌率']) <= 0.05, '价格波动率标签'] = '价格稳定'

    # Value for money.
    data_temp['性价比'] = data_temp['综合评分'] / data_temp['小区均价']

    # Join the configured label columns into the display string.
    for idx, row in data_temp.iterrows():
        labels = []
        for col in show_cols:
            if pd.isna(row[col]):
                pass
            else:
                labels.append(row[col])

        data_temp.loc[idx, '展示标签'] = ','.join(labels)

    return data_temp

def train(data):
    """Grid-search an XGBoost regressor on communities with a known average
    price, then predict a '综合评分' (composite score) for every community.

    The feature matrix is every column of ``use_cols`` except the last
    ('小区均价'), which is the regression target.
    """
    use_cols = ['容积率',
                '绿化率',
                '占地面积',
                '建成年份',
                '总楼层',
                '城市中心',
                '抗跌率',
                '购物中心',
                '三级甲等医院',
                '高等院校',
                '商务写字楼',
                '公交车站相关',
                '挂牌数量',
                '路口名',
                '商场',
                '工厂',
                '政府机关',
                '丧葬设施',
                '体育馆',
                '公园',
                '地铁站',
                '小学',
                '幼儿园',
                '火车站',
                '小区均价']

    train_data = data[use_cols]
    X = train_data[train_data['小区均价'].notna()].iloc[:, :-1]
    y = train_data[train_data['小区均价'].notna()].iloc[:, -1]
    model = XGBRegressor(n_jobs=-1)
    param_grid = {'max_depth': np.arange(2, 8, 1),
                  'gamma': np.arange(0.5, 0.8, 0.1),
                  'colsample_bytree': np.arange(0.5, 0.8, 0.1)}

    # BUGFIX: GridSearchCV *maximizes* the scoring callable, but mape is an
    # error (lower is better) — the original therefore selected the WORST
    # parameter set. Negate it so higher is better.
    def neg_mape(estimator, X_val, y_val):
        return -mape(estimator, X_val, y_val)

    gs = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, verbose=1, cv=5, scoring=neg_mape)
    gs.fit(X, y)
    print(gs.best_params_)
    print(gs.best_score_)
    # Predict for all rows (including those without a known price).
    X_ = train_data.iloc[:, :-1]
    y_ = gs.predict(X_)
    data['综合评分'] = y_
    return data

def change_poi(poi_data):
    """Split the 'location' column ("lng,lat" strings) into float
    'longitude' and 'latitude' columns, mutating and returning ``poi_data``.
    """
    # Replaces the original side-effect list comprehension with plain
    # comprehensions; float() both parses and types the values in one pass.
    location = poi_data['location'].str.split(',')
    poi_data['longitude'] = [float(parts[0]) for parts in location]
    poi_data['latitude'] = [float(parts[1]) for parts in location]
    return poi_data

def get_level(result):
    """Assign quantile letter grades (A-E) for the composite score and the
    community price, then rescale the six category scores and the composite
    score onto a uniform 5-10 range by rank."""

    def _assign_grades(src, dst):
        # A..E by descending 20%-quantile bands of `src`.
        q = result[src].quantile
        result.loc[result[src] >= q(0.8), dst] = 'A'
        result.loc[(result[src] < q(0.8)) & (result[src] >= q(0.6)), dst] = 'B'
        result.loc[(result[src] < q(0.6)) & (result[src] >= q(0.4)), dst] = 'C'
        result.loc[(result[src] < q(0.4)) & (result[src] >= q(0.2)), dst] = 'D'
        result.loc[result[src] < q(0.2), dst] = 'E'

    _assign_grades('综合评分', '综合评级')

    # Community price -> price grade.
    _assign_grades('小区均价', '小区价格评级')

    # Communities without a price grade inherit the overall grade.
    missing_price_grade = result['小区价格评级'].isna()
    result.loc[missing_price_grade, '小区价格评级'] = result.loc[missing_price_grade, '综合评级']

    # Re-map each category score onto an evenly spaced 5-10 scale by rank.
    total = result.shape[0]
    for col in ['楼盘品质', '宜居程度', '区位状况',
                '生活配套', '活跃程度', '不利因素', '综合评分']:
        result.sort_values(col, inplace=True)
        result[col] = np.linspace(5, 10, total)

    return result

def get_risk(all_community_data):
    """Map each (price grade, overall grade) pair to a risk strategy
    ('正常' / '关注' / '谨慎') via a fixed 5x5 lookup table."""
    grades = ['A', 'B', 'C', 'D', 'E']
    # Row = price grade, column = overall grade.
    strategies = ['正常', '关注', '谨慎', '谨慎', '谨慎',
                  '正常', '正常', '关注', '谨慎', '谨慎',
                  '正常', '正常', '正常', '关注', '谨慎',
                  '正常', '正常', '关注', '谨慎', '谨慎',
                  '正常', '关注', '谨慎', '谨慎', '谨慎']
    pairs = [(price, overall) for price in grades for overall in grades]
    lookup = dict(zip(pairs, strategies))
    all_community_data['风险策略'] = [lookup[(price, score)]
                                  for price, score in all_community_data[['小区价格评级', '综合评级']].values]
    return all_community_data

def high_quality_community(data):
    """Mark communities passing every quality filter with '优质小区' = '是'.

    Filters: decent grades, acceptable risk, ordinary housing, built after
    2000, stable prices, no foreclosures, top value-for-money and low
    price variance. Prints the surviving row count after each step.
    """
    candidates = data.copy()
    candidates = candidates[candidates['综合评级'].isin(['C', 'B', 'A'])]
    print('综合评级:', len(candidates))
    candidates = candidates[candidates['小区价格评级'].isin(['A', 'B', 'C', 'D'])]
    print('小区价格评级:', len(candidates))
    candidates = candidates[candidates['风险策略'].isin(['正常', '关注'])]
    print('风险策略:', len(candidates))
    candidates = candidates[candidates['房屋类型(小类)'] == '普通住宅']
    print('房屋类型:', len(candidates))
    candidates = candidates[candidates['建成年份'] > 2000]
    print('建成年份:', len(candidates))

    # |抗跌率| must lie in [0, 0.05] — stable prices only.
    upper_bound = 0.05
    lower_bound = 0
    candidates = candidates[candidates['抗跌率'].abs().between(lower_bound, upper_bound)]

    candidates = candidates[candidates['价格波动率标签'] == '价格稳定']
    print('价格波动率:', len(candidates))
    # Liquidity: no foreclosure records.
    candidates = candidates[candidates['法拍数量'] == 0]
    print('法拍数量:', len(candidates))
    # Value for money: top 40%.
    candidates = candidates[candidates['性价比'] > candidates['性价比'].quantile(0.6)]
    print('性价比:', len(candidates))
    candidates = candidates[candidates['价格方差'] < candidates['价格方差'].quantile(0.4)]
    print('价格集中度:', len(candidates))
    data.loc[candidates.index, '优质小区'] = '是'
    return data

# BUGFIX: `if name=='main':` — the dunders were eaten by formatting.
if __name__ == '__main__':

    print('城市: ', city)
    start_time1 = time.time()

    # Parse POI coordinates from the raw 'location' strings.
    poi_data = change_poi(poi)

    # Community list must have Gaode coordinates.
    read_community_data = community
    read_community_data.dropna(subset=['高德经度'], inplace=True)

    # TODO: 解析POI数据 — nearest-distance and within-1km-count features.
    start_time = time.time()
    read_community_data = get_turth_distance(poi_data, read_community_data)
    read_community_data = get_turth_number(poi_data, read_community_data)
    print('poi数据运行时间:', time.time() - start_time)

    # TODO: 解析案例数据 — listing/transaction case features.
    start_time = time.time()
    case_value = CaseValue(city)
    result = case_value.result
    read_community_data = pd.merge(read_community_data, result, on='楼盘ID', how='left')
    print('案例数据运行时间:', time.time() - start_time)

    # TODO: 处理小区清单数据 — impute community attributes.
    start_time = time.time()
    read_community_data = community_data(read_community_data)
    print('全部运行时间:', time.time() - start_time1)

    # TODO: 原始值出分 — raw values to quantile scores.
    read_community_data = value2score(read_community_data)
    # read_community_data.to_excel('训练数据_%s.xlsx' % city)

    # TODO: 训练模型 — composite score model.
    read_community_data = train(read_community_data)

    # TODO: 出标签 — descriptive labels.
    start_time = time.time()
    read_community_data = get_label(read_community_data)
    print('出标签时间:', time.time() - start_time)

    # TODO: 小区评级 — letter grades.
    result = get_level(read_community_data)

    # TODO: 风险策略 — risk strategy lookup.
    result = get_risk(result)

    # TODO: 添加优质小区 — high-quality flag.
    result = high_quality_community(result)

    result.to_excel('小区画像_%s.xlsx' % city, index=False)
    result_cols = [
        '省份', '城市', '城市代号', '行政区', '行政区代号', '楼盘名称', '楼盘ID', '地址',
        '小区均价', '小区价格评级', '综合评级', '综合评分', '风险策略', '展示标签', '楼盘品质',
        '宜居程度', '区位状况', '生活配套', '活跃程度', '不利因素', '优质小区']

    result = result[result_cols]
    result.to_excel('小区评级结果表_%s.xlsx' % city, index=False)
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 219,427评论 6 508
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 93,551评论 3 395
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 165,747评论 0 356
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,939评论 1 295
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,955评论 6 392
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,737评论 1 305
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 40,448评论 3 420
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 39,352评论 0 276
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,834评论 1 317
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,992评论 3 338
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 40,133评论 1 351
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,815评论 5 346
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 41,477评论 3 331
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 32,022评论 0 22
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 33,147评论 1 272
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 48,398评论 3 373
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 45,077评论 2 355

推荐阅读更多精彩内容