导入库
import time # 时间库
import numpy as np # numpy库
import pandas as pd # pandas库
import pymysql # mysql连接库
from sklearn.ensemble import RandomForestClassifier # RF库
# from pyecharts import Bar3D # 老版本代码,3D柱形图
from pyecharts.charts import Bar3D # 新版本代码,3D柱形图
from pyecharts import options as opts # 新版本代码,先导入配置方法库
读取数据
# Sheets to load from the workbook: four year-named sheets plus the '会员等级' sheet.
sheet_names = ['2015', '2016', '2017', '2018', '会员等级']
# One DataFrame per sheet, in the same order as sheet_names.
sheet_datas = [pd.read_excel('sales.xlsx', sheet_name=name) for name in sheet_names]
数组数据查看
# Print a quick profile of every loaded sheet.
for each_name, each_data in zip(sheet_names, sheet_datas):
    print('[data summary for {0:=^50}]'.format(each_name))
    print('Overview:', '\n', each_data.head(4))  # first 4 rows
    print('DESC:', '\n', each_data.describe())  # descriptive statistics
    print('NA records', each_data.isnull().any(axis=1).sum())  # rows with at least one missing value
    print('Dtypes', each_data.dtypes)  # column dtypes
数据预处理
# Remove missing values and outliers from every sheet except the last one.
for ind, each_data in enumerate(sheet_datas[:-1]):
    # Chain the two filters. Previously the amount filter was applied to the
    # original `each_data` again, which overwrote (and so discarded) the
    # dropna() result and let rows with missing values through.
    cleaned = each_data.dropna()
    # Keep only orders with amount > 1; copy so the column added below is
    # written to an independent frame rather than a slice of the source sheet.
    cleaned = cleaned[cleaned['订单金额'] > 1].copy()
    # Latest submit date of the sheet, taken from the unfiltered sheet as before.
    cleaned['max_year_date'] = each_data['提交日期'].max()
    sheet_datas[ind] = cleaned

# Stack all cleaned sheets (everything but the last one) into a single frame.
data_merge = pd.concat(sheet_datas[:-1], axis=0)

# Whole days between each order and the latest submit date of its sheet.
data_merge['date_interval'] = (data_merge['max_year_date'] - data_merge['提交日期']).dt.days
# Calendar year of each order, used as a grouping key below.
data_merge['year'] = data_merge['提交日期'].dt.year

# Aggregate per (year, member ID).
rfm_gb = data_merge.groupby(['year', '会员ID'], as_index=False).agg({
    'date_interval': 'min',  # recency: smallest gap to the sheet's latest date
    '提交日期': 'count',  # frequency: number of orders
    '订单金额': 'sum',  # monetary: total order amount
})
# Rename the aggregated columns to their short RFM names.
rfm_gb.columns = ['year', '会员ID', 'r', 'f', 'm']
rfm_gb.head()
数据分块
# Summarise the distribution of every column from the third one onward,
# transposed so that each column becomes one row of statistics.
desc_pd = rfm_gb.iloc[:, 2:].describe().transpose()
print(desc_pd)

# Bin edges used later to turn r, f and m into scores.
# Note: the first edge must be below the minimum value.
r_bins = [-1, 79, 255, 365]
f_bins = [0, 2, 5, 130]
m_bins = [0, 69, 1199, 206252]
计算权重
# Attach the last sheet's columns (including '会员等级') to the RFM rows,
# keeping only member IDs present in both frames.
rfm_merge = pd.merge(rfm_gb, sheet_datas[-1], on='会员ID', how='inner')

# Fit a random forest that predicts '会员等级' from r/f/m and use its feature
# importances as the RFM weights. The seed is fixed so that the weights, and
# every rfm_score derived from them, are reproducible across runs.
clf = RandomForestClassifier(random_state=0)
clf.fit(rfm_merge[['r', 'f', 'm']], rfm_merge['会员等级'])
weights = clf.feature_importances_
print('feature importance:', weights)
RFM计算过程
# Bin each RFM measure into an ordinal score.
# R labels run in descending order, so a smaller date interval scores higher;
# F and M labels run ascending from 1.
rfm_gb['r_score'] = pd.cut(rfm_gb['r'], r_bins, labels=list(range(len(r_bins) - 1, 0, -1)))
rfm_gb['f_score'] = pd.cut(rfm_gb['f'], f_bins, labels=list(range(1, len(f_bins))))
rfm_gb['m_score'] = pd.cut(rfm_gb['m'], m_bins, labels=list(range(1, len(m_bins))))

score_cols = ['r_score', 'f_score', 'm_score']

# Method 1: weighted score.
# Convert only the categorical score columns to integers. Casting the whole
# frame to int32 would also truncate the monetary total and could wrap member
# IDs that do not fit in 32 bits.
# NOTE(review): a value outside its bin edges yields a missing score and makes
# this cast raise - confirm the edges cover the data.
for col in score_cols:
    rfm_gb[col] = rfm_gb[col].astype(np.int32)
rfm_gb['rfm_score'] = (
    rfm_gb['r_score'] * weights[0]
    + rfm_gb['f_score'] * weights[1]
    + rfm_gb['m_score'] * weights[2]
)

# Method 2: RFM combination - concatenate the three scores into one label.
# The builtin `str` replaces the `np.str` alias, which NumPy has removed.
for col in score_cols:
    rfm_gb[col] = rfm_gb[col].astype(str)
rfm_gb['rfm_group'] = rfm_gb['r_score'].str.cat(rfm_gb['f_score']).str.cat(rfm_gb['m_score'])
保存数据
rfm_gb.to_excel('sales_rfm_score.xlsx')  # save the scored table as an Excel workbook
图形展现
# Aggregate the data for the chart: number of members per (rfm_group, year).
display_data = rfm_gb.groupby(['rfm_group', 'year'], as_index=False)['会员ID'].count()
display_data.columns = ['rfm_group', 'year', 'number']
display_data['rfm_group'] = display_data['rfm_group'].astype(np.int32)
display_data.head()

# New-version pyecharts API.
bar3d = Bar3D(init_opts=opts.InitOpts(width="900px", height="600px"))
range_color = [
    '#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
    '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026',
]
# Rows of [rfm_group, year, number] as plain Python values.
data = display_data.values.tolist()
# Upper bound of the colour scale, taken from the native rows so that a plain
# Python int (not a NumPy scalar) is handed to the chart options; 0 when there
# are no rows.
max_number = max((row[2] for row in data), default=0)
bar3d.add(
    series_name="rfm分组结果",
    data=data,
    xaxis3d_opts=opts.Axis3DOpts(type_="category"),
    yaxis3d_opts=opts.Axis3DOpts(type_="category"),
    zaxis3d_opts=opts.Axis3DOpts(type_="value"),
)
bar3d.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(
        max_=max_number,
        range_color=range_color,
    )
)
bar3d.render_notebook()