数字化运营基础技能 – 客户特征的聚类分析实践案例

释放双眼,带上耳机,听听看~!

1
2
3
4
5
6
7
8
# 导入库
import pandas as pd # panda库
import numpy as np
import matplotlib.pyplot as plt  # 导入matplotlib库
from sklearn.preprocessing import MinMaxScaler # 标准化库
from sklearn.cluster import KMeans  # 导入sklearn聚类模块
from sklearn.metrics import silhouette_score   # 效果评估模块, 新版本中已经没有 calinski_harabaz_score 方法
import matplotlib.pyplot as plt # 图形库

# 读取数据

raw_data = pd.read_csv(‘cluster.txt’)  # 导入数据文件

numeric_features = raw_data.iloc[:,1:3] # 数值型特征


1
2
3
4
# 数据标准化
scaler = MinMaxScaler()
scaled_numeric_features = scaler.fit_transform(numeric_features)
print(scaled_numeric_features[:,:2])

1
2
3
4
# 训练聚类模型
n_clusters = 3  # 设置聚类数量
model_kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # 建立聚类模型对象
model_kmeans.fit(scaled_numeric_features)  # 训练聚类模型

1
2
3
4
5
6
7
8
9
10
11
12
13
# 模型效果指标评估
# 总样本量,总特征数
n_samples, n_features = raw_data.iloc[:,1:].shape
print('samples: %d \t features: %d' % (n_samples, n_features))

# 非监督式评估方法
silhouette_s = silhouette_score(scaled_numeric_features, model_kmeans.labels_, metric='euclidean')  # 平均轮廓系数
# calinski_harabaz_s = calinski_harabaz_score(scaled_numeric_features, model_kmeans.labels_)  # 老版本有,新版本没有该方法了
# unsupervised_data = {'silh':[silhouette_s],'c&h':[calinski_harabaz_s]} # 老版本方法
unsupervised_data = {'silh':[silhouette_s]} # 新版本方法
unsupervised_score = pd.DataFrame.from_dict(unsupervised_data)
print('\n','unsupervised score:','\n','-'*60)
print(unsupervised_score)

1
2
3
4
5
6
# 合并数据和特征
# 获得每个样本的聚类类别
kmeans_labels = pd.DataFrame(model_kmeans.labels_,columns=['labels'])
# 组合原始数据与标签
kmeans_data = pd.concat((raw_data,kmeans_labels),axis=1)
print(kmeans_data.head())

1
2
3
4
5
6
# 计算不同聚类类别的样本量和占比
label_count = kmeans_data.groupby(['labels'])['SEX'].count()  # 计算频数
label_count_rate = label_count/ kmeans_data.shape[0] # 计算占比
kmeans_record_count = pd.concat((label_count,label_count_rate),axis=1)
kmeans_record_count.columns=['record_count','record_rate']
print(kmeans_record_count.head())

1
2
3
4
5
6
# 计算不同聚类类别数值型特征 'AVG_ORDERS',
kmeans_numeric_features = kmeans_data.groupby(['labels'])['AVG_MONEY'].mean()

kmeans_avg_orders = kmeans_data.groupby(['labels'])['AVG_ORDERS'].mean()
print(kmeans_numeric_features)
print(kmeans_numeric_features.head())

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 计算不同聚类类别分类型特征
active_list = []
sex_gb_list = []
unique_labels = np.unique(model_kmeans.labels_)
for each_label in unique_labels:
    each_data = kmeans_data[kmeans_data['labels']==each_label]
    active_list.append(each_data.groupby(['IS_ACTIVE'])['USER_ID'].count()/each_data.shape[0])
    sex_gb_list.append(each_data.groupby(['SEX'])['USER_ID'].count()/each_data.shape[0])

kmeans_active_pd = pd.DataFrame(active_list)
kmeans_sex_gb_pd = pd.DataFrame(sex_gb_list)
kmeans_string_features = pd.concat((kmeans_active_pd,kmeans_sex_gb_pd),axis=1)
kmeans_string_features.index = unique_labels
print(kmeans_string_features.head())

1
2
features_all2 = pd.concat((kmeans_record_count,kmeans_numeric_features,kmeans_avg_orders,kmeans_string_features),axis=1)
print(features_all2.head())

1
2
3
# 合并所有类别的分析结果
features_all =  pd.concat((kmeans_record_count,kmeans_numeric_features,kmeans_avg_orders,kmeans_string_features),axis=1)
print(features_all.head())

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# 可视化图形展示
# part 1 全局配置
fig = plt.figure(figsize=(10, 7))
titles = ['RECORD_RATE','AVG_ORDERS','AVG_MONEY','IS_ACTIVE','SEX'] # 共用标题
line_index,col_index = 3,5 # 定义网格数
ax_ids = np.arange(1,16).reshape(line_index,col_index) # 生成子网格索引值
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
   
# part 2 画出三个类别的占比
pie_fracs = features_all['record_rate'].tolist()
for ind in range(len(pie_fracs)):
    ax = fig.add_subplot(line_index, col_index, ax_ids[:,0][ind])
    init_labels = ['','',''] # 初始化空label标签
    init_labels[ind] = 'cluster_{0}'.format(ind) # 设置标签
    init_colors = ['lightgray', 'lightgray', 'lightgray']
    init_colors[ind] = 'g' # 设置目标面积区别颜色
    ax.pie(x=pie_fracs, autopct='%3.0f %%',labels=init_labels,colors=init_colors)
    ax.set_aspect('equal') # 设置饼图为圆形
    if ind == 0:
        ax.set_title(titles[0])
   
# part 3  画出AVG_ORDERS均值
avg_orders_label = 'AVG_ORDERS'
avg_orders_fraces = features_all[avg_orders_label]

for ind, frace in enumerate(avg_orders_fraces):
    ax = fig.add_subplot(line_index, col_index, ax_ids[:,1][ind])
    ax.bar(x=unique_labels,height=[0,avg_orders_fraces[ind],0])# 画出柱形图
    ax.set_ylim((0, max(avg_orders_fraces)*1.2))
    ax.set_xticks([])
    ax.set_yticks([])
    if ind == 0:# 设置总标题
        ax.set_title(titles[1])
    # 设置每个柱形图的数值标签和x轴label
    ax.text(unique_labels[1],frace+0.4,s='{:.2f}'.format(frace),ha='center',va='top')
    ax.text(unique_labels[1],-0.4,s=avg_orders_label,ha='center',va='bottom')
       
# part 4  画出AVG_MONEY均值
avg_money_label = 'AVG_MONEY'
avg_money_fraces = features_all[avg_money_label]
for ind, frace in enumerate(avg_money_fraces):
    ax = fig.add_subplot(line_index, col_index, ax_ids[:,2][ind])
    ax.bar(x=unique_labels,height=[0,avg_money_fraces[ind],0])# 画出柱形图
    ax.set_ylim((0, max(avg_money_fraces)*1.2))
    ax.set_xticks([])
    ax.set_yticks([])
    if ind == 0:# 设置总标题
        ax.set_title(titles[2])
    # 设置每个柱形图的数值标签和x轴label
    ax.text(unique_labels[1],frace+4,s='{:.0f}'.format(frace),ha='center',va='top')
    ax.text(unique_labels[1],-4,s=avg_money_label,ha='center',va='bottom')
       
# part 5  画出是否活跃
axtivity_labels = ['不活跃','活跃']
x_ticket = [i for i in range(len(axtivity_labels))]
activity_data = features_all[axtivity_labels]
ylim_max = np.max(np.max(activity_data))
for ind,each_data in enumerate(activity_data.values):
    ax = fig.add_subplot(line_index, col_index, ax_ids[:,3][ind])
    ax.bar(x=x_ticket,height=each_data) # 画出柱形图
    ax.set_ylim((0, ylim_max*1.2))
    ax.set_xticks([])
    ax.set_yticks([])    
    if ind == 0:# 设置总标题
        ax.set_title(titles[3])
    # 设置每个柱形图的数值标签和x轴label
    activity_values = ['{:.1%}'.format(i) for i in each_data]
    for i in range(len(x_ticket)):
        ax.text(x_ticket[i],each_data[i]+0.05,s=activity_values[i],ha='center',va='top')
        ax.text(x_ticket[i],-0.05,s=axtivity_labels[i],ha='center',va='bottom')
       
# part 6  画出性别分布
sex_data = features_all.iloc[:,-3:]
x_ticket = [i for i in range(len(sex_data))]
sex_labels = ['SEX_{}'.format(i) for i in range(3)]
ylim_max = np.max(np.max(sex_data))
for ind,each_data in enumerate(sex_data.values):
    ax = fig.add_subplot(line_index, col_index, ax_ids[:,4][ind])
    ax.bar(x=x_ticket,height=each_data) # 画柱形图
    ax.set_ylim((0, ylim_max*1.2))
    ax.set_xticks([])
    ax.set_yticks([])
    if ind == 0: # 设置标题
       ax.set_title(titles[4])    
    # 设置每个柱形图的数值标签和x轴label
    sex_values = ['{:.1%}'.format(i) for i in each_data]
    for i in range(len(x_ticket)):
        ax.text(x_ticket[i],each_data[i]+0.1,s=sex_values[i],ha='center',va='top')
        ax.text(x_ticket[i],-0.1,s=sex_labels[i],ha='center',va='bottom')
   
plt.tight_layout(pad=0.8) #设置默认的间距

数字化运营基础技能 – 客户特征的聚类分析实践案例

给TA打赏
共{{data.count}}人
人已打赏
安全运维

安全运维之道:发现、解决问题的有效闭环

2024-4-14 20:59:36

安全运维

稳定性建设 – 架构优化的关键策略

2025-2-11 17:15:56

个人中心
购物车
优惠劵
今日签到
有新私信 私信列表
搜索