实战》第7章"/>
《Python数据分析与挖掘实战》第7章
本文是基于《Python数据分析与挖掘实战》的第七章的数据——《航空公司客户价值分析》做的分析对部分代码,做出补充,对原文中的雷达图进行了实现。
1. 背景与目标分析
此项目旨在根据航空公司提供的数据,对其客户进行分类,并且比较不同类别客户的价值,为能够更好的为客户提供个性化服务做参考。
2. 整体流程如下
import pandas as pd
import numpy as np
data = pd.read_csv('air_data.csv', encoding='utf8')
data.head()
MEMBER_NO | FFP_DATE | FIRST_FLIGHT_DATE | GENDER | FFP_TIER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | AGE | LOAD_TIME | ... | ADD_Point_SUM | Eli_Add_Point_Sum | L1Y_ELi_Add_Points | Points_Sum | L1Y_Points_Sum | Ration_L1Y_Flight_Count | Ration_P1Y_Flight_Count | Ration_P1Y_BPS | Ration_L1Y_BPS | Point_NotFlight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 54993 | 2006/11/02 | 2008/12/24 | 男 | 6 | . | 北京 | CN | 31.0 | 2014/03/31 | ... | 39992 | 114452 | 111100 | 619760 | 370211 | 0.509524 | 0.490476 | 0.487221 | 0.512777 | 50 |
1 | 28065 | 2007/02/19 | 2007/08/03 | 男 | 6 | NaN | 北京 | CN | 42.0 | 2014/03/31 | ... | 12000 | 53288 | 53288 | 415768 | 238410 | 0.514286 | 0.485714 | 0.489289 | 0.510708 | 33 |
2 | 55106 | 2007/02/01 | 2007/08/30 | 男 | 6 | . | 北京 | CN | 40.0 | 2014/03/31 | ... | 15491 | 55202 | 51711 | 406361 | 233798 | 0.518519 | 0.481481 | 0.481467 | 0.518530 | 26 |
3 | 21189 | 2008/08/22 | 2008/08/23 | 男 | 5 | Los Angeles | CA | US | 64.0 | 2014/03/31 | ... | 0 | 34890 | 34890 | 372204 | 186100 | 0.434783 | 0.565217 | 0.551722 | 0.448275 | 12 |
4 | 39546 | 2009/04/10 | 2009/04/15 | 男 | 6 | 贵阳 | 贵州 | CN | 48.0 | 2014/03/31 | ... | 22704 | 64969 | 64969 | 338813 | 210365 | 0.532895 | 0.467105 | 0.469054 | 0.530943 | 39 |
5 rows × 44 columns
import chardetf = open('air_data.csv', 'rb')
data_1 = f.read()
print(chardet.detect(data_1))
{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
print(type(data))
# print(data.head())
data.isnull().sum()
<class 'pandas.core.frame.DataFrame'>MEMBER_NO 0
FFP_DATE 0
FIRST_FLIGHT_DATE 0
GENDER 3
FFP_TIER 0
WORK_CITY 2269
WORK_PROVINCE 3248
WORK_COUNTRY 26
AGE 420
LOAD_TIME 0
FLIGHT_COUNT 0
BP_SUM 0
EP_SUM_YR_1 0
EP_SUM_YR_2 0
SUM_YR_1 551
SUM_YR_2 138
SEG_KM_SUM 0
WEIGHTED_SEG_KM 0
LAST_FLIGHT_DATE 0
AVG_FLIGHT_COUNT 0
AVG_BP_SUM 0
BEGIN_TO_FIRST 0
LAST_TO_END 0
AVG_INTERVAL 0
MAX_INTERVAL 0
ADD_POINTS_SUM_YR_1 0
ADD_POINTS_SUM_YR_2 0
EXCHANGE_COUNT 0
avg_discount 0
P1Y_Flight_Count 0
L1Y_Flight_Count 0
P1Y_BP_SUM 0
L1Y_BP_SUM 0
EP_SUM 0
ADD_Point_SUM 0
Eli_Add_Point_Sum 0
L1Y_ELi_Add_Points 0
Points_Sum 0
L1Y_Points_Sum 0
Ration_L1Y_Flight_Count 0
Ration_P1Y_Flight_Count 0
Ration_P1Y_BPS 0
Ration_L1Y_BPS 0
Point_NotFlight 0
dtype: int64
data.describe()
MEMBER_NO | FFP_TIER | AGE | FLIGHT_COUNT | BP_SUM | EP_SUM_YR_1 | EP_SUM_YR_2 | SUM_YR_1 | SUM_YR_2 | SEG_KM_SUM | ... | ADD_Point_SUM | Eli_Add_Point_Sum | L1Y_ELi_Add_Points | Points_Sum | L1Y_Points_Sum | Ration_L1Y_Flight_Count | Ration_P1Y_Flight_Count | Ration_P1Y_BPS | Ration_L1Y_BPS | Point_NotFlight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 62988.000000 | 62988.000000 | 62568.000000 | 62988.000000 | 62988.000000 | 62988.0 | 62988.000000 | 62437.000000 | 62850.000000 | 62988.000000 | ... | 62988.000000 | 62988.000000 | 62988.000000 | 62988.0000 | 62988.000000 | 62988.000000 | 62988.000000 | 62988.000000 | 62988.000000 | 62988.000000 |
mean | 31494.500000 | 4.102162 | 42.476346 | 11.839414 | 10925.081254 | 0.0 | 265.689623 | 5355.376064 | 5604.026014 | 17123.878691 | ... | 1355.006223 | 1620.695847 | 1080.378882 | 12545.7771 | 6638.739585 | 0.486419 | 0.513581 | 0.522293 | 0.468422 | 2.728155 |
std | 18183.213715 | 0.373856 | 9.885915 | 14.049471 | 16339.486151 | 0.0 | 1645.702854 | 8109.450147 | 8703.364247 | 20960.844623 | ... | 7868.477000 | 8294.398955 | 5639.857254 | 20507.8167 | 12601.819863 | 0.319105 | 0.319105 | 0.339632 | 0.338956 | 7.364164 |
min | 1.000000 | 4.000000 | 6.000000 | 2.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 368.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 15747.750000 | 4.000000 | 35.000000 | 3.000000 | 2518.000000 | 0.0 | 0.000000 | 1003.000000 | 780.000000 | 4747.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 2775.0000 | 700.000000 | 0.250000 | 0.288889 | 0.258150 | 0.167954 | 0.000000 |
50% | 31494.500000 | 4.000000 | 41.000000 | 7.000000 | 5700.000000 | 0.0 | 0.000000 | 2800.000000 | 2773.000000 | 9994.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 6328.5000 | 2860.500000 | 0.500000 | 0.500000 | 0.514252 | 0.476747 | 0.000000 |
75% | 47241.250000 | 4.000000 | 48.000000 | 15.000000 | 12831.000000 | 0.0 | 0.000000 | 6574.000000 | 6845.750000 | 21271.250000 | ... | 0.000000 | 345.000000 | 0.000000 | 14302.5000 | 7500.000000 | 0.711111 | 0.750000 | 0.815091 | 0.728375 | 1.000000 |
max | 62988.000000 | 6.000000 | 110.000000 | 213.000000 | 505308.000000 | 0.0 | 74460.000000 | 239560.000000 | 234188.000000 | 580717.000000 | ... | 984938.000000 | 984938.000000 | 728282.000000 | 985572.0000 | 728282.000000 | 1.000000 | 1.000000 | 0.999989 | 0.999993 | 140.000000 |
8 rows × 36 columns
data = data[data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()]
data_1 = data['SUM_YR_1'] != 0
data_2 = data['SUM_YR_2'] != 0
data_3 = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)
data = data[data_1 | data_2 | data_3] #该规则是“或”
data.head()
data.describe()
MEMBER_NO | FFP_TIER | AGE | FLIGHT_COUNT | BP_SUM | EP_SUM_YR_1 | EP_SUM_YR_2 | SUM_YR_1 | SUM_YR_2 | SEG_KM_SUM | ... | ADD_Point_SUM | Eli_Add_Point_Sum | L1Y_ELi_Add_Points | Points_Sum | L1Y_Points_Sum | Ration_L1Y_Flight_Count | Ration_P1Y_Flight_Count | Ration_P1Y_BPS | Ration_L1Y_BPS | Point_NotFlight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 62044.000000 | 62044.000000 | 61632.000000 | 62044.000000 | 62044.000000 | 62044.0 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | ... | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 | 62044.000000 |
mean | 31485.237928 | 4.103652 | 42.504300 | 11.971359 | 11057.772468 | 0.0 | 269.732093 | 5389.298164 | 5676.826688 | 17321.694749 | ... | 1367.336729 | 1637.068822 | 1092.360228 | 12694.841290 | 6726.731416 | 0.489666 | 0.510334 | 0.519388 | 0.471873 | 2.754191 |
std | 18188.650537 | 0.376322 | 9.885877 | 14.110619 | 16424.944888 | 0.0 | 1657.846655 | 8123.849287 | 8736.092628 | 21052.728111 | ... | 7906.967903 | 8336.741957 | 5671.520660 | 20617.694168 | 12671.910738 | 0.317091 | 0.317091 | 0.337879 | 0.337318 | 7.399359 |
min | 1.000000 | 4.000000 | 6.000000 | 2.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 368.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 15715.750000 | 4.000000 | 35.000000 | 3.000000 | 2599.000000 | 0.0 | 0.000000 | 1024.000000 | 856.000000 | 4874.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 2856.000000 | 763.000000 | 0.250000 | 0.285714 | 0.256762 | 0.179310 | 0.000000 |
50% | 31476.500000 | 4.000000 | 41.000000 | 7.000000 | 5816.000000 | 0.0 | 0.000000 | 2832.000000 | 2838.000000 | 10200.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 6457.000000 | 2929.500000 | 0.500000 | 0.500000 | 0.511677 | 0.480269 | 0.000000 |
75% | 47247.250000 | 4.000000 | 48.000000 | 15.000000 | 13002.250000 | 0.0 | 0.000000 | 6617.000000 | 6928.000000 | 21522.500000 | ... | 0.000000 | 381.000000 | 0.000000 | 14478.250000 | 7609.000000 | 0.714286 | 0.750000 | 0.804233 | 0.730684 | 1.000000 |
max | 62988.000000 | 6.000000 | 110.000000 | 213.000000 | 505308.000000 | 0.0 | 74460.000000 | 239560.000000 | 234188.000000 | 580717.000000 | ... | 984938.000000 | 984938.000000 | 728282.000000 | 985572.000000 | 728282.000000 | 1.000000 | 1.000000 | 0.999989 | 0.999993 | 140.000000 |
8 rows × 36 columns
data.to_csv('air_data_cleaned.csv') # 导出结果
data1 = pd.DataFrame(columns=['R', 'L', 'F', 'M', 'C'])
# b['time_interval']=pd.to_datetime(b['xxx'])-pd.to_datetime(b['xxx'])
data1['L'] = pd.to_datetime(data.LOAD_TIME)-pd.to_datetime(data.FFP_DATE)
data1['R'] = data.LAST_TO_END
data1['M'] = data.SEG_KM_SUM
data1['C'] = data.avg_discount
data1['F'] = data.FLIGHT_COUNT
data1.info()
data1['L'] = data1['L']/np.timedelta64(1, 'D') # 将日期转化为数值,单位为天
<class 'pandas.core.frame.DataFrame'>
Int64Index: 62044 entries, 0 to 62978
Data columns (total 5 columns):
R 62044 non-null int64
L 62044 non-null timedelta64[ns]
F 62044 non-null int64
M 62044 non-null int64
C 62044 non-null float64
dtypes: float64(1), int64(3), timedelta64[ns](1)
memory usage: 2.8 MB
data2 = (data1-data1.mean(axis=0))/(data1.std(axis=0)) #标准差标准化
from sklearn.cluster import KMeans
k=5
kmodel = KMeans(n_clusters = k, n_jobs=4)
kmodel.fit(data2)print(kmodel.cluster_centers_)
print(kmodel.labels_)
[[-0.37722119 1.16066672 -0.08691852 -0.09484404 -0.1559046 ][ 1.68625847 -0.31367829 -0.57401599 -0.53682019 -0.1733261 ][-0.41488827 -0.70020646 -0.16114258 -0.16095751 -0.25513154][-0.00266813 0.05184279 -0.22680311 -0.23125407 2.19134701][-0.79938326 0.48332845 2.4832016 2.42472391 0.30863003]]
[4 4 4 ... 2 1 1]
labels = kmodel.labels_
df1 = pd.DataFrame(labels, columns = ['numbers'])
df2 = pd.DataFrame(kmodel.cluster_centers_, columns=data1.columns)
df3 = df1['numbers'].value_counts()
df4 = pd.concat([df3, df2], axis=1)
# df4
df4
numbers | R | L | F | M | C | |
---|---|---|---|---|---|---|
0 | 15740 | -0.377221 | 1.160667 | -0.086919 | -0.094844 | -0.155905 |
1 | 12125 | 1.686258 | -0.313678 | -0.574016 | -0.536820 | -0.173326 |
2 | 24659 | -0.414888 | -0.700206 | -0.161143 | -0.160958 | -0.255132 |
3 | 4184 | -0.002668 | 0.051843 | -0.226803 | -0.231254 | 2.191347 |
4 | 5336 | -0.799383 | 0.483328 | 2.483202 | 2.424724 | 0.308630 |
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager
my_font = font_manager.FontProperties(fname="C:\Windows\Fonts\simhei.ttf")def plot_radar(data):'''the first column is the number of each cluster;the last are those to describe the center of each cluster.'''kinds = list(df4.index)kinds1=['客户群1', '客户群2', '客户群3', '客户群4', '客户群5']labels = data.iloc[:, 1:].columnssam = ['r-.', 'o-.', 'g--', 'b-', 'p:'] # 样式centers = pd.concat([data.iloc[:, 1:], data.iloc[:,1]], axis=1) # 形成闭环centers = np.array(centers)n = len(labels)angles = np.linspace(0, 2*np.pi, n, endpoint=False)angles = np.concatenate((angles, [angles[0]])) fig = plt.figure(figsize=(7,7),dpi=130)ax = fig.add_subplot(111, polar=True) # 设置坐标为极坐标# 画若干个五边形floor = np.floor(centers.min()) # 大于最小值的最大整数ceil = np.ceil(centers.max()) # 小于最大值的最小整数for i in np.arange(floor, ceil + 0.5, 0.5):ax.plot(angles, [i] * (n + 1), '--', lw=0.5 , color='black')# 画不同客户群的分割线for i in range(n):ax.plot([angles[i], angles[i]], [floor, ceil], '--', lw=0.5, color='black')# 画不同的客户群所占的大小for i in range(len(kinds)):ax.plot(angles, centers[i], sam[i], lw=2, label=kinds1[i])ax.fill(angles, centers[i], alpha=0.25) # ax.set_thetagrids(angles * 180 / np.pi, labels) # 设置显示的角度,将弧度转换为角度plt.title('客户群特征分布图', font_properties=my_font) # 添加标题plt.legend(prop=my_font, loc='lower right', bbox_to_anchor=(1.5, 0.0)) # 设置图例的位置,在画布外ax.set_theta_zero_location('N') # 设置极坐标的起点(即0°)在正北方向,即相当于坐标轴逆时针旋转90°ax.spines['polar'].set_visible(False) # 不显示极坐标最外圈的圆ax.set_ylim(floor,ceil) # 设置雷达图的范围 不设置中心就变成了五星为中心ax.grid(False) # 不显示默认的分割线ax.set_yticks( [i for i in np.arange(floor, ceil, 0.5)]) # 显示坐标间隔plt.show()
plot_radar(df4)
更多推荐
《Python数据分析与挖掘实战》第7章
发布评论