《Python数据分析与挖掘实战》第7章

编程入门行业动态更新时间:2024-10-27 16:34:20

《Python数据分析与挖掘<a href=https://www.elefans.com/category/jswz/34/1769775.html style= 实战》第7章"/>

《Python数据分析与挖掘实战》第7章

本文是基于《Python数据分析与挖掘实战》的第七章的数据——《航空公司客户价值分析》做的分析对部分代码，做出补充，对原文中的雷达图进行了实现。

1. 背景与目标分析

此项目旨在根据航空公司提供的数据，对其客户进行分类，并且比较不同类别客户的价值，为能够更好的为客户提供个性化服务做参考。

2. 整体流程如下

import pandas as pd
import numpy as np

data = pd.read_csv('air_data.csv', encoding='utf8')

data.head()

	MEMBER_NO	FFP_DATE	FIRST_FLIGHT_DATE	GENDER	FFP_TIER	WORK_CITY	WORK_PROVINCE	WORK_COUNTRY	AGE	LOAD_TIME	...	ADD_Point_SUM	Eli_Add_Point_Sum	L1Y_ELi_Add_Points	Points_Sum	L1Y_Points_Sum	Ration_L1Y_Flight_Count	Ration_P1Y_Flight_Count	Ration_P1Y_BPS	Ration_L1Y_BPS	Point_NotFlight
0	54993	2006/11/02	2008/12/24	男	6	.	北京	CN	31.0	2014/03/31	...	39992	114452	111100	619760	370211	0.509524	0.490476	0.487221	0.512777	50
1	28065	2007/02/19	2007/08/03	男	6	NaN	北京	CN	42.0	2014/03/31	...	12000	53288	53288	415768	238410	0.514286	0.485714	0.489289	0.510708	33
2	55106	2007/02/01	2007/08/30	男	6	.	北京	CN	40.0	2014/03/31	...	15491	55202	51711	406361	233798	0.518519	0.481481	0.481467	0.518530	26
3	21189	2008/08/22	2008/08/23	男	5	Los Angeles	CA	US	64.0	2014/03/31	...	0	34890	34890	372204	186100	0.434783	0.565217	0.551722	0.448275	12
4	39546	2009/04/10	2009/04/15	男	6	贵阳	贵州	CN	48.0	2014/03/31	...	22704	64969	64969	338813	210365	0.532895	0.467105	0.469054	0.530943	39

5 rows × 44 columns

import chardetf = open('air_data.csv', 'rb')
data_1 = f.read()
print(chardet.detect(data_1))

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

print(type(data))
# print(data.head())
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>MEMBER_NO                     0
FFP_DATE                      0
FIRST_FLIGHT_DATE             0
GENDER                        3
FFP_TIER                      0
WORK_CITY                  2269
WORK_PROVINCE              3248
WORK_COUNTRY                 26
AGE                         420
LOAD_TIME                     0
FLIGHT_COUNT                  0
BP_SUM                        0
EP_SUM_YR_1                   0
EP_SUM_YR_2                   0
SUM_YR_1                    551
SUM_YR_2                    138
SEG_KM_SUM                    0
WEIGHTED_SEG_KM               0
LAST_FLIGHT_DATE              0
AVG_FLIGHT_COUNT              0
AVG_BP_SUM                    0
BEGIN_TO_FIRST                0
LAST_TO_END                   0
AVG_INTERVAL                  0
MAX_INTERVAL                  0
ADD_POINTS_SUM_YR_1           0
ADD_POINTS_SUM_YR_2           0
EXCHANGE_COUNT                0
avg_discount                  0
P1Y_Flight_Count              0
L1Y_Flight_Count              0
P1Y_BP_SUM                    0
L1Y_BP_SUM                    0
EP_SUM                        0
ADD_Point_SUM                 0
Eli_Add_Point_Sum             0
L1Y_ELi_Add_Points            0
Points_Sum                    0
L1Y_Points_Sum                0
Ration_L1Y_Flight_Count       0
Ration_P1Y_Flight_Count       0
Ration_P1Y_BPS                0
Ration_L1Y_BPS                0
Point_NotFlight               0
dtype: int64

data.describe()

	MEMBER_NO	FFP_TIER	AGE	FLIGHT_COUNT	BP_SUM	EP_SUM_YR_1	EP_SUM_YR_2	SUM_YR_1	SUM_YR_2	SEG_KM_SUM	...	ADD_Point_SUM	Eli_Add_Point_Sum	L1Y_ELi_Add_Points	Points_Sum	L1Y_Points_Sum	Ration_L1Y_Flight_Count	Ration_P1Y_Flight_Count	Ration_P1Y_BPS	Ration_L1Y_BPS	Point_NotFlight
count	62988.000000	62988.000000	62568.000000	62988.000000	62988.000000	62988.0	62988.000000	62437.000000	62850.000000	62988.000000	...	62988.000000	62988.000000	62988.000000	62988.0000	62988.000000	62988.000000	62988.000000	62988.000000	62988.000000	62988.000000
mean	31494.500000	4.102162	42.476346	11.839414	10925.081254	0.0	265.689623	5355.376064	5604.026014	17123.878691	...	1355.006223	1620.695847	1080.378882	12545.7771	6638.739585	0.486419	0.513581	0.522293	0.468422	2.728155
std	18183.213715	0.373856	9.885915	14.049471	16339.486151	0.0	1645.702854	8109.450147	8703.364247	20960.844623	...	7868.477000	8294.398955	5639.857254	20507.8167	12601.819863	0.319105	0.319105	0.339632	0.338956	7.364164
min	1.000000	4.000000	6.000000	2.000000	0.000000	0.0	0.000000	0.000000	0.000000	368.000000	...	0.000000	0.000000	0.000000	0.0000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	15747.750000	4.000000	35.000000	3.000000	2518.000000	0.0	0.000000	1003.000000	780.000000	4747.000000	...	0.000000	0.000000	0.000000	2775.0000	700.000000	0.250000	0.288889	0.258150	0.167954	0.000000
50%	31494.500000	4.000000	41.000000	7.000000	5700.000000	0.0	0.000000	2800.000000	2773.000000	9994.000000	...	0.000000	0.000000	0.000000	6328.5000	2860.500000	0.500000	0.500000	0.514252	0.476747	0.000000
75%	47241.250000	4.000000	48.000000	15.000000	12831.000000	0.0	0.000000	6574.000000	6845.750000	21271.250000	...	0.000000	345.000000	0.000000	14302.5000	7500.000000	0.711111	0.750000	0.815091	0.728375	1.000000
max	62988.000000	6.000000	110.000000	213.000000	505308.000000	0.0	74460.000000	239560.000000	234188.000000	580717.000000	...	984938.000000	984938.000000	728282.000000	985572.0000	728282.000000	1.000000	1.000000	0.999989	0.999993	140.000000

8 rows × 36 columns

data = data[data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()]
data_1 = data['SUM_YR_1'] != 0
data_2 = data['SUM_YR_2'] != 0
data_3 = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)
data = data[data_1 | data_2 | data_3] #该规则是“或”

data.head()
data.describe()

	MEMBER_NO	FFP_TIER	AGE	FLIGHT_COUNT	BP_SUM	EP_SUM_YR_1	EP_SUM_YR_2	SUM_YR_1	SUM_YR_2	SEG_KM_SUM	...	ADD_Point_SUM	Eli_Add_Point_Sum	L1Y_ELi_Add_Points	Points_Sum	L1Y_Points_Sum	Ration_L1Y_Flight_Count	Ration_P1Y_Flight_Count	Ration_P1Y_BPS	Ration_L1Y_BPS	Point_NotFlight
count	62044.000000	62044.000000	61632.000000	62044.000000	62044.000000	62044.0	62044.000000	62044.000000	62044.000000	62044.000000	...	62044.000000	62044.000000	62044.000000	62044.000000	62044.000000	62044.000000	62044.000000	62044.000000	62044.000000	62044.000000
mean	31485.237928	4.103652	42.504300	11.971359	11057.772468	0.0	269.732093	5389.298164	5676.826688	17321.694749	...	1367.336729	1637.068822	1092.360228	12694.841290	6726.731416	0.489666	0.510334	0.519388	0.471873	2.754191
std	18188.650537	0.376322	9.885877	14.110619	16424.944888	0.0	1657.846655	8123.849287	8736.092628	21052.728111	...	7906.967903	8336.741957	5671.520660	20617.694168	12671.910738	0.317091	0.317091	0.337879	0.337318	7.399359
min	1.000000	4.000000	6.000000	2.000000	0.000000	0.0	0.000000	0.000000	0.000000	368.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	15715.750000	4.000000	35.000000	3.000000	2599.000000	0.0	0.000000	1024.000000	856.000000	4874.000000	...	0.000000	0.000000	0.000000	2856.000000	763.000000	0.250000	0.285714	0.256762	0.179310	0.000000
50%	31476.500000	4.000000	41.000000	7.000000	5816.000000	0.0	0.000000	2832.000000	2838.000000	10200.000000	...	0.000000	0.000000	0.000000	6457.000000	2929.500000	0.500000	0.500000	0.511677	0.480269	0.000000
75%	47247.250000	4.000000	48.000000	15.000000	13002.250000	0.0	0.000000	6617.000000	6928.000000	21522.500000	...	0.000000	381.000000	0.000000	14478.250000	7609.000000	0.714286	0.750000	0.804233	0.730684	1.000000
max	62988.000000	6.000000	110.000000	213.000000	505308.000000	0.0	74460.000000	239560.000000	234188.000000	580717.000000	...	984938.000000	984938.000000	728282.000000	985572.000000	728282.000000	1.000000	1.000000	0.999989	0.999993	140.000000

8 rows × 36 columns

data.to_csv('air_data_cleaned.csv') # 导出结果

data1 = pd.DataFrame(columns=['R', 'L', 'F', 'M', 'C'])
# b['time_interval']=pd.to_datetime(b['xxx'])-pd.to_datetime(b['xxx'])
data1['L'] = pd.to_datetime(data.LOAD_TIME)-pd.to_datetime(data.FFP_DATE)
data1['R'] = data.LAST_TO_END
data1['M'] = data.SEG_KM_SUM
data1['C'] = data.avg_discount
data1['F'] = data.FLIGHT_COUNT

data1.info()
data1['L'] = data1['L']/np.timedelta64(1, 'D') # 将日期转化为数值，单位为天

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62044 entries, 0 to 62978
Data columns (total 5 columns):
R    62044 non-null int64
L    62044 non-null timedelta64[ns]
F    62044 non-null int64
M    62044 non-null int64
C    62044 non-null float64
dtypes: float64(1), int64(3), timedelta64[ns](1)
memory usage: 2.8 MB

data2 = (data1-data1.mean(axis=0))/(data1.std(axis=0)) #标准差标准化

from sklearn.cluster import KMeans

k=5
kmodel = KMeans(n_clusters = k, n_jobs=4)
kmodel.fit(data2)print(kmodel.cluster_centers_)
print(kmodel.labels_)

[[-0.37722119  1.16066672 -0.08691852 -0.09484404 -0.1559046 ][ 1.68625847 -0.31367829 -0.57401599 -0.53682019 -0.1733261 ][-0.41488827 -0.70020646 -0.16114258 -0.16095751 -0.25513154][-0.00266813  0.05184279 -0.22680311 -0.23125407  2.19134701][-0.79938326  0.48332845  2.4832016   2.42472391  0.30863003]]
[4 4 4 ... 2 1 1]

labels = kmodel.labels_
df1 = pd.DataFrame(labels, columns = ['numbers'])
df2 = pd.DataFrame(kmodel.cluster_centers_, columns=data1.columns)
df3 = df1['numbers'].value_counts()
df4 = pd.concat([df3, df2], axis=1)
# df4

df4

	numbers	R	L	F	M	C
0	15740	-0.377221	1.160667	-0.086919	-0.094844	-0.155905
1	12125	1.686258	-0.313678	-0.574016	-0.536820	-0.173326
2	24659	-0.414888	-0.700206	-0.161143	-0.160958	-0.255132
3	4184	-0.002668	0.051843	-0.226803	-0.231254	2.191347
4	5336	-0.799383	0.483328	2.483202	2.424724	0.308630

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager
my_font = font_manager.FontProperties(fname="C:\Windows\Fonts\simhei.ttf")def plot_radar(data):'''the first column is the number of each cluster;the last are those to describe the center of each cluster.'''kinds = list(df4.index)kinds1=['客户群1', '客户群2', '客户群3', '客户群4', '客户群5']labels = data.iloc[:, 1:].columnssam = ['r-.', 'o-.', 'g--', 'b-', 'p:'] # 样式centers = pd.concat([data.iloc[:, 1:], data.iloc[:,1]], axis=1)  # 形成闭环centers = np.array(centers)n = len(labels)angles = np.linspace(0, 2*np.pi, n, endpoint=False)angles = np.concatenate((angles, [angles[0]])) fig = plt.figure(figsize=(7,7),dpi=130)ax = fig.add_subplot(111, polar=True) # 设置坐标为极坐标# 画若干个五边形floor = np.floor(centers.min())     # 大于最小值的最大整数ceil = np.ceil(centers.max())       # 小于最大值的最小整数for i in np.arange(floor, ceil + 0.5, 0.5):ax.plot(angles, [i] * (n + 1), '--', lw=0.5 , color='black')# 画不同客户群的分割线for i in range(n):ax.plot([angles[i], angles[i]], [floor, ceil], '--', lw=0.5, color='black')# 画不同的客户群所占的大小for i in range(len(kinds)):ax.plot(angles, centers[i], sam[i], lw=2, label=kinds1[i])ax.fill(angles, centers[i], alpha=0.25)  # ax.set_thetagrids(angles * 180 / np.pi, labels) # 设置显示的角度，将弧度转换为角度plt.title('客户群特征分布图', font_properties=my_font) # 添加标题plt.legend(prop=my_font, loc='lower right', bbox_to_anchor=(1.5, 0.0)) # 设置图例的位置，在画布外ax.set_theta_zero_location('N')        # 设置极坐标的起点（即0°）在正北方向，即相当于坐标轴逆时针旋转90°ax.spines['polar'].set_visible(False)  # 不显示极坐标最外圈的圆ax.set_ylim(floor,ceil) # 设置雷达图的范围 不设置中心就变成了五星为中心ax.grid(False) # 不显示默认的分割线ax.set_yticks( [i for i in np.arange(floor, ceil, 0.5)])                      # 显示坐标间隔plt.show()