Python基础算法库及可视化库使用实践

编程入门行业动态更新时间:2024-10-08 19:48:19

Python基础<a href="https://www.elefans.com/category/jswz/34/1770096.html">算法</a>库及可视化库使用实践

Python基础算法库及可视化库使用实践

1 Numpy详细使用

读取txt文件

  import numpyworld_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",")print(type(world_alcohol))world_alcohol = numpy.genfromtxt("world_alcohol.txt", delimiter=",", dtype="U75", skip_header=1)print(world_alcohol)[[u'1986' u'Western Pacific' u'Viet Nam' u'Wine' u'0'][u'1986' u'Americas' u'Uruguay' u'Other' u'0.5'][u'1985' u'Africa' u"Cte d'Ivoire" u'Wine' u'1.62']..., [u'1987' u'Africa' u'Malawi' u'Other' u'0.75'][u'1989' u'Americas' u'Bahamas' u'Wine' u'1.5'][u'1985' u'Africa' u'Malawi' u'Spirits' u'0.31']]

创建一维和二维的Array数组

  #The numpy.array() function can take a list or list of lists as input. When we input a list, we get a one-dimensional array as a result:#一维的Array数组[]vector = numpy.array([5, 10, 15, 20])#二维的Array数组[[],[],[]]matrix = numpy.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])print vectorprint matrix

shape用法

  #We can use the ndarray.shape property to figure out how many elements are in the arrayvector = numpy.array([1, 2, 3, 4])print(vector.shape)#For matrices, the shape property contains a tuple with 2 elements.matrix = numpy.array([[5, 10, 15], [20, 25, 30]])print(matrix.shape)(4,)(2, 3)

dtype用法（numpy要求numpy.array内部所有元素的类型相同）

  numbers = numpy.array([1, 2, 3, 4])
  numbers.dtype
  dtype('int32')
  #只要其中一个元素的类型不同，所有元素都会被转换为同一种类型
  numbers = numpy.array([1, 2, 3, '4'])
  print(numbers)
  numbers.dtype
  ['1' '2' '3' '4']
  dtype('<U11')

索引定位

  [[u'1986' u'Western Pacific' u'Viet Nam' u'Wine' u'0'][u'1986' u'Americas' u'Uruguay' u'Other' u'0.5'][u'1985' u'Africa' u"Cte d'Ivoire" u'Wine' u'1.62']..., [u'1987' u'Africa' u'Malawi' u'Other' u'0.75'][u'1989' u'Americas' u'Bahamas' u'Wine' u'1.5'][u'1985' u'Africa' u'Malawi' u'Spirits' u'0.31']]uruguay_other_1986 = world_alcohol[1,4]third_country = world_alcohol[2,2]print uruguay_other_1986print third_country0.5Cte d'Ivoire

索引切片

  vector = numpy.array([5, 10, 15, 20])print(vector[0:3])  [ 5 10 15]

取某一列（：表示所有行）

  matrix = numpy.array([[5, 10, 15], [20, 25, 30],[35, 40, 45]])print(matrix[:,1])[10 25 40]matrix = numpy.array([[5, 10, 15], [20, 25, 30],[35, 40, 45]])print(matrix[:,0:2])[[ 5 10][20 25][35 40]]matrix = numpy.array([[5, 10, 15], [20, 25, 30],[35, 40, 45]])print(matrix[1:3,0:2])[[20 25][35 40]]

版权声明：本套技术专栏是作者（秦凯新）平时工作的总结和升华，通过从真实商业环境抽取案例进行总结和分享，并给出商业应用的调优建议和集群环境容量规划等内容，请持续关注本套博客。QQ邮箱地址：1120746959@qq.com，如有任何学术交流，可随时联系。

对Array操作表示对内部所有元素进行操作

  import numpy#it will compare the second value to each element in the vector# If the values are equal, the Python interpreter returns True; otherwise, it returns Falsevector = numpy.array([5, 10, 15, 20])vector == 10array([False,  True, False, False], dtype=bool)matrix = numpy.array([[5, 10, 15], [20, 25, 30],[35, 40, 45]])matrix == 25array([[False, False, False],[False,  True, False],[False, False, False]], dtype=bool)

布尔值当索引（[False True False False]）

  vector = numpy.array([5, 10, 15, 20])equal_to_ten = (vector == 10)print equal_to_tenprint(vector[equal_to_ten])[False  True False False][10]#矩阵表示索引matrix = numpy.array([[5, 10, 15], [20, 25, 30],[35, 40, 45]])second_column_25 = (matrix[:,1] == 25)print second_column_25print(matrix[second_column_25, :])[False  True False][[20 25 30]]

对数组进行与、或运算

  #We can also perform comparisons with multiple conditionsvector = numpy.array([5, 10, 15, 20])equal_to_ten_and_five = (vector == 10) & (vector == 5)print equal_to_ten_and_five[False False False False]vector = numpy.array([5, 10, 15, 20])equal_to_ten_or_five = (vector == 10) | (vector == 5)print equal_to_ten_or_five[ True  True False False]

值类型转换

  vector = numpy.array(["1", "2", "3"])print vector.dtypeprint vectorvector = vector.astype(float)print vector.dtypeprint vector|S1['1' '2' '3']float64[ 1.  2.  3.]

聚合求解

  vector = numpy.array([5, 10, 15, 20])vector.sum()

按行求和（axis=1）

 matrix = numpy.array([[5, 10, 15], [20, 25, 30],[35, 40, 45]])matrix.sum(axis=1)array([ 30,  75, 120])

按列求和（axis=0）

  matrix = numpy.array([[5, 10, 15], [20, 25, 30],[35, 40, 45]])matrix.sum(axis=0)

矩阵操作np.arange生成0到N-1的整数

  import numpy as npa = np.arange(15).reshape(3, 5)aarray([[ 0,  1,  2,  3,  4],[ 5,  6,  7,  8,  9],[10, 11, 12, 13, 14]])a.ndim2a.dtype.name'int32'a.size15

矩阵初始化

  np.zeros ((3,4)) array([[ 0.,  0.,  0.,  0.],[ 0.,  0.,  0.,  0.],[ 0.,  0.,  0.,  0.]])np.ones( (2,3,4), dtype=np.int32 )array([[[1, 1, 1, 1],[1, 1, 1, 1],[1, 1, 1, 1]],[[1, 1, 1, 1],[1, 1, 1, 1],[1, 1, 1, 1]]])

按照间隔生成数据

  np.arange( 10, 30, 5 )array([10, 15, 20, 25])np.arange( 0, 2, 0.3 )array([ 0. ,  0.3,  0.6,  0.9,  1.2,  1.5,  1.8])

随机生成数据

  np.random.random((2,3))array([[ 0.40130659,  0.45452825,  0.79776512],[ 0.63220592,  0.74591134,  0.64130737]])

linspace在0到2pi之间取100个数

  from numpy import pinp.linspace( 0, 2*pi, 100 )array([ 0.    ,  0.06346652,  0.12693304,  0.19039955,  0.25386607,0.31733259,  0.38079911,  0.44426563,  0.50773215,  0.57119866,0.63466518,  0.6981317 ,  0.76159822,  0.82506474,  0.88853126,0.95199777,  1.01546429,  1.07893081,  1.14239733,  1.20586385,1.26933037,  1.33279688,  1.3962634 ,  1.45972992,  1.52319644,1.58666296,  1.65012947,  1.71359599,  1.77706251,  1.84052903,1.90399555,  1.96746207,  2.03092858,  2.0943951 ,  2.15786162,2.22132814,  2.28479466,  2.34826118,  2.41172769,  2.47519421,2.53866073,  2.60212725,  2.66559377,  2.72906028,  2.7925268 ,2.85599332,  2.91945984,  2.98292636,  3.04639288,  3.10985939,3.17332591,  3.23679243,  3.30025895,  3.36372547,  3.42719199,3.4906585 ,  3.55412502,  3.61759154,  3.68105806,  3.74452458,3.8079911 ,  3.87145761,  3.93492413,  3.99839065,  4.06185717,4.12532369,  4.1887902 ,  4.25225672,  4.31572324,  4.37918976,4.44265628,  4.5061228 ,  4.56958931,  4.63305583,  4.69652235,4.75998887,  4.82345539,  4.88692191,  4.95038842,  5.01385494,5.07732146,  5.14078798,  5.2042545 ,  5.26772102,  5.33118753,5.39465405,  5.45812057,  5.52158709,  5.58505361,  5.64852012,5.71198664,  5.77545316,  5.83891968,  5.9023862 ,  5.96585272,6.02931923,  6.09278575,  6.15625227,  6.21971879,  6.28318531])

矩阵基本操作

  #the product operator * operates elementwise in NumPy arraysa = np.array( [20,30,40,50] )b = np.arange( 4 )print (a)print (b)#bc = a-bprint (c)b**2print (b**2)print (a<35)[20 30 40 50][0 1 2 3][20 29 38 47][ True  True False False]

矩阵相乘

  #The matrix product can be performed using the dot function or methodA = np.array([[1,1],[0,1]] )B = np.array([[2,0],[3,4]])print (A)print (B)print (A*B)print (A.dot(B))print (np.dot(A, B) )[[1 1][0 1]][[2 0][3 4]][[2 0][0 4]][[5 4][3 4]][[5 4][3 4]]

矩阵操作floor向下取整

  import numpy as npB = np.arange(3)print (B)#print np.exp(B)print (np.sqrt(B))[0 1 2][0.         1.         1.41421356]#Return the floor of the inputa = np.floor(10*np.random.random((3,4)))#print a#Return the floor of the inputa = np.floor(10*np.random.random((3,4)))print (a)print(a.reshape(2,-1))[[0. 4. 2. 2.][8. 1. 5. 7.][0. 9. 7. 4.]][[0. 4. 2. 2. 8. 1.][5. 7. 0. 9. 7. 4.]]

hstack矩阵拼接

  a = np.floor(10*np.random.random((2,2)))b = np.floor(10*np.random.random((2,2)))print aprint '---'print bprint '---'print np.hstack((a,b))[[ 5.  6.][ 1.  5.]]---[[ 8.  6.][ 9.  0.]]---[[ 5.  6.  8.  6.][ 1.  5.  9.  0.]]a = np.floor(10*np.random.random((2,2)))b = np.floor(10*np.random.random((2,2)))print (a)print ('---')print (b)print ('---')#print np.hstack((a,b))np.vstack((a,b))[[7. 7.][2. 6.]]---[[0. 6.][0. 3.]]---array([[1., 0.],[3., 6.],[4., 2.],[8., 7.]])a = np.floor(10*np.random.random((2,12)))print (a)print (np.hsplit(a,3))[[6. 5. 2. 4. 2. 4. 9. 4. 4. 6. 8. 9.][8. 4. 0. 2. 6. 5. 2. 5. 0. 4. 1. 6.]][array([[6., 5., 2., 4.],[8., 4., 0., 2.]]), array([[2., 4., 9., 4.],[6., 5., 2., 5.]]), array([[4., 6., 8., 9.],[0., 4., 1., 6.]])]

任意选择切分位置

  print ( np.hsplit(a,(3,4)))   # Split a after the third and the fourth column[[2. 8. 4.    7.    6. 6. 5. 8. 8. 3. 0. 1.][3. 5. 9.    4.    5. 8. 7. 6. 2. 3. 8. 4.]][array([[2., 8., 4.],[3., 5., 9.]]), array([[7.],[4.]]), array([[6., 6., 5., 8., 8., 3., 0., 1.],[5., 8., 7., 6., 2., 3., 8., 4.]])]

变量赋值
变量视图
版权声明：本套技术专栏是作者（秦凯新）平时工作的总结和升华，通过从真实商业环境抽取案例进行总结和分享，并给出商业应用的调优建议和集群环境容量规划等内容，请持续关注本套博客。QQ邮箱地址：1120746959@qq.com，如有任何学术交流，可随时联系。

copy实现变量之间没有关系

  d = a.copy() d is ad[0,0] = 9999print d print a[[9999    1    2    3][1234    5    6    7][   8    9   10   11]][[   0    1    2    3][1234    5    6    7][   8    9   10   11]]

寻找列最大值索引

行列按照倍数扩展（行3倍列5倍）

  a = np.arange(0, 40, 10)b = np.tile(a, (3, 5)) print b[[ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30][ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30][ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30]]

按照元素大小排序并给出索引值

  a = np.array([4, 3, 1, 2])j = np.argsort(a)print jprint a[j][2 3 1 0][1 2 3 4]

对数组按照元素大小排序

  a = np.array([[4, 3, 5], [1, 2, 1]])#print ab = np.sort(a, axis=1)print (b)[[3 4 5][1 1 2]]

2 Pandas详细使用（底层基于Numpy）

2.1 Pandas基本操作

Pandas核心结构（DataFrame）
Pandas 字符型表示为object
Pandas数据基本类型展示

    import pandasfood_info = pandas.read_csv("food_info.csv")print(type(food_info))<class 'pandas.core.frame.DataFrame'>col_names = food_info.columns.tolist()['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)','Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)','Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)','Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)','Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE','Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)','FA_Poly_(g)', 'Cholestrl_(mg)']    print food_info.dtypesNDB_No               int64Shrt_Desc           objectWater_(g)          float64Energ_Kcal           int64Protein_(g)        float64Lipid_Tot_(g)      float64Ash_(g)            float64Carbohydrt_(g)     float64Fiber_TD_(g)       float64Sugar_Tot_(g)      float64Calcium_(mg)       float64Iron_(mg)          float64Magnesium_(mg)     float64Phosphorus_(mg)    float64Potassium_(mg)     float64Sodium_(mg)        float64Zinc_(mg)          float64Copper_(mg)        float64Manganese_(mg)     float64Selenium_(mcg)     float64Vit_C_(mg)         float64Thiamin_(mg)       float64Riboflavin_(mg)    float64Niacin_(mg)        float64Vit_B6_(mg)        float64Vit_B12_(mcg)      float64Vit_A_IU           float64Vit_A_RAE          float64Vit_E_(mg)         float64Vit_D_mcg          float64Vit_D_IU           float64Vit_K_(mcg)        float64FA_Sat_(g)         float64FA_Mono_(g)        float64FA_Poly_(g)        float64Cholestrl_(mg)     float64dtype: object

Pandas基本操作

  #可以指定数量#first_rows = food_info.head()#print(food_info.head(3))

    #print food_info.columns

    #print food_info.shape（8618,36）

取数据操作

  #pandas uses zero-indexing#Series object representing the row at index 0.#print food_info.loc[0]# Series object representing the seventh row.#food_info.loc[6]# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"#food_info.loc[8620]#The object dtype is equivalent to a string in Python

数据切片

  # Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.#food_info.loc[3:6]# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.# Method 1#two_five_ten = [2,5,10] #food_info.loc[two_five_ten]# Method 2#food_info.loc[[2,5,10]]

通过列名取出数据

  # Series object representing the "NDB_No" column.#ndb_col = food_info["NDB_No"]#print ndb_col# Alternatively, you can access a column by passing in a string variable.#col_name = "NDB_No"#ndb_col = food_info[col_name]

取出两个列的值

  #columns = ["Zinc_(mg)", "Copper_(mg)"]#zinc_copper = food_info[columns]#print zinc_copper#print zinc_copper# Skipping the assignment.#zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]

endswith 定位取值

  #print(food_info.columns)
  #print(food_info.head(2))
  col_names = food_info.columns.tolist()
  #print col_names
  gram_columns = []
  for c in col_names:
      if c.endswith("(g)"):
          gram_columns.append(c)
  gram_df = food_info[gram_columns]
  print(gram_df.head(3))

2.2 Series类型上场

Series 是一个带有名称和索引的一维数组，既然是数组，肯定要说到的就是数组中的元素类型，在 Series 中包含的数据类型可以是整数、浮点、字符串、Python对象等。

版权声明：本套技术专栏是作者（秦凯新）平时工作的总结和升华，通过从真实商业环境抽取案例进行总结和分享，并给出商业应用的调优建议和集群环境容量规划等内容，请持续关注本套博客。QQ邮箱地址：1120746959@qq.com，如有任何学术交流，可随时联系。
```
# 存储了 4 个年龄：18/30/25/40
user_age = pd.Series(data=[18, 30, 25, 40])
user_age
0    18
1    30
2    25
3    40
dtype: int64
```

指定索引

  user_age.index = ["Tom", "Bob", "Mary", "James"]user_ageTom      18Bob      30Mary     25James    40dtype: int64

为 index 起个名字

  user_age.index.name = "name"user_agenameTom      18Bob      30Mary     25James    40dtype: int64

给 Series 起个名字

  user_age.name="user_age_info"user_agenameTom      18Bob      30Mary     25James    40Name: user_age_info, dtype: int64

一个 Series 包括了 data、index 以及 name。

  # 构建索引name = pd.Index(["Tom", "Bob", "Mary", "James"], name="name")# 构建 Seriesuser_age = pd.Series(data=[18, 30, 25, 40], index=name, name="user_age_info")user_agenameTom      18Bob      30Mary     25James    40Name: user_age_info, dtype: int64# 指定类型为浮点型user_age = pd.Series(data=[18, 30, 25, 40], index=name, name="user_age_info", dtype=float)user_agenameTom      18.0Bob      30.0Mary     25.0James    40.0Name: user_age_info, dtype: float64

Series 包含了 dict 的特点，也就意味着可以使用与 dict 类似的一些操作。我们可以将 index 中的元素看成是 dict 中的 key。

 # 获取 Tom 的年龄user_age["Tom"]18.0user_age.get("Tom")18.0# 指定索引，获取第一个元素user_age[0]18.0# 获取前三个元素user_age[:3]nameTom     18.0Bob     30.0Mary    25.0Name: user_age_info, dtype: float64# 获取年龄大于30的元素user_age[user_age > 30]nameJames    40.0Name: user_age_info, dtype: float64# 获取第4个和第二个元素user_age[[3, 1]]nameJames    40.0Bob      30.0Name: user_age_info, dtype: float64

2.3 DataFrame隆重登场

DataFrame 是一个带有索引的二维数据结构，每列可以有自己的名字，并且可以有不同的数据类型。你可以把它想象成一个 excel 表格或者数据库中的一张表，DataFrame 是最常用的 Pandas 对象。
```
  index = pd.Index(data=["Tom", "Bob", "Mary", "James"], name="name")data = {"age": [18, 30, 25, 40],"city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen"]}user_info = pd.DataFrame(data=data, index=index)user_info
```

通过索引名来访问某行，这种办法需要借助 loc 方法

   user_info.loc["Tom"]age          18city    BeiJingName: Tom, dtype: object

通过这行所在的位置来选择这一行

  user_info.iloc[0]age          18city    BeiJingName: Tom, dtype: object

如何访问多行
```
  user_info.iloc[1:3]
```

访问列

  user_info.agenameTom      18Bob      30Mary     25James    40Name: age, dtype: int64user_info["age"]nameTom      18Bob      30Mary     25James    40Name: age, dtype: int64#可以变换列的顺序user_info[["city", "age"]]

3 matplotlib使用实践

折线图

  import pandas as pdunrate = pd.read_csv("C:\\ML\\MLData\\unrate.csv")unrate['DATE'] = pd.to_datetime(unrate['DATE'])print(unrate.head(12))DATE      VALUE0  1948-01-01    3.41  1948-02-01    3.82  1948-03-01    4.03  1948-04-01    3.94  1948-05-01    3.55  1948-06-01    3.66  1948-07-01    3.67  1948-08-01    3.98  1948-09-01    3.89  1948-10-01    3.710 1948-11-01    3.811 1948-12-01    4.0import matplotlib.pyplot as pltplt.plot()plt.show()

    first_twelve = unrate[0:12]plt.plot(first_twelve['DATE'], first_twelve['VALUE'])plt.show()

    plt.plot(first_twelve['DATE'], first_twelve['VALUE'])plt.xticks(rotation=45)#print help(plt.xticks)plt.show()

#xlabel(): accepts a string value, which gets set as the x-axis label.
#ylabel(): accepts a string value, which is set as the y-axis label.
#title(): accepts a string value, which is set as the plot title.
plt.plot(first_twelve['DATE'], first_twelve['VALUE'])
plt.xticks(rotation=90)
plt.xlabel('Month')
plt.ylabel('Unemployment Rate')
plt.title('Monthly Unemployment Trends, 1948')
plt.show()

多条折线图展示

  #原文缺少这一步：先从日期中提取月份，作为下面用到的 MONTH 列
  unrate['MONTH'] = unrate['DATE'].dt.month
  fig = plt.figure(figsize=(10,6))
  colors = ['red', 'blue', 'green', 'orange', 'black']
  for i in range(5):
      start_index = i*12
      end_index = (i+1)*12
      subset = unrate[start_index:end_index]
      label = str(1948 + i)
      plt.plot(subset['MONTH'], subset['VALUE'], c=colors[i], label=label)
  plt.legend(loc='upper left')
  plt.xlabel('Month, Integer')
  plt.ylabel('Unemployment Rate, Percent')
  plt.title('Monthly Unemployment Trends, 1948-1952')
  plt.show()

柱状图竖型展示

  import pandas as pdreviews = pd.read_csv('C:\\ML\\MLData\\fandango_scores.csv')cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']norm_reviews = reviews[cols]print(type(reviews))#打印出第一行print(norm_reviews[:1])<class 'pandas.core.frame.DataFrame'>

    import matplotlib.pyplot as pltfrom numpy import arange#取出第一行指定列num_cols的数据num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']bar_heights = norm_reviews.loc[0, num_cols].valuesprint(bar_heights)[4.3 3.55 3.9 4.5 5.0]bar_heights = norm_reviews.loc[0, num_cols].values#横轴位置bar_positions = arange(5) + 0.75#横轴标识的位置（1到6之间）tick_positions = range(1,6)fig, ax = plt.subplots()#0.5标识柱状图宽度ax.bar(bar_positions, bar_heights, 0.5)ax.set_xticks(tick_positions)ax.set_xticklabels(num_cols, rotation=90)ax.set_xlabel('Rating Source')ax.set_ylabel('Average Rating')ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')plt.show()

柱状图横向表示

  import matplotlib.pyplot as pltfrom numpy import arangenum_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']bar_widths = norm_reviews.loc[0, num_cols].valuesbar_positions = arange(5) + 0.75#横轴标识名的位置（1到6之间）tick_positions = range(1,6)fig, ax = plt.subplots()ax.barh(bar_positions, bar_widths, 0.6)ax.set_yticks(tick_positions)ax.set_yticklabels(num_cols)ax.set_ylabel('Rating Source')ax.set_xlabel('Average Rating')ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')plt.show()

散点图

  #Switching Axesfig = plt.figure(figsize=(5,10))ax1 = fig.add_subplot(2,1,1)ax2 = fig.add_subplot(2,1,2)ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])ax1.set_xlabel('Fandango')ax1.set_ylabel('Rotten Tomatoes')ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue'])ax2.set_xlabel('Rotten Tomatoes')ax2.set_ylabel('Fandango')plt.show()

Hist的bins区间统计

import pandas as pd
import matplotlib.pyplot as plt
reviews = pd.read_csv('C:\\ML\\MLData\\fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
print(norm_reviews[:5])#按照列进行分组聚合
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
fandango_distribution = fandango_distribution.sort_index()#按照列进行分组聚合
imdb_distribution = norm_reviews['IMDB_norm'].value_counts()
imdb_distribution = imdb_distribution.sort_index()
print(fandango_distribution)
2.7     2
2.8     2
2.9     5
3.0     4
3.1     3
3.2     5
3.3     4
3.4     9
3.5     9
3.6     8
3.7     9
3.8     5
3.9    12
4.0     7
4.1    16
4.2    12
4.3    11
4.4     7
4.5     9
4.6     4
4.8     3
Name: Fandango_Ratingvalue, dtype: int64
print(imdb_distribution)
2.00     1
2.10     1
2.15     1
2.20     1
2.30     2
2.45     2
2.50     1
2.55     1
2.60     2
2.70     4
2.75     5
2.80     2
2.85     1
2.90     1
2.95     3
3.00     2
3.05     4
3.10     1
3.15     9
3.20     6
3.25     4
3.30     9
3.35     7
3.40     1
3.45     7
3.50     4
3.55     7
3.60    10
3.65     5
3.70     8
3.75     6
3.80     3
3.85     4
3.90     9
3.95     2
4.00     1
4.05     1
4.10     4
4.15     1
4.20     2
4.30     1
Name: IMDB_norm, dtype: int64
fig, ax = plt.subplots()
#ax.hist(norm_reviews['Fandango_Ratingvalue'])
#ax.hist(norm_reviews['Fandango_Ratingvalue'],bins=20)
#指定区间（bins）为20个，取值范围为4到5
ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5),bins=20)
plt.show()

四分位盒图（箱线图）

  num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']fig, ax = plt.subplots()#指定统计列取出对应值ax.boxplot(norm_reviews[num_cols].values)ax.set_xticklabels(num_cols, rotation=90)ax.set_ylim(0,5)plt.show()

4 Seaborn专业可视化库（基于matplotlib）

风格设置

  import seaborn as snsimport numpy as npimport matplotlib as mplimport matplotlib.pyplot as plt%matplotlib inlinesns.set_style("whitegrid")data = np.random.normal(size=(20, 6)) + np.arange(6) / 2sns.boxplot(data=data)

     sns.set_style("dark")sinplot()sns.set_style("white")sinplot() sns.set_style("whitegrid")sns.boxplot(data=data, palette="deep")sns.despine(left=True)

调色板设置

  import numpy as npimport seaborn as snsimport matplotlib.pyplot as plt%matplotlib inlinesns.set(rc={"figure.figsize": (6, 6)})current_palette = sns.color_palette()sns.palplot(current_palette)

    6个默认的颜色循环主题： deep, muted, pastel, bright, dark, colorblind
    sns.palplot(sns.color_palette("hls", 8))

    data = np.random.normal(size=(20, 8)) + np.arange(8) / 2sns.boxplot(data=data,palette=sns.color_palette("hls", 8))data = np.random.normal(size=(20, 8)) + np.arange(8) / 2#print(data)sns.boxplot(data=data,palette=sns.color_palette("hls", 8))

区间直方图绘制（kde是否指定核密度估计）

  x = np.random.gamma(6, size=200)sns.distplot(x, kde=False, fit=stats.gamma)

线性回归1

  %matplotlib inlineimport numpy as npimport pandas as pdimport matplotlib as mplimport matplotlib.pyplot as pltimport seaborn as snssns.set(color_codes=True)np.random.seed(sum(map(ord, "regression")))tips = sns.load_dataset("tips")tips.head()

sns.regplot(x="total_bill", y="tip", data=tips)

线性回归2

  sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips);

多分类问题

  %matplotlib inlineimport numpy as npimport pandas as pdimport matplotlib as mplimport matplotlib.pyplot as pltimport seaborn as snssns.set(style="whitegrid", color_codes=True)np.random.seed(sum(map(ord, "categorical")))titanic = sns.load_dataset("titanic")tips = sns.load_dataset("tips")iris = sns.load_dataset("iris")sns.stripplot(x="day", y="total_bill", data=tips);

  sns.stripplot(x="day", y="total_bill", data=tips, jitter=True)

树桩图（swarmplot）均匀展示

  sns.swarmplot(x="day", y="total_bill", data=tips)

树桩图（swarmplot）均匀并分类展示

  sns.swarmplot(x="day", y="total_bill", hue="sex",data=tips)

盒图

 IQR即统计学概念四分位距，是第一四分位数（Q1）与第三四分位数（Q3）之间的距离。
 令 N = 1.5×IQR，如果一个值 > Q3+N 或 < Q1-N，则为离群点。
 #横杠表示最小值和最大值
 sns.boxplot(x="day", y="total_bill", hue="time", data=tips);

小提琴图（越胖包含的数据越多）

  sns.violinplot(x="day", y="total_bill", hue="sex", data=tips, split=True);

葫芦图

 sns.violinplot(x="day", y="total_bill", data=tips, inner=None)sns.swarmplot(x="day", y="total_bill", data=tips, color="w", alpha=.5)

柱状分类统计图

  sns.barplot(x="sex", y="survived", hue="class", data=titanic);

点图可以更好的描述变化差异

  sns.pointplot(x="sex", y="survived", hue="class", data=titanic);

    sns.pointplot(x="class", y="survived", hue="sex", data=titanic,palette={"male": "g", "female": "m"},markers=["^", "o"], linestyles=["-", "--"]);

多层面板分类图

  sns.factorplot(x="day", y="total_bill", hue="smoker", data=tips)

    sns.factorplot(x="day", y="total_bill", hue="smoker", data=tips, kind="bar")

    sns.factorplot(x="day", y="total_bill", hue="smoker",col="time", data=tips, kind="swarm")

    sns.factorplot(x="time", y="total_bill", hue="smoker",col="day", data=tips, kind="box", size=4, aspect=.5)

FacetGrid 多参数网格面板

  %matplotlib inlineimport numpy as npimport pandas as pdimport seaborn as snsfrom scipy import statsimport matplotlib as mplimport matplotlib.pyplot as pltsns.set(style="ticks")np.random.seed(sum(map(ord, "axis_grids")))tips = sns.load_dataset("tips")tips.head()

    g = sns.FacetGrid(tips, col="time")g.map(plt.hist, "tip");

    g = sns.FacetGrid(tips, col="sex", hue="smoker")g.map(plt.scatter, "total_bill", "tip", alpha=.7)g.add_legend();

    g = sns.FacetGrid(tips, row="smoker", col="time", margin_titles=True)g.map(sns.regplot, "size", "total_bill", color=".1", fit_reg=False, x_jitter=.1);

热力图

   %matplotlib inlineimport matplotlib.pyplot as pltimport numpy as np; np.random.seed(0)import seaborn as sns;sns.set()uniform_data = np.random.rand(3, 3)print (uniform_data)heatmap = sns.heatmap(uniform_data)[[ 0.0187898   0.6176355   0.61209572][ 0.616934    0.94374808  0.6818203 ][ 0.3595079   0.43703195  0.6976312 ]]

    ax = sns.heatmap(flights, linewidths=.5)

5 总结

方便复习，整成笔记，内容粗略，勿怪，待完善。

版权声明：本套技术专栏是作者（秦凯新）平时工作的总结和升华，通过从真实商业环境抽取案例进行总结和分享，并给出商业应用的调优建议和集群环境容量规划等内容，请持续关注本套博客。QQ邮箱地址：1120746959@qq.com，如有任何学术交流，可随时联系。
秦凯新于深圳。

更多推荐

Python基础算法库及可视化库使用实践

本文发布于:2024-02-14 14:00:11，感谢您对本站的认可！

本文链接:https://www.elefans.com/category/jswz/34/1763677.html

算法基础 Python

发布评论取消回复

评论列表（有 0 条评论）

Python基础算法库及可视化库使用实践