Hallo,各位小伙伴大家好啊!这个专栏是用来分享数据处理以及数据可视化的一些常见操作,以及自己的一些学习笔记,希望能给大家带来帮助呀!感兴趣的小伙伴也欢迎私信或者评论区交流呀!
以下可视化的数据来源为“transcript.xlsx”成绩单文件,他有六列,分别是学生姓名、班级、数学科目得分、体育科目得分、python语言得分、美学科目得分。基于这个数据表,进行数据处理和可视化操作。
柱形图,又称长条图、柱状统计图亦称条图条状图、棒形图,是一种以长方形的长度为变量的统计图表。长条图用来比较两个或以上的价值(不同时间或者不同条件),只有一个变量,通常利用于较小的数据集分析。长条图亦可横向排列,或用多维方式表达。
代码如下:
- import pandas as pd #导入pandas库
- import matplotlib.pyplot as plt
- import numpy as np
- plt.rcParams['font.sans-serif']=['SimHei']
- plt.rcParams['axes.unicode_minus']=False
-
- excel_file = 'transcript.xlsx' #导入文件
- data = pd.read_excel(excel_file) #读入数据
-
- def func(type):
- type_class = data.loc[data['class'] == type]
- class_math = type_class["math"]
- class_physics = type_class["physics"]
- class_python = type_class["python"]
- class_aesthetics = type_class["aesthetics"]
- type_dic = {}
- type_dic["math"] = sum(class_math)/len(class_math)
- type_dic["physics"] = sum(class_physics)/len(class_physics)
- type_dic["python"] = sum(class_python)/len(class_python)
- type_dic["aesthetics"] = sum(class_aesthetics)/len(class_aesthetics)
- return type_dic
-
- dic_A = func("A")
- dic_B = func("B")
- dic_C = func("C")
- dic_D = func("D")
- y1 = list(dic_A.values())
- y2 = list(dic_B.values())
- y3 = list(dic_C.values())
- y4 = list(dic_D.values())
- x = np.arange(len(y1))
-
- #设置柱状图的宽度
- width = 0.1
- #绘图
- plt.figure(figsize=(8,4))
- plt.bar(x=x,height=y1,width=width,label='math')
- plt.bar(x=x+width,height=y2,width=width,label='physics')
- plt.bar(x=x+2*width,height=y3,width=width,label='python')
- plt.bar(x=x+3*width,height=y4,width=width,label='aesthetics')
- plt.xlabel('平均成绩')
- plt.ylabel('班级')
-
- #添加图标题和图例
- a = [0,1,2,3]
- labels = ['A', 'B', 'C', 'D']
- plt.xticks(a,labels,rotation = 30)
- plt.title('各个班级四个科目平均成绩垂直柱形图')
- plt.legend(bbox_to_anchor=(0.1, 1))
- plt.show()
效果如下:
垂直堆叠柱状图是柱状图的一种,可以在较小的可视化应用空间内,智能地展示多维的数据差异。支持自定义y轴区间、多系列数据配置以及堆叠式的数据展示。
代码如下:
- import pandas as pd #导入pandas库
- import matplotlib.pyplot as plt
- import numpy as np
- plt.rcParams['font.sans-serif']=['SimHei']
- plt.rcParams['axes.unicode_minus']=False
-
- excel_file = 'transcript.xlsx' #导入文件
- data = pd.read_excel(excel_file) #读入数据
-
- def func(type):
- type_class = data.loc[data['class'] == type]
- class_math = type_class["math"]
- class_physics = type_class["physics"]
- class_python = type_class["python"]
- class_aesthetics = type_class["aesthetics"]
- type_dic = {}
- type_dic["math"] = sum(class_math)/len(class_math)
- type_dic["physics"] = sum(class_physics)/len(class_physics)
- type_dic["python"] = sum(class_python)/len(class_python)
- type_dic["aesthetics"] = sum(class_aesthetics)/len(class_aesthetics)
- return type_dic
-
- dic_A = func("A")
- dic_B = func("B")
- dic_C = func("C")
- dic_D = func("D")
- y1 = list(dic_A.values())
- y2 = list(dic_B.values())
- y3 = list(dic_C.values())
- y4 = list(dic_D.values())
- y_list = [y1,y2,y3,y4]
- x = np.arange(len(y1))
- width = 0.1
- plt.xlabel('班级')
- plt.ylabel('分数')
- def push(i):
- #设置柱状图的宽度
- if i == 0:
- plt.bar(x[i], y_list[i][0], alpha=0.7, width=0.3, color='green',label = "math")
- plt.bar(x[i], y_list[i][1], alpha=0.7, width=0.3, color='red',bottom=y_list[i][0],label = "physics")
- plt.bar(x[i], y_list[i][2], alpha=0.7, width=0.3, color='black',bottom=y_list[i][1],label = "python")
- plt.bar(x[i], y_list[i][3], alpha=0.7, width=0.3, color='yellow',bottom=y_list[i][2],label = "aesthetics")
- else:
- plt.bar(x[i], y_list[i][0], alpha=0.7, width=0.3, color='green')
- plt.bar(x[i], y_list[i][1], alpha=0.7, width=0.3, color='red', bottom=y_list[i][0])
- plt.bar(x[i], y_list[i][2], alpha=0.7, width=0.3, color='black', bottom=y_list[i][1])
- plt.bar(x[i], y_list[i][3], alpha=0.7, width=0.3, color='yellow', bottom=y_list[i][2])
- push(0)
- push(1)
- push(2)
- push(3)
-
- a = [0,1,2,3]
- labels = ['A', 'B', 'C', 'D']
- plt.xticks(a,labels,rotation = 30)
- plt.legend(bbox_to_anchor=(0.30, 0.75))
- plt.title('垂直堆叠柱形图')
- plt.show()
效果如下:
直方图又称质量分布图,是一种统计报告图,它是根据具体数据的分布情况,画成以组距为底边、以频数为高度的一系列连接起来的直方型矩形图。
代码如下:
- import pandas as pd #导入pandas库
- import matplotlib.pyplot as plt
- import numpy as np
- plt.rcParams['font.sans-serif']=['SimHei']
- plt.rcParams['axes.unicode_minus']=False
-
- excel_file = 'transcript.xlsx' #导入文件
- data = pd.read_excel(excel_file) #读入数据
- math = data["math"]
- physics = data["physics"]
- python = data["python"]
- aesthetics = data["aesthetics"]
-
- # 计算组数
- def histo(a,subject):
- d = 3 # 组距
- num_bins = (max(a) - min(a)) // d
- # 设置图形大小
- plt.figure(figsize=(20, 8), dpi=80)
- plt.hist(a, num_bins)
- # 设置x轴刻度
- plt.xticks(range(min(a), max(a) + d, d))
- # 设置网格
- plt.grid(alpha=0.4)
- plt.ylabel(subject+"分数",fontsize=60)
- plt.title(subject+"科目直方图",fontsize=60)
- # print(math)
- histo(math,"math")
- histo(physics,"physics")
- histo(python,"python")
- histo(aesthetics,"aesthetics")
- plt.show()
效果如下:
箱线图是用来表示一组或多组连续型数据分布的中心位置和散布范围的图形,因形似箱子故取名为箱线图。
代码如下:
按各个科目的分数分布箱线图:
- import pandas as pd
- import matplotlib.pyplot as plt
-
- # 读取excel文件
- file_01 = pd.read_excel("transcript.xlsx")
-
- fig = plt.figure(figsize=(16, 8))
- d1 = file_01['math']
- d2 = file_01['physics']
- d3 = file_01['python']
- d4 = file_01['aesthetics']
-
- label = 'math', 'physics', 'python', 'aesthetics'
- plt.boxplot([d1, d2, d3, d4], labels=label) # label设置横轴每个箱图对应的横坐标
- plt.xticks(fontproperties='KaiTi')
- plt.xlabel('变量', fontproperties='KaiTi',fontsize=40)
- plt.ylabel('变量值', fontproperties='KaiTi',fontsize=40)
- plt.show()
按班级的科目分数分布箱线图:
- import pandas as pd
- import matplotlib.pyplot as plt
-
- # 读取excel文件
- data = pd.read_excel("transcript.xlsx")
- def func(type):
- type_class = data.loc[data['class'] == type]
- d1 = type_class["math"]
- d2 = type_class["physics"]
- d3 = type_class["python"]
- d4 = type_class["aesthetics"]
- label = 'math', 'physics', 'python', 'aesthetics'
- plt.boxplot([d1, d2, d3, d4], labels=label) # label设置横轴每个箱图对应的横坐标
- plt.xticks(fontproperties='KaiTi')
- plt.xlabel('变量', fontproperties='KaiTi', fontsize=20)
- plt.ylabel('变量值', fontproperties='KaiTi', fontsize=20)
- plt.show()
- func("A")
- func("B")
- func("C")
- func("D")
效果如下: