考研数据集:数据集下载
运行工具:jupyter notebook
博文通过分析各大高校考研招生信息数据,看看招生专业数量前十的学校、部分专业总分的最高分和最低分,以及各热门专业总分 TOP20 的学校。
希望对小伙伴们有所帮助,如有疑问或者需要改进的地方可以在评论区留言。
涉及到的库:
Pandas — 数据处理
Pyecharts — 数据可视化
可视化部分:
柱状图 — Bar
象形图 — PictorialBar
词云图 — stylecloud
组合组件 — Grid
!pip install stylecloud
import re
import stylecloud
from PIL import Image
import numpy as np
import pandas as pd
from collections import Counter
from pyecharts.charts import Line,PictorialBar,Bar,Grid
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from pyecharts.globals import SymbolType
from pyecharts.commons.utils import JsCode
# Load the admission-score CSV, then preview the first ten rows and the
# per-column summary.
df = pd.read_csv(filepath_or_buffer='/home/mw/input/202201106373/考研信息.csv')
df.head(n=10)
df.info()
可以看到,学校名称、院系名称、专业代码等列存在部分数据缺失
# Remove, in place, every row that has at least one missing value, then keep
# only the rows whose 年份 equals 2020 and print that subset's summary.
df.dropna(axis=0, how='any', inplace=True)
df_2020 = df.loc[df['年份'] == 2020]
df_2020.info()
可以看到,没有缺失数据了
# Count fully duplicated rows, and rows repeating a (学校名称, 专业代码) pair.
df_2020.duplicated().sum()
df_2020.duplicated(subset=["学校名称", "专业代码"]).sum()
# Rows whose 总分 is the placeholder '-' cannot be cast to int, so they are
# dropped first. The explicit .copy() makes df_2020 an independent frame, so
# the column assignment below is not a write into a filtered slice (which
# pandas reports as SettingWithCopyWarning).
df_2020 = df_2020[df_2020['总分'] != '-'].copy()
df_2020['总分'] = df_2020['总分'].astype('int')
df_2020.info()
# Number of 专业名称 entries per school in the 2020 data, keeping the ten
# schools with the largest count.
df_major_10 = (
    df_2020.groupby('学校名称')['专业名称']
    .count()
    .sort_values(ascending=False)[:10]
    .to_frame('数量')
)
# Ascending by count; this is the category order handed to the charts.
df_major_10 = df_major_10.sort_values(by='数量')
schs = list(df_major_10.index)

# Logo image URL per school, in the 'image://<url>' symbol form.
# NOTE(review): the keys are hard-coded; a school in `schs` that is missing
# here raises KeyError below - confirm the list still matches the data.
sch_icons = {
    '武汉大学': 'image://https://www.shanghairanking.cn/_uni/logo/46182017.png',
    '吉林大学': 'image://https://www.shanghairanking.cn/_uni/logo/76557044.png',
    '厦门大学': 'image://https://www.shanghairanking.cn/_uni/logo/14008229.png',
    '西南大学': 'image://https://www.shanghairanking.cn/_uni/logo/68012227.png',
    '北京大学': 'image://https://www.shanghairanking.cn/_uni/logo/86350223.png',
    '四川大学': 'image://https://www.shanghairanking.cn/_uni/logo/75651370.png',
    '山东大学': 'image://https://www.shanghairanking.cn/_uni/logo/97189370.png',
    '复旦大学': 'image://https://www.shanghairanking.cn/_uni/logo/28312850.png',
    '云南大学': 'image://https://www.shanghairanking.cn/_uni/logo/31586909.png',
    '南开大学': 'image://https://www.shanghairanking.cn/_uni/logo/44629152.png',
}

# One data item per school, in the same order as `schs`; every item has the
# constant value 1 and carries the school's logo as its symbol.
icons = [dict(name=sch, value=1, symbol=sch_icons[sch]) for sch in schs]
# Pictorial bar showing the `icons` items (one per entry of `schs`). The
# builder methods are called as separate statements on the same chart object
# instead of one fluent chain.
p1 = PictorialBar()
p1.add_xaxis(schs)
p1.add_yaxis(
    "",
    icons,
    label_opts=opts.LabelOpts(is_show=False),
    category_gap='40%',
    symbol_pos='start',
    symbol_size=60,
    is_symbol_clip=False,
    itemstyle_opts={
        "normal": {
            'shadowBlur': 10,
            'shadowColor': 'rgba(0, 0, 0, 0.5)',
            'shadowOffsetX': 10,
            'shadowOffsetY': 10,
        }
    },
)
p1.set_global_opts(
    # Axis configured as x here is hidden entirely.
    xaxis_opts=opts.AxisOpts(is_show=False),
    # Axis configured as y keeps only its labels: ticks, split lines and the
    # axis line are all switched off.
    yaxis_opts=opts.AxisOpts(
        is_show=True,
        is_scale=True,
        axistick_opts=opts.AxisTickOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(
            font_size=20, color='#ed1941', font_weight=700, margin=20
        ),
        splitline_opts=opts.SplitLineOpts(
            is_show=False,
            linestyle_opts=opts.LineStyleOpts(type_='dashed'),
        ),
        axisline_opts=opts.AxisLineOpts(
            is_show=False,
            linestyle_opts=opts.LineStyleOpts(width=2, color='#DB7093'),
        ),
    ),
)
p1.reversal_axis()
# Bar chart of the 数量 column of df_major_10, one bar per entry of `schs`.
# Built with separate statements on the same chart object.
b1 = Bar()
b1.add_xaxis(schs)
b1.add_yaxis('', df_major_10['数量'].values.tolist(), category_gap='40%')
b1.set_series_opts(
    # NOTE(review): horizontal_align='top' / vertical_align='middle' are kept
    # exactly as written; 'top' looks like a vertical value - confirm intent.
    label_opts=opts.LabelOpts(
        position="insideLeft",
        vertical_align='middle',
        horizontal_align='top',
        font_size=18,
        font_weight='bold',
        formatter=' {c} ',
    ),
    itemstyle_opts={
        'opacity': 0.9,
        'shadowBlur': 10,
        'shadowOffsetX': 10,
        'shadowOffsetY': 10,
        'shadowColor': 'rgba(0, 0, 0, 0.5)',
        'barBorderRadius': [30, 30, 30, 30],
        'color': 'red',
    },
)
b1.set_global_opts(
    yaxis_opts=opts.AxisOpts(is_show=False),
    # Value axis with its line, labels, split lines and ticks all hidden.
    xaxis_opts=opts.AxisOpts(
        is_scale=True,
        type_="value",
        name_location="middle",
        position='top',
        name_textstyle_opts=opts.TextStyleOpts(font_size=14, font_weight='bold'),
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(is_show=False),
        splitline_opts=opts.SplitLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    title_opts=opts.TitleOpts(
        title='招生专业数量前十的学校',
        title_textstyle_opts=opts.TextStyleOpts(color="blue", font_size=30),
        pos_top='2%',
        pos_left='center',
    ),
)
b1.reversal_axis()
# Place the logo chart (p1) in a narrow left strip and the bar chart (b1) to
# its right inside one 1000x800 Grid, then render it in the notebook.
grid = (
    Grid(init_opts=opts.InitOpts(theme='light', width='1000px', height='800px'))
    .add(
        p1,
        is_control_axis_index=False,
        grid_opts=opts.GridOpts(pos_left='15%', pos_right='80%', pos_top='10%'),
    )
    .add(
        b1,
        is_control_axis_index=False,
        grid_opts=opts.GridOpts(pos_left='23%', pos_right='10%', pos_top='10%'),
    )
)
grid.render_notebook()
# Highest / lowest 总分 per major for a hand-picked set of majors (2020 data).
# A row is kept when its 专业名称 matches any alternative of this pattern.
major_pattern = '计算机科学与技术|工商管理|会计|管理科学与工程|金融|法律|材料科学与工程|公共管理|机械工程|数学'
df_tmp = df_2020[df_2020["专业名称"].str.contains(major_pattern)]
# Named aggregation pins the result columns to mean / amax / amin. Passing
# np.max / np.min as callables derives the column names from the functions'
# names, which is not stable across numpy / pandas versions.
df_major = df_tmp.groupby('专业名称')['总分'].agg(mean='mean', amax='max', amin='min')
# Keep the 20 majors with the highest mean; .copy() so the column write below
# targets an independent frame rather than a slice.
df_major = df_major.sort_values(by=['mean'], ascending=False)[:20].copy()
# Minimum scores are negated so the two series extend in opposite directions
# from zero; the label formatter below flips the sign back for display.
df_major['amin'] = df_major['amin'] * (-1)
# Category labels and both value series must share one row order. The frame
# is reversed once and all three are read from it, so every major is drawn
# with its own scores.
df_plot = df_major.iloc[::-1]
bar = (
    Bar()
    .add_xaxis(df_plot.index.tolist())
    .add_yaxis(
        '最高分',
        df_plot['amax'].tolist(),
        z_level=1,
        stack='1',
        category_gap='50%',
        tooltip_opts=opts.TooltipOpts(is_show=False),
        label_opts=opts.LabelOpts(position='insideLeft', formatter='{c} 分'),
        itemstyle_opts={
            "normal": {
                "barBorderRadius": [30, 30, 30, 30],
                'shadowBlur': 10,
                'shadowColor': 'rgba(120, 36, 50, 0.5)',
                'shadowOffsetY': 5,
                'color': '#a61e4d',
            }
        },
    )
    .add_yaxis(
        '最低分',
        df_plot['amin'].tolist(),
        z_level=1,
        stack='1',
        category_gap='50%',
        tooltip_opts=opts.TooltipOpts(is_show=False),
        label_opts=opts.LabelOpts(
            position='insideRight',
            # Show the stored negative value as a positive score.
            formatter=JsCode(
                "function(params) {\n"
                "if (params.value && params.value < 0)\n"
                "{\n"
                "return -params.value + '分';\n"
                "}\n"
                "}"
            ),
        ),
        itemstyle_opts={
            "normal": {
                "barBorderRadius": [30, 30, 30, 30],
                'shadowBlur': 10,
                'shadowColor': 'rgba(120, 36, 50, 0.5)',
                'shadowOffsetY': 5,
                'color': '#009ad6',
            }
        },
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='部分专业的最高分和最低分',
            pos_top='1%',
            pos_left='40%',
            title_textstyle_opts=opts.TextStyleOpts(font_size=20, color='#fff000'),
        ),
        legend_opts=opts.LegendOpts(is_show=True, pos_top='4%', pos_left='45%'),
        datazoom_opts=opts.DataZoomOpts(
            type_='inside',
            range_start=10,  # initial zoom window: 10% - 100%
            range_end=100,
            orient='vertical',
        ),
        xaxis_opts=opts.AxisOpts(is_show=False, max_=500),
        yaxis_opts=opts.AxisOpts(
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(is_show=False),
            axislabel_opts=opts.LabelOpts(color='#00c6d7', font_size=12, font_weight='bold'),
        ),
    )
    .reversal_axis()
)
grid = (
    Grid(init_opts=opts.InitOpts(theme='purple-passion', width='1000px', height='800px'))
    .add(bar, grid_opts=opts.GridOpts(pos_top='8%', pos_left='18%', pos_right='5%'))
)
grid.render_notebook()
# JavaScript snippet constructing a vertical white-to-red ECharts gradient.
color_js = (
    "new echarts.graphic.LinearGradient(0, 1, 0, 0,\n"
    "[{offset: 0, color: '#FFFFFF'}, {offset: 1, color: '#ed1941'}], false)"
)
# Eight-step light-to-dark red palette passed to the visual-map option of the
# TOP20 charts.
range_colors = [
    '#fff5f0',
    '#fee0d2',
    '#fcbba1',
    '#fc9272',
    '#fb6a4a',
    '#ef3b2c',
    '#cb181d',
    '#99000d',
]
# 2020 rows whose major is exactly '计算机科学与技术', highest 总分 first,
# limited to 20 rows. .copy() makes the result an independent frame so the
# column assignment below is not a write into a slice (SettingWithCopyWarning).
df_computer = df_2020.loc[df_2020['专业名称'] == '计算机科学与技术', :]
df_computer = df_computer.sort_values(by='总分', ascending=False)[:20].copy()
df_computer['总分'] = df_computer['总分'].astype('int')
# Styled table with an in-cell bar for 总分 scaled between 300 and 360.
df_computer.style.bar(subset=['总分'], color='#ed1941', vmin=300, vmax=360)
x_data = df_computer['学校名称'].values.tolist()
y_data = df_computer['总分'].values.tolist()
bar2 = (
    Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
    .add_xaxis(x_data)
    .add_yaxis(
        '',
        y_data,
        category_gap='30%',
        itemstyle_opts={
            'normal': {
                'shadowColor': 'rgba(0, 0, 0, .5)',
                'shadowBlur': 5,
                'shadowOffsetY': 2,
                'shadowOffsetX': 2,
                'borderColor': '#fff',
            }
        },
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
    )
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(
                rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
            ),
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(is_show=False),
        ),
        # Hidden value axis limited to 300-360.
        yaxis_opts=opts.AxisOpts(
            is_show=False,
            max_=360,
            min_=300,
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(is_show=False),
        ),
        title_opts=opts.TitleOpts(
            title='计算机科学与技术专业 TOP20 学校',
            pos_left='center',
            pos_top='4%',
            title_textstyle_opts=opts.TextStyleOpts(
                color='#fec44f', font_size=24, font_weight='bold'
            ),
        ),
        # Colour scale spans 200-400, wider than the axis range above.
        visualmap_opts=opts.VisualMapOpts(
            is_show=False,
            max_=400,
            min_=200,
            range_color=range_colors,
        ),
    )
)
bar2.render_notebook()
# First 20 rows of the 2020 data whose major name contains '管理科学与工程',
# ordered by 总分 descending.
df_management = df_2020[df_2020['专业名称'].str.contains('管理科学与工程')]
df_management = df_management.sort_values(by='总分', ascending=False).head(20)
x_data = df_management['学校名称'].tolist()
y_data = df_management['总分'].tolist()

# The chart is configured with separate statements on one Bar object.
bar2 = Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
bar2.add_xaxis(x_data)
bar2.add_yaxis(
    '',
    y_data,
    category_gap='30%',
    itemstyle_opts={
        'normal': {
            'shadowColor': 'rgba(0, 0, 0, .5)',
            'shadowBlur': 5,
            'shadowOffsetY': 2,
            'shadowOffsetX': 2,
            'borderColor': '#fff',
        }
    },
)
bar2.set_series_opts(
    label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
)
bar2.set_global_opts(
    xaxis_opts=opts.AxisOpts(
        axislabel_opts=opts.LabelOpts(
            rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
        ),
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    # Hidden value axis limited to 360-405.
    yaxis_opts=opts.AxisOpts(
        is_show=False,
        max_=405,
        min_=360,
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    title_opts=opts.TitleOpts(
        title='管理科学与工程专业 TOP20 学校',
        pos_left='center',
        pos_top='4%',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#fec44f', font_size=24, font_weight='bold'
        ),
    ),
    # Colour scale spans 360-410.
    visualmap_opts=opts.VisualMapOpts(
        is_show=False,
        max_=410,
        min_=360,
        range_color=range_colors,
    ),
)
bar2.render_notebook()
# First 20 rows of the 2020 data whose major name contains '会计', ordered by
# 总分 descending.
df_accountant = df_2020[df_2020['专业名称'].str.contains('会计')]
df_accountant = df_accountant.sort_values(by='总分', ascending=False).head(20)
x_data = df_accountant['学校名称'].tolist()
y_data = df_accountant['总分'].tolist()

# The chart is configured with separate statements on one Bar object.
bar2 = Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
bar2.add_xaxis(x_data)
bar2.add_yaxis(
    '',
    y_data,
    category_gap='30%',
    itemstyle_opts={
        'normal': {
            'shadowColor': 'rgba(0, 0, 0, .5)',
            'shadowBlur': 5,
            'shadowOffsetY': 2,
            'shadowOffsetX': 2,
            'borderColor': '#fff',
        }
    },
)
bar2.set_series_opts(
    label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
)
bar2.set_global_opts(
    xaxis_opts=opts.AxisOpts(
        axislabel_opts=opts.LabelOpts(
            rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
        ),
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    # Hidden value axis limited to 350-400.
    yaxis_opts=opts.AxisOpts(
        is_show=False,
        max_=400,
        min_=350,
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    title_opts=opts.TitleOpts(
        title='会计专业 TOP20 学校',
        pos_left='center',
        pos_top='4%',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#fec44f', font_size=24, font_weight='bold'
        ),
    ),
    # Colour scale spans the same 350-400 range as the axis.
    visualmap_opts=opts.VisualMapOpts(
        is_show=False,
        max_=400,
        min_=350,
        range_color=range_colors,
    ),
)
bar2.render_notebook()
# First 20 rows of the 2020 data whose major name contains '工商管理', ordered
# by 总分 descending.
df_mba = df_2020[df_2020['专业名称'].str.contains('工商管理')]
df_mba = df_mba.sort_values(by='总分', ascending=False).head(20)
x_data = df_mba['学校名称'].tolist()
y_data = df_mba['总分'].tolist()

# The chart is configured with separate statements on one Bar object.
bar2 = Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
bar2.add_xaxis(x_data)
bar2.add_yaxis(
    '',
    y_data,
    category_gap='30%',
    itemstyle_opts={
        'normal': {
            'shadowColor': 'rgba(0, 0, 0, .5)',
            'shadowBlur': 5,
            'shadowOffsetY': 2,
            'shadowOffsetX': 2,
            'borderColor': '#fff',
        }
    },
)
bar2.set_series_opts(
    label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
)
bar2.set_global_opts(
    xaxis_opts=opts.AxisOpts(
        axislabel_opts=opts.LabelOpts(
            rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
        ),
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    # Hidden value axis limited to 340-390.
    yaxis_opts=opts.AxisOpts(
        is_show=False,
        max_=390,
        min_=340,
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    title_opts=opts.TitleOpts(
        title='工商管理专业 TOP20 学校',
        pos_left='center',
        pos_top='4%',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#fec44f', font_size=24, font_weight='bold'
        ),
    ),
    # Colour scale spans the same 340-390 range as the axis.
    visualmap_opts=opts.VisualMapOpts(
        is_show=False,
        max_=390,
        min_=340,
        range_color=range_colors,
    ),
)
bar2.render_notebook()
# First 20 rows of the 2020 data whose major name contains '法律', ordered by
# 总分 descending.
df_law = df_2020[df_2020['专业名称'].str.contains('法律')]
df_law = df_law.sort_values(by='总分', ascending=False).head(20)
x_data = df_law['学校名称'].tolist()
y_data = df_law['总分'].tolist()

# The chart is configured with separate statements on one Bar object.
bar2 = Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
bar2.add_xaxis(x_data)
bar2.add_yaxis(
    '',
    y_data,
    category_gap='30%',
    itemstyle_opts={
        'normal': {
            'shadowColor': 'rgba(0, 0, 0, .5)',
            'shadowBlur': 5,
            'shadowOffsetY': 2,
            'shadowOffsetX': 2,
            'borderColor': '#fff',
        }
    },
)
bar2.set_series_opts(
    label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
)
bar2.set_global_opts(
    xaxis_opts=opts.AxisOpts(
        axislabel_opts=opts.LabelOpts(
            rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
        ),
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    # Hidden value axis limited to 350-380.
    yaxis_opts=opts.AxisOpts(
        is_show=False,
        max_=380,
        min_=350,
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    title_opts=opts.TitleOpts(
        title='法律专业 TOP20 学校',
        pos_left='center',
        pos_top='4%',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#fec44f', font_size=24, font_weight='bold'
        ),
    ),
    # Colour scale spans the same 350-380 range as the axis.
    visualmap_opts=opts.VisualMapOpts(
        is_show=False,
        max_=380,
        min_=350,
        range_color=range_colors,
    ),
)
bar2.render_notebook()
# 2020 rows whose major name contains '金融', highest 总分 first, limited to
# 20 rows.
df_finance = df_2020.loc[df_2020['专业名称'].str.contains('金融'), :]
df_finance = df_finance.sort_values(by='总分', ascending=False)[:20]
# The chart data is read from df_finance. Reading it from another major's
# frame would draw that major's schools and scores under the 金融 title.
x_data = df_finance['学校名称'].values.tolist()
y_data = df_finance['总分'].values.tolist()
bar2 = (
    Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
    .add_xaxis(x_data)
    .add_yaxis(
        '',
        y_data,
        category_gap='30%',
        itemstyle_opts={
            'normal': {
                'shadowColor': 'rgba(0, 0, 0, .5)',
                'shadowBlur': 5,
                'shadowOffsetY': 2,
                'shadowOffsetX': 2,
                'borderColor': '#fff',
            }
        },
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
    )
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(
                rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
            ),
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(is_show=False),
        ),
        # Hidden value axis limited to 340-390.
        # NOTE(review): these bounds were chosen while the chart was plotting
        # a different frame; confirm they suit the 金融 scores.
        yaxis_opts=opts.AxisOpts(
            is_show=False,
            max_=390,
            min_=340,
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(is_show=False),
        ),
        title_opts=opts.TitleOpts(
            title='金融专业 TOP20 学校',
            pos_left='center',
            pos_top='4%',
            title_textstyle_opts=opts.TextStyleOpts(
                color='#fec44f', font_size=24, font_weight='bold'
            ),
        ),
        # Colour scale spans 350-390.
        visualmap_opts=opts.VisualMapOpts(
            is_show=False,
            max_=390,
            min_=350,
            range_color=range_colors,
        ),
    )
)
bar2.render_notebook()
# First 20 rows of the 2020 data whose major name contains '材料科学与工程',
# ordered by 总分 descending.
df_materials = df_2020[df_2020['专业名称'].str.contains('材料科学与工程')]
df_materials = df_materials.sort_values(by='总分', ascending=False).head(20)
x_data = df_materials['学校名称'].tolist()
y_data = df_materials['总分'].tolist()

# The chart is configured with separate statements on one Bar object.
bar2 = Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
bar2.add_xaxis(x_data)
bar2.add_yaxis(
    '',
    y_data,
    category_gap='30%',
    itemstyle_opts={
        'normal': {
            'shadowColor': 'rgba(0, 0, 0, .5)',
            'shadowBlur': 5,
            'shadowOffsetY': 2,
            'shadowOffsetX': 2,
            'borderColor': '#fff',
        }
    },
)
bar2.set_series_opts(
    label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
)
bar2.set_global_opts(
    xaxis_opts=opts.AxisOpts(
        axislabel_opts=opts.LabelOpts(
            rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
        ),
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    # Hidden value axis limited to 290-340.
    yaxis_opts=opts.AxisOpts(
        is_show=False,
        max_=340,
        min_=290,
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    title_opts=opts.TitleOpts(
        title='材料科学与工程专业 TOP20 学校',
        pos_left='center',
        pos_top='4%',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#fec44f', font_size=24, font_weight='bold'
        ),
    ),
    # Colour scale spans the same 290-340 range as the axis.
    visualmap_opts=opts.VisualMapOpts(
        is_show=False,
        max_=340,
        min_=290,
        range_color=range_colors,
    ),
)
bar2.render_notebook()
# First 20 rows of the 2020 data whose major name contains '机械工程', ordered
# by 总分 descending.
df_mechanical = df_2020[df_2020['专业名称'].str.contains('机械工程')]
df_mechanical = df_mechanical.sort_values(by='总分', ascending=False).head(20)
x_data = df_mechanical['学校名称'].tolist()
y_data = df_mechanical['总分'].tolist()

# The chart is configured with separate statements on one Bar object.
bar2 = Bar(init_opts=opts.InitOpts(theme='chalk', width='1000px', height='800px'))
bar2.add_xaxis(x_data)
bar2.add_yaxis(
    '',
    y_data,
    category_gap='30%',
    itemstyle_opts={
        'normal': {
            'shadowColor': 'rgba(0, 0, 0, .5)',
            'shadowBlur': 5,
            'shadowOffsetY': 2,
            'shadowOffsetX': 2,
            'borderColor': '#fff',
        }
    },
)
bar2.set_series_opts(
    label_opts=opts.LabelOpts(font_weight='bold', font_size=12, color='#66d9e8')
)
bar2.set_global_opts(
    xaxis_opts=opts.AxisOpts(
        axislabel_opts=opts.LabelOpts(
            rotate=-30, font_size=12, font_weight='bold', color="#41b6c4", margin=10
        ),
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    # Hidden value axis limited to 290-360.
    yaxis_opts=opts.AxisOpts(
        is_show=False,
        max_=360,
        min_=290,
        axisline_opts=opts.AxisLineOpts(is_show=False),
        axistick_opts=opts.AxisTickOpts(is_show=False),
    ),
    title_opts=opts.TitleOpts(
        title='机械工程专业 TOP20 学校',
        pos_left='center',
        pos_top='4%',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#fec44f', font_size=24, font_weight='bold'
        ),
    ),
    # Colour scale spans the same 290-360 range as the axis.
    visualmap_opts=opts.VisualMapOpts(
        is_show=False,
        max_=360,
        min_=290,
        range_color=range_colors,
    ),
)
bar2.render_notebook()
# Join the 学校名称 value of every 2020 row with spaces, render the text as a
# style cloud using the 'fas fa-graduation-cap' icon, write it to
# 学校名称.png, then open the saved image.
schools = df_2020['学校名称'].tolist()
pic_name = '学校名称.png'
stylecloud.gen_stylecloud(
    text=' '.join(schools),
    icon_name='fas fa-graduation-cap',
    font_path=r'/home/mw/input/202201106373/STXINWEI.TTF',
    palette='cartocolors.qualitative.Bold_5',
    background_color='#212529',
    max_font_size=100,
    output_name=pic_name,
)
Image.open(pic_name)
# Join the 专业名称 value of every 2020 row with spaces, render the text as a
# style cloud using the 'fas fa-book-open' icon, write it to 专业名称.png,
# then open the saved image.
major = df_2020['专业名称'].tolist()
pic_name = '专业名称.png'
stylecloud.gen_stylecloud(
    text=' '.join(major),
    icon_name='fas fa-book-open',
    font_path=r'/home/mw/input/202201106373/STXINWEI.TTF',
    palette='cartocolors.qualitative.Bold_5',
    background_color='#212529',
    max_font_size=100,
    output_name=pic_name,
)
Image.open(pic_name)