• json 数据展平pd.json_normalize


    pandas中有一个牛逼的内置功能叫 .json_normalize

    pandas的文档中提到:将半结构化JSON数据规范化为平面表。

    1. def _json_normalize(
    2. data: Union[Dict, List[Dict]],
    3. record_path: Optional[Union[str, List]] = None,
    4. meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    5. meta_prefix: Optional[str] = None,
    6. record_prefix: Optional[str] = None,
    7. errors: str = "raise",
    8. sep: str = ".",
    9. max_level: Optional[int] = None,
    10. ) -> "DataFrame":
    11. """
    12. Normalize semi-structured JSON data into a flat table.
    13. Parameters
    14. ----------
    15. data : dict or list of dicts
    16. Unserialized JSON objects.
    17. record_path : str or list of str, default None
    18. Path in each object to list of records. If not passed, data will be
    19. assumed to be an array of records.
    20. meta : list of paths (str or list of str), default None
    21. Fields to use as metadata for each record in resulting table.
    22. meta_prefix : str, default None
    23. If True, prefix records with dotted (?) path, e.g. foo.bar.field if
    24. meta is ['foo', 'bar'].
    25. record_prefix : str, default None
    26. If True, prefix records with dotted (?) path, e.g. foo.bar.field if
    27. path to records is ['foo', 'bar'].
    28. errors : {'raise', 'ignore'}, default 'raise'
    29. Configures error handling.
    30. * 'ignore' : will ignore KeyError if keys listed in meta are not
    31. always present.
    32. * 'raise' : will raise KeyError if keys listed in meta are not
    33. always present.
    34. sep : str, default '.'
    35. Nested records will generate names separated by sep.
    36. e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    37. max_level : int, default None
    38. Max number of levels(depth of dict) to normalize.
    39. if None, normalizes all levels.
    40. .. versionadded:: 0.25.0
    41. Returns
    42. -------
    43. frame : DataFrame
    44. Normalize semi-structured JSON data into a flat table.
    45. Examples
    46. --------
    47. >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    48. ... {'name': {'given': 'Mose', 'family': 'Regner'}},
    49. ... {'id': 2, 'name': 'Faye Raker'}]
    50. >>> pd.json_normalize(data)
    51. id name.first name.last name.given name.family name
    52. 0 1.0 Coleen Volk NaN NaN NaN
    53. 1 NaN NaN NaN Mose Regner NaN
    54. 2 2.0 NaN NaN NaN NaN Faye Raker
    55. >>> data = [{'id': 1,
    56. ... 'name': "Cole Volk",
    57. ... 'fitness': {'height': 130, 'weight': 60}},
    58. ... {'name': "Mose Reg",
    59. ... 'fitness': {'height': 130, 'weight': 60}},
    60. ... {'id': 2, 'name': 'Faye Raker',
    61. ... 'fitness': {'height': 130, 'weight': 60}}]
    62. >>> pd.json_normalize(data, max_level=0)
    63. id name fitness
    64. 0 1.0 Cole Volk {'height': 130, 'weight': 60}
    65. 1 NaN Mose Reg {'height': 130, 'weight': 60}
    66. 2 2.0 Faye Raker {'height': 130, 'weight': 60}
    67. Normalizes nested data up to level 1.
    68. >>> data = [{'id': 1,
    69. ... 'name': "Cole Volk",
    70. ... 'fitness': {'height': 130, 'weight': 60}},
    71. ... {'name': "Mose Reg",
    72. ... 'fitness': {'height': 130, 'weight': 60}},
    73. ... {'id': 2, 'name': 'Faye Raker',
    74. ... 'fitness': {'height': 130, 'weight': 60}}]
    75. >>> pd.json_normalize(data, max_level=1)
    76. id name fitness.height fitness.weight
    77. 0 1.0 Cole Volk 130 60
    78. 1 NaN Mose Reg 130 60
    79. 2 2.0 Faye Raker 130 60
    80. >>> data = [{'state': 'Florida',
    81. ... 'shortname': 'FL',
    82. ... 'info': {'governor': 'Rick Scott'},
    83. ... 'counties': [{'name': 'Dade', 'population': 12345},
    84. ... {'name': 'Broward', 'population': 40000},
    85. ... {'name': 'Palm Beach', 'population': 60000}]},
    86. ... {'state': 'Ohio',
    87. ... 'shortname': 'OH',
    88. ... 'info': {'governor': 'John Kasich'},
    89. ... 'counties': [{'name': 'Summit', 'population': 1234},
    90. ... {'name': 'Cuyahoga', 'population': 1337}]}]
    91. >>> result = pd.json_normalize(data, 'counties', ['state', 'shortname',
    92. ... ['info', 'governor']])
    93. >>> result
    94. name population state shortname info.governor
    95. 0 Dade 12345 Florida FL Rick Scott
    96. 1 Broward 40000 Florida FL Rick Scott
    97. 2 Palm Beach 60000 Florida FL Rick Scott
    98. 3 Summit 1234 Ohio OH John Kasich
    99. 4 Cuyahoga 1337 Ohio OH John Kasich
    100. >>> data = {'A': [1, 2]}
    101. >>> pd.json_normalize(data, 'A', record_prefix='Prefix.')
    102. Prefix.0
    103. 0 1
    104. 1 2
    105. Returns normalized data with columns prefixed with the given string.
    106. """
    107. def _pull_field(
    108. js: Dict[str, Any], spec: Union[List, str]
    109. ) -> Union[Scalar, Iterable]:
    110. """Internal function to pull field"""
    111. result = js
    112. if isinstance(spec, list):
    113. for field in spec:
    114. result = result[field]
    115. else:
    116. result = result[spec]
    117. return result
    118. def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:
    119. """
    120. Internal function to pull field for records, and similar to
    121. _pull_field, but require to return list. And will raise error
    122. if has non iterable value.
    123. """
    124. result = _pull_field(js, spec)
    125. # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
    126. # null, otherwise return an empty list
    127. if not isinstance(result, list):
    128. if pd.isnull(result):
    129. result = []
    130. else:
    131. raise TypeError(
    132. f"{js} has non list value {result} for path {spec}. "
    133. "Must be list or null."
    134. )
    135. return result
    136. if isinstance(data, list) and not data:
    137. return DataFrame()
    138. # A bit of a hackjob
    139. if isinstance(data, dict):
    140. data = [data]
    141. if record_path is None:
    142. if any([isinstance(x, dict) for x in y.values()] for y in data):
    143. # naive normalization, this is idempotent for flat records
    144. # and potentially will inflate the data considerably for
    145. # deeply nested structures:
    146. # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
    147. #
    148. # TODO: handle record value which are lists, at least error
    149. # reasonably
    150. data = nested_to_record(data, sep=sep, max_level=max_level)
    151. return DataFrame(data)
    152. elif not isinstance(record_path, list):
    153. record_path = [record_path]
    154. if meta is None:
    155. meta = []
    156. elif not isinstance(meta, list):
    157. meta = [meta]
    158. _meta = [m if isinstance(m, list) else [m] for m in meta]
    159. # Disastrously inefficient for now
    160. records: List = []
    161. lengths = []
    162. meta_vals: DefaultDict = defaultdict(list)
    163. meta_keys = [sep.join(val) for val in _meta]
    164. def _recursive_extract(data, path, seen_meta, level=0):
    165. if isinstance(data, dict):
    166. data = [data]
    167. if len(path) > 1:
    168. for obj in data:
    169. for val, key in zip(_meta, meta_keys):
    170. if level + 1 == len(val):
    171. seen_meta[key] = _pull_field(obj, val[-1])
    172. _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
    173. else:
    174. for obj in data:
    175. recs = _pull_records(obj, path[0])
    176. recs = [
    177. nested_to_record(r, sep=sep, max_level=max_level)
    178. if isinstance(r, dict)
    179. else r
    180. for r in recs
    181. ]
    182. # For repeating the metadata later
    183. lengths.append(len(recs))
    184. for val, key in zip(_meta, meta_keys):
    185. if level + 1 > len(val):
    186. meta_val = seen_meta[key]
    187. else:
    188. try:
    189. meta_val = _pull_field(obj, val[level:])
    190. except KeyError as e:
    191. if errors == "ignore":
    192. meta_val = np.nan
    193. else:
    194. raise KeyError(
    195. "Try running with errors='ignore' as key "
    196. f"{e} is not always present"
    197. ) from e
    198. meta_vals[key].append(meta_val)
    199. records.extend(recs)
    200. _recursive_extract(data, record_path, {}, level=0)
    201. result = DataFrame(records)
    202. if record_prefix is not None:
    203. result = result.rename(columns=lambda x: f"{record_prefix}{x}")
    204. # Data types, a problem
    205. for k, v in meta_vals.items():
    206. if meta_prefix is not None:
    207. k = meta_prefix + k
    208. if k in result:
    209. raise ValueError(
    210. f"Conflicting metadata name {k}, need distinguishing prefix "
    211. )
    212. result[k] = np.array(v, dtype=object).repeat(lengths)
    213. return result

    json_normalize()函数参数讲解

    参数名解释
    data未解析的Json对象,也可以是Json列表对象
    record_path列表或字符串,如果Json对象中的嵌套列表未在此设置,则完成解析后会直接将其整个列表存储到一列中展示
    meta：Json对象中用作元数据的键(key),存在多层数据时也可以进行嵌套标记
    meta_prefix键的前缀
    record_prefix嵌套列表的前缀
    errors错误信息,可设置为ignore,表示如果key不存在则忽略错误,也可设置为raise,表示如果key不存在则报错进行提示。默认值为raise
    sep多层key之间的分隔符,默认值是.(一个点)
    max_level解析Json对象的最大层级数,适用于有多层嵌套的Json对象

    在进行代码演示前先导入相应依赖库,未安装pandas库的请自行安装(此代码在Jupyter Notebook环境中运行)。

    1. from pandas import json_normalize
    2. import pandas as pd

    1. 解析一个最基本的Json

    a. 解析一般Json对象

    1. a_dict = {
    2. 'school': 'ABC primary school',
    3. 'location': 'London',
    4. 'ranking': 2
    5. }
    6. pd.json_normalize(a_dict)

    输出结果为:

    image-20210809004551266

    b. 解析一个Json对象列表

    1. json_list = [
    2. {'class': 'Year 1', 'student number': 20, 'room': 'Yellow'},
    3. {'class': 'Year 2', 'student number': 25, 'room': 'Blue'}
    4. ]
    5. pd.json_normalize(json_list)

    输出结果为:

    image-20210809004918648

    2. 解析一个带有多层数据的Json

    a. 解析一个有多层数据的Json对象

    1. json_obj = {
    2. 'school': 'ABC primary school',
    3. 'location': 'London',
    4. 'ranking': 2,
    5. 'info': {
    6. 'president': 'John Kasich',
    7. 'contacts': {
    8. 'email': {
    9. 'admission': 'admission@abc.com',
    10. 'general': 'info@abc.com'
    11. },
    12. 'tel': '123456789',
    13. }
    14. }
    15. }
    16. pd.json_normalize(json_obj)

    输出结果为:
    在这里插入图片描述

    多层key之间使用点隔开,展示了所有的数据,这已经解析了3层,上述写法和pd.json_normalize(json_obj, max_level=3)等价。

    如果设置max_level=1,则输出结果为下图所示,contacts部分的数据汇集成了一列
    image-20210809010458109

    如果设置max_level=2,则输出结果为下图所示,contacts 下的email部分的数据汇集成了一列
    image-20210809010630386

    b. 解析一个有多层数据的Json对象列表

    1. json_list = [
    2. {
    3. 'class': 'Year 1',
    4. 'student count': 20,
    5. 'room': 'Yellow',
    6. 'info': {
    7. 'teachers': {
    8. 'math': 'Rick Scott',
    9. 'physics': 'Elon Mask'
    10. }
    11. }
    12. },
    13. {
    14. 'class': 'Year 2',
    15. 'student count': 25,
    16. 'room': 'Blue',
    17. 'info': {
    18. 'teachers': {
    19. 'math': 'Alan Turing',
    20. 'physics': 'Albert Einstein'
    21. }
    22. }
    23. }
    24. ]
    25. pd.json_normalize(json_list)

    输出结果为:

    image-20210809010907715

    若分别将max_level设置为2和3,则输出结果应分别是什么?请自行尝试~

    3. 解析一个带有嵌套列表的Json

    1. json_obj = {
    2. 'school': 'ABC primary school',
    3. 'location': 'London',
    4. 'ranking': 2,
    5. 'info': {
    6. 'president': 'John Kasich',
    7. 'contacts': {
    8. 'email': {
    9. 'admission': 'admission@abc.com',
    10. 'general': 'info@abc.com'
    11. },
    12. 'tel': '123456789',
    13. }
    14. },
    15. 'students': [
    16. {'name': 'Tom'},
    17. {'name': 'James'},
    18. {'name': 'Jacqueline'}
    19. ],
    20. }
    21. pd.json_normalize(json_obj)

    此例中students键对应的值是一个列表,使用[]括起来。直接采用上述的方法进行解析,则得到的结果如下:

    image-20210809011425013

    students部分的数据并未被成功解析,此时只需为record_path设置值,调用方式为pd.json_normalize(json_obj, record_path='students'),在此调用方式下,得到的结果只包含了name部分的数据。

    image-20210809011803409

    若要增加其他字段的信息,则需为meta参数赋值,例如下述调用方式下,得到的结果如下:

    pd.json_normalize(json_obj, record_path='students', meta=['school', 'location', ['info', 'contacts', 'tel'], ['info', 'contacts', 'email', 'general']])
    

    image-20210809012221592

    4. 当Key不存在时如何忽略系统报错

    1. data = [
    2. {
    3. 'class': 'Year 1',
    4. 'student count': 20,
    5. 'room': 'Yellow',
    6. 'info': {
    7. 'teachers': {
    8. 'math': 'Rick Scott',
    9. 'physics': 'Elon Mask',
    10. }
    11. },
    12. 'students': [
    13. { 'name': 'Tom', 'sex': 'M' },
    14. { 'name': 'James', 'sex': 'M' },
    15. ]
    16. },
    17. {
    18. 'class': 'Year 2',
    19. 'student count': 25,
    20. 'room': 'Blue',
    21. 'info': {
    22. 'teachers': {
    23. # no math teacher
    24. 'physics': 'Albert Einstein'
    25. }
    26. },
    27. 'students': [
    28. { 'name': 'Tony', 'sex': 'M' },
    29. { 'name': 'Jacqueline', 'sex': 'F' },
    30. ]
    31. },
    32. ]
    33. pd.json_normalize(
    34. data,
    35. record_path =['students'],
    36. meta=['class', 'room', ['info', 'teachers', 'math']]
    37. )

    class等于Year 2的Json对象中,teachers下的math键不存在,直接运行上述代码会报以下错误,提示math键并不总是存在,且给出了相应建议:Try running with errors='ignore'

    image-20210809013031010

    添加errors条件后,重新运行得出的结果如下图所示,没有math键的部分使用NaN进行了填补。

    1. pd.json_normalize(
    2. data,
    3. record_path =['students'],
    4. meta=['class', 'room', ['info', 'teachers', 'math']],
    5. errors='ignore'
    6. )

    在这里插入图片描述

    5. 使用sep参数为嵌套Json的Key设置分隔符

    2.a的案例中,可以注意到输出结果的具有多层key的数据列标题是采用.对多层key进行分隔的,可以为sep赋值以更改分隔符。

    1. json_obj = {
    2. 'school': 'ABC primary school',
    3. 'location': 'London',
    4. 'ranking': 2,
    5. 'info': {
    6. 'president': 'John Kasich',
    7. 'contacts': {
    8. 'email': {
    9. 'admission': 'admission@abc.com',
    10. 'general': 'info@abc.com'
    11. },
    12. 'tel': '123456789',
    13. }
    14. }
    15. }
    16. pd.json_normalize(json_obj, sep='->')

    输出结果为:

    image-20210809013826422

    6. 为嵌套列表数据和元数据添加前缀

    第3例的输出结果中,各列名均无前缀,例如name这一列不知是元数据解析得到的数据,还是通过students嵌套列表得出的数据,因此为record_prefix和meta_prefix参数分别赋值,即可为输出结果添加相应前缀。

    1. json_obj = {
    2. 'school': 'ABC primary school',
    3. 'location': 'London',
    4. 'ranking': 2,
    5. 'info': {
    6. 'president': 'John Kasich',
    7. 'contacts': {
    8. 'email': {
    9. 'admission': 'admission@abc.com',
    10. 'general': 'info@abc.com'
    11. },
    12. 'tel': '123456789',
    13. }
    14. },
    15. 'students': [
    16. {'name': 'Tom'},
    17. {'name': 'James'},
    18. {'name': 'Jacqueline'}
    19. ],
    20. }
    21. pd.json_normalize(json_obj, record_path='students',
    22. meta=['school', 'location', ['info', 'contacts', 'tel'], ['info', 'contacts', 'email', 'general']],
    23. record_prefix='students->',
    24. meta_prefix='meta->',
    25. sep='->')

    本例中,为嵌套列表数据添加students->前缀,为元数据添加meta->前缀,将嵌套key之间的分隔符修改为->,输出结果为:

    image-20210809014638173

    7. 通过URL获取Json数据并进行解析

    通过URL获取数据需要用到requests库,请自行安装相应库。

    1. import requests
    2. from pandas import json_normalize
    3. # 通过天气API,获取深圳近7天的天气
    4. url = 'https://tianqiapi.com/free/week'
    5. # 传入url,并设定好相应的params
    6. r = requests.get(url, params={"appid":"59257444", "appsecret":"uULlTGV9 ", 'city':'深圳'})
    7. # 将获取到的值转换为json对象
    8. result = r.json()
    9. df = json_normalize(result, meta=['city', 'cityid', 'update_time'], record_path=['data'])
    10. df

    result的结果如下所示,其中data为一个嵌套列表:

    1. {'cityid': '101280601',
    2. 'city': '深圳',
    3. 'update_time': '2021-08-09 06:39:49',
    4. 'data': [{'date': '2021-08-09',
    5. 'wea': '中雨转雷阵雨',
    6. 'wea_img': 'yu',
    7. 'tem_day': '32',
    8. 'tem_night': '26',
    9. 'win': '无持续风向',
    10. 'win_speed': '<3级'},
    11. {'date': '2021-08-10',
    12. 'wea': '雷阵雨',
    13. 'wea_img': 'yu',
    14. 'tem_day': '32',
    15. 'tem_night': '27',
    16. 'win': '无持续风向',
    17. 'win_speed': '<3级'},
    18. {'date': '2021-08-11',
    19. 'wea': '雷阵雨',
    20. 'wea_img': 'yu',
    21. 'tem_day': '31',
    22. 'tem_night': '27',
    23. 'win': '无持续风向',
    24. 'win_speed': '<3级'},
    25. {'date': '2021-08-12',
    26. 'wea': '多云',
    27. 'wea_img': 'yun',
    28. 'tem_day': '33',
    29. 'tem_night': '27',
    30. 'win': '无持续风向',
    31. 'win_speed': '<3级'},
    32. {'date': '2021-08-13',
    33. 'wea': '多云',
    34. 'wea_img': 'yun',
    35. 'tem_day': '33',
    36. 'tem_night': '27',
    37. 'win': '无持续风向',
    38. 'win_speed': '<3级'},
    39. {'date': '2021-08-14',
    40. 'wea': '多云',
    41. 'wea_img': 'yun',
    42. 'tem_day': '32',
    43. 'tem_night': '27',
    44. 'win': '无持续风向',
    45. 'win_speed': '<3级'},
    46. {'date': '2021-08-15',
    47. 'wea': '多云',
    48. 'wea_img': 'yun',
    49. 'tem_day': '32',
    50. 'tem_night': '27',
    51. 'win': '无持续风向',
    52. 'win_speed': '<3级'}]}

    解析后的输出结果为:

    image-20210809024545188

    8. 探究:解析带有多个嵌套列表的Json

    当一个Json对象或对象列表中有超过一个嵌套列表时,record_path无法将所有的嵌套列表包含进去,因为它只能接收一个key值。此时,我们需要先根据多个嵌套列表的key将Json解析成多个DataFrame,再将这些DataFrame根据实际关联条件拼接起来,并去除重复值。

    1. json_obj = {
    2. 'school': 'ABC primary school',
    3. 'location': 'shenzhen',
    4. 'ranking': 2,
    5. 'info': {
    6. 'president': 'John Kasich',
    7. 'contacts': {
    8. 'email': {
    9. 'admission': 'admission@abc.com',
    10. 'general': 'info@abc.com'
    11. },
    12. 'tel': '123456789',
    13. }
    14. },
    15. 'students': [
    16. {'name': 'Tom'},
    17. {'name': 'James'},
    18. {'name': 'Jacqueline'}
    19. ],
    20. # 添加university嵌套列表,加上students,该JSON对象中就有2个嵌套列表了
    21. 'university': [
    22. {'university_name': 'HongKong university shenzhen'},
    23. {'university_name': 'zhongshan university shenzhen'},
    24. {'university_name': 'shenzhen university'}
    25. ],
    26. }
    27. # 尝试在record_path中写上两个嵌套列表的名字,即record_path = ['students', 'university'],结果无济于事
    28. # 于是决定分两次进行解析,分别将record_path设置成为university和students,最终将2个结果合并起来
    29. df1 = pd.json_normalize(json_obj, record_path=['university'],
    30. meta=['school', 'location', ['info', 'contacts', 'tel'],
    31. ['info', 'contacts', 'email', 'general']],
    32. record_prefix='university->',
    33. meta_prefix='meta->',
    34. sep='->')
    35. df2 = pd.json_normalize(json_obj, record_path=['students'],
    36. meta=['school', 'location', ['info', 'contacts', 'tel'],
    37. ['info', 'contacts', 'email', 'general']],
    38. record_prefix='students->',
    39. meta_prefix='meta->',
    40. sep='->')
    41. # 将两个结果根据index关联起来并去除重复列
    42. df1.merge(df2, how='left', left_index=True, right_index=True, suffixes=['->', '->']).T.drop_duplicates().T

    输出结果为:

    image-20210809023633761

    图中红框标出来的部分为Json对象中所对应的两个嵌套列表。

    总结

    json_normalize()方法异常强大,几乎涵盖了所有解析JSON的场景,涉及到一些更复杂场景时,可以基于已有的功能进行发散整合,正如8. 探究中遇到的问题一样。

    拥有了这个强大的Json解析库,以后再也不怕遇到复杂的Json数据了!

    参考:

    你必须知道的Pandas 解析json数据的函数-json_normalize()_SageCat郭帅强的博客-CSDN博客_json_normalize写了好几篇文章了,今天写点很少人写但是很有用的!记得点赞收藏加关注哦。前言:Json数据介绍Json是一个应用及其广泛的用来传输和交换数据的格式,它被应用在数据库中,也被用于API请求结果数据集中。虽然它应用广泛,机器很容易阅读且节省空间,但是却不利于人来阅读和进一步做数据分析,因此通常情况下需要在获取json数据后,将其转化为表格格式的数据,以方便人来阅读和理解。常见的Json数据格式有2种,均以键值对的形式存储数据,只是包装数据的方法有所差异:a. 一般JSON对象采用{}将键值对数据括起来.https://blog.csdn.net/Roach007/article/details/119529772

  • 相关阅读:
    Mac配置JDK安装Jmeter
    【离线/并查集】CF1213 G
    公元是什么意思,公历和农历
    HTML学生个人网站作业设计成品 HTML+CSS肖战明星人物介绍网页 web结课作业的源码
    基于stm3210系列的简单DMA通信
    【计算轨迹误差】
    C#练习题16和17
    Nginx学习笔记09——URLRewrite伪静态
    【k8s】3、kubeadm安装k8s集群
    软考整体管理思维导图
  • 原文地址:https://blog.csdn.net/xiao_yi_xiao/article/details/125558952