• 数据库:Hive转Presto(五)


    此篇将所有代码都补充完了,之前发现有的代码写错了,以这篇为准,以下为完整代码,如果发现我有什么考虑不周的地方,可以评论提建议,感谢。代码是想哪写哪,可能比较繁琐,还需要优化。

    1. import re
    2. import os
    3. import tkinter.filedialog
    4. from tkinter import *
    5. class Hive2Presto:
    6. def __int__(self):
    7. self.t_funcs = ['substr', 'nvl', 'substring', 'unix_timestamp'] + \
    8. ['to_date', 'concat', 'sum', 'avg', 'abs', 'year', 'month', 'ceiling', 'floor']
    9. self.time_funcs = ['date_add', 'datediff', 'add_months', 'date_sub']
    10. self.funcs = self.t_funcs + self.time_funcs
    11. self.current_path = os.path.abspath(__file__)
    12. self.dir = os.path.dirname(self.current_path)
    13. self.result = []
    14. self.error = []
    15. self.filename = ''
    16. def main(self):
    17. self.root = Tk()
    18. self.root.config(bg='#ff741d') # 背景颜色设置为公司主题色^_^
    19. self.root.title('Hive转Presto')
    20. self.win_width = 550
    21. self.win_height = 500
    22. self.screen_width = self.root.winfo_screenwidth()
    23. self.screen_height = self.root.winfo_screenheight()
    24. self.x = (self.screen_width - self.win_width) // 2
    25. self.y = (self.screen_height - self.win_height) // 2
    26. self.root.geometry(f'{self.win_width}x{self.win_height}+{self.x}+{self.y}')
    27. font = ('楷体', 11)
    28. self.button = Button(self.root, text='转换', command=self.trans, bg='#ffcc8c', font=font, anchor='e')
    29. self.button.grid(row=0, column=0, padx=100, pady=10, sticky=W)
    30. self.file_button = Button(self.root, text='选择文件', command=self.choose_file, bg='#ffcc8c', font=font,
    31. anchor='e')
    32. self.file_button.grid(row=0, column=1, padx=0, pady=10, sticky=W)
    33. self.entry = Entry(self.root, width=65, font=font)
    34. self.entry.insert(0, '输入Hive代码')
    35. self.entry.grid(row=1, column=0, padx=10, pady=10, columnspan=2)
    36. self.entry.bind('', self.delete_text)
    37. self.text = Text(self.root, width=75, height=20)
    38. self.text.grid(row=2, column=0, padx=10, pady=10, columnspan=2)
    39. self.des_label = Label(self.root, text='可以复制结果,也有生成的文件,与选取的文件同文件夹', bg='#ffcc8c',
    40. font=('楷体', 10))
    41. self.des_label.grid(row=3, column=0, padx=10, pady=10, columnspan=2)
    42. s = ''
    43. for i in range(0, (n := len(self.funcs)), 4):
    44. if i + 4 <= n:
    45. s += ','.join(self.funcs[i:i + 4]) + '\n'
    46. else:
    47. s += ','.join(self.funcs[i:]) + '\n'
    48. s = s[:-1]
    49. self.des_label1 = Label(self.root, text=s, bg='#ffcc8c',
    50. font=('楷体', 10))
    51. self.des_label1.grid(row=4, column=0, padx=10, pady=10, columnspan=2)
    52. self.root.columnconfigure(0, minsize=10)
    53. self.root.columnconfigure(1, minsize=10)
    54. self.root.columnconfigure(0, pad=5)
    55. self.root.mainloop()
    56. def replace_func(self, s, res):
    57. """
    58. 把搜索到函数整体取出来,处理括号中的参数
    59. :param s:
    60. :param res:
    61. :return:
    62. """
    63. for f in res:
    64. f1 = f.replace('\n', '').strip()
    65. f1 = re.sub(r'(\(s*)', '(', f1)
    66. # 搜索括号里的字符串
    67. if re.findall(r'(\w+)\(', f1):
    68. func_name = re.findall(r'(\w+)\(', f1)[0].strip()
    69. else:
    70. continue
    71. try:
    72. if 'date_add' == func_name.lower():
    73. date, date_num = self.extact_func(f1, func_name)
    74. s_n = f"date_add('day',{date_num},cast(substr(cast{date} as varchar,1,10) as date))"
    75. s = s.replace(f, s_n)
    76. elif 'datediff' == func_name.lower():
    77. date1, date2 = self.extact_func(f1, func_name)
    78. s_n = f"date_add('day',{date2},cast(substr(cast{date1} as varchar,1,10) as date),cast(substr(cast{date1} as varchar),1,10) as date))"
    79. s = s.replace(f, s_n)
    80. elif 'nvl' == func_name.lower():
    81. s1, s2 = self.extact_func(f1, func_name)
    82. s_n = f"coalesce({s1},{s2})"
    83. s = s.replace(f, s_n)
    84. elif 'substr' == func_name.lower():
    85. date, start, end = self.extact_func(f1, func_name)
    86. s_n = f"substr(cast({date} as varchar),{start},{end}"
    87. s = s.replace(f, s_n)
    88. elif 'substring' == func_name.lower():
    89. date, start, end = self.extact_func(f1, func_name)
    90. s_n = f"substring(cast({date} as varchar),{start},{end}"
    91. s = s.replace(f, s_n)
    92. elif 'unit_timestamp' == func_name.lower():
    93. date = self.extact_func(f1, func_name)[0]
    94. s_n = f"to_unixtime(cast({date} as timestanp))"
    95. s = s.replace(f, s_n)
    96. elif 'to_date' == func_name.lower():
    97. date = self.extact_func(f1, func_name)[0]
    98. s_n = f"cast({date} as date)"
    99. s = s.replace(f, s_n)
    100. elif 'concat' == func_name.lower():
    101. res = self.extact_func(f1, func_name)[0]
    102. s_n = f'concat('
    103. for r in res:
    104. r = r.strip().replace('\n', '')
    105. s_n += f"cast({r} as varchar),"
    106. s_n = s_n[:-1] + ')'
    107. s = s.replace(f, s_n)
    108. elif 'sum' == func_name.lower():
    109. if 'unix_timestamp' in f1 or 'to_unixtime' in f1:
    110. continue
    111. ss = self.extact_func(f1, func_name)[0]
    112. if 'if(' in ss.replace(' ', ''):
    113. continue
    114. s = self.func_trans(f, f1, func_name, ss, s)
    115. elif 'avg' == func_name.lower():
    116. if 'unix_timestamp' in f1 or 'to_unixtime' in f1:
    117. continue
    118. ss = self.extact_func(f1, func_name)[0]
    119. if 'if(' in ss.replace(' ', ''):
    120. continue
    121. s = self.func_trans(f, f1, func_name, ss, s)
    122. elif 'abs' == func_name.lower():
    123. if 'unix_timestamp' in f1 or 'to_unixtime' in f1:
    124. continue
    125. ss = self.extact_func(f1, func_name)[0]
    126. if 'if(' in ss.replace(' ', ''):
    127. continue
    128. s = self.func_trans(f, f1, func_name, ss, s)
    129. elif 'ceiling' == func_name.lower():
    130. if 'unix_timestamp' in f1 or 'to_unixtime' in f1:
    131. continue
    132. ss = self.extact_func(f1, func_name)[0]
    133. if 'if(' in ss.replace(' ', ''):
    134. continue
    135. s = self.func_trans(f, f1, func_name, ss, s)
    136. elif 'floor' == func_name.lower():
    137. if 'unix_timestamp' in f1 or 'to_unixtime' in f1:
    138. continue
    139. ss = self.extact_func(f1, func_name)[0]
    140. if 'if(' in ss.replace(' ', ''):
    141. continue
    142. s = self.func_trans(f, f1, func_name, ss, s)
    143. elif 'year' == func_name.lower():
    144. date = self.extact_func(f1, func_name)[0]
    145. s_n = f"year(cast(substr(cast({date} as varchar,1,10) as date))"
    146. s = s.replace(f, s_n)
    147. elif 'month' == func_name.lower():
    148. date = self.extact_func(f1, func_name)[0]
    149. s_n = f"month(cast(substr(cast({date} as varchar,1,10) as date))"
    150. s = s.replace(f, s_n)
    151. elif 'date_sub' == func_name.lower():
    152. date, date_num = self.extact_func(f1, func_name)
    153. s_n = f"date_add('day',-{date_num},cast(substr(cast{date} as varchar,1,10) as date))"
    154. s = s.replace(f, s_n)
    155. except:
    156. self.error.append(f"源代码中{func_name}函数参数输入可能有错误,具体为:{f1}")
    157. continue
    158. if self.error:
    159. self.entry.delete(0, END)
    160. self.text.delete("1.0", END)
    161. self.text.insert("end", f"{s}")
    162. self.error.insert(0, '转换失败,有部分没有转成功\n')
    163. root_ex = Tk()
    164. root_ex.title('错误')
    165. win_width = 600
    166. win_height = 200
    167. screen_width = root_ex.winfo_screenwidth()
    168. screen_height = root_ex.winfo_screenheight()
    169. x = (screen_width - win_width) // 2
    170. y = (screen_height - win_height) // 2
    171. root_ex.geometry(f'{win_width}x{win_height}+{x}+{y}')
    172. label_ex = Label(root_ex, text="\n".join(self.error), font=("楷体", 10))
    173. label_ex.pack()
    174. root_ex.mainloop()
    175. return s
    176. def func_trans(self, f, f1, func_name, ss, s):
    177. if not ('+' in ss or '-' in ss or '*' in ss or '/' in ss):
    178. date = self.extact_func(f1, func_name)[0]
    179. s_n = f'{func_name}(cast{date} as double))'
    180. s = s.replace(f, s_n)
    181. else:
    182. res1 = self.mysplit(f1)
    183. s_n = f
    184. n = len(s_n)
    185. for item in res1:
    186. if any(c.isalpha() for c in item.replace(' ', '')):
    187. idxs = s_n.find(item)
    188. idxs = [idxs] if type(idxs) != list else idxs
    189. for idx in idxs:
    190. if idx + len(item) + 3 <= n:
    191. if not 'as' in s_n[idx:idx + len(item) + 4]:
    192. s_n = re.sub(rf'\b{item}\b', f'cast({item} as double)', s_n)
    193. else:
    194. s_n = re.sub(rf'\b{item}\b', f'cast({item} as double)', s_n)
    195. s = s.replace(f, s_n)
    196. return s
    197. def choose_file(self):
    198. """
    199. 如果代码太多,从text中输入会很卡,直接选择代码文件输入会很快
    200. :return:
    201. """
    202. self.filename = tkinter.filedialog.askopenfilename()
    203. if '/' in self.filename:
    204. self.filename = self.filename.replace('/', '\\')
    205. self.entry.delete(0, END)
    206. self.entry.insert(0, self.filename)
    207. def findvar(self, ss):
    208. """
    209. 搜索与计算有关的字段
    210. :param ss:
    211. :return:
    212. """
    213. global r1
    214. b = ['+', '-', '*', '/', '=', '!=', '>', '<', '<=', '>=', '<>']
    215. result1 = []
    216. result2 = []
    217. result1_n = []
    218. result2_n = []
    219. res_ops = []
    220. res1_ops = []
    221. res_adj = []
    222. res1_adj = []
    223. for op in b:
    224. s_temp1 = ss.replace('\n', ' ')
    225. s_temp2 = ss.replace('\n', ' ')
    226. s_temp3 = ss.replace('\n', ' ')
    227. if op == '/' or op == '=':
    228. op = op
    229. elif op == '+' or op == '-' or op == '*' or op == '>' or op == '<':
    230. op = f'\\{op[0]}'
    231. else:
    232. op = f'\\{op[0]}\\{op[1]}'
    233. parttern = f'\s*-*\d+\s*{op}\s*\w+|' + f'\s*-*\d+\.\s*{op}\s*\w+\.\s*\w+|' \
    234. + f'\s*\w+\.\s*\w+\s*{op}\s*\w+\.\s*\w+|' + f'\s*\w+\s*{op}\s*\w+\.\s*\w+|' \
    235. + f'\s*\w+\.\s*\w+\s*{op}\s*\w+|' + f'\s*\w+\s*{op}\s*\w+'
    236. parttern1 = f'\s*\)+\s*{op}\s*\w+|' + f'\s*\)+\s*{op}\s*\w+\.\s*\w+|' \
    237. + f'\s*\w+\s*{op}\s*\(+|' + f'f\s*\w+\.\s*{op}\s*\(+'
    238. parttern2 = f'\s*\w+\s*{op}\s*\w+|' + f'\s*\w+\s*{op}\s*\w+\.\s*\w+|' \
    239. + f'\s*\w+\s*{op}\s*\w+|' + f'f\s*\w+\.\s*{op}\s*\w+'
    240. while True:
    241. res = re.findall(parttern, s_temp1)
    242. if not res:
    243. break
    244. result2.extend(res)
    245. for r in res:
    246. r1 = r.replace(' ', '').split(f'op')
    247. result1.append(r1)
    248. res_ops.append(f'{op}')
    249. res_adj.append(False)
    250. s_temp1 = s_temp1.replace(f'{r1[0]}', '')
    251. # 搜索带括号的计算
    252. if op == '+' or op == '-' or op == '*' or op == '/':
    253. while True:
    254. res = re.findall(parttern1, s_temp2)
    255. if not res:
    256. break
    257. result2.extend(res)
    258. for r in res:
    259. r1 = r.replace(' ', '').split(f'{op}')
    260. result1.append(r1)
    261. res_ops.append(f'{op}')
    262. res_adj.append(False)
    263. tem = r1[0] if r1[0].strip() not in ['(', ')'] else r1[1]
    264. s_temp2 = s_temp2.replace(f'{tem}', '')
    265. else:
    266. res = re.findall(parttern2, s_temp3)
    267. result2.extend(res)
    268. for r in res:
    269. r1 = r.replace(' ', '').split(f'{op}')
    270. result1.append(r1)
    271. res_ops.append(f'{op}')
    272. res_adj.append(True)
    273. str_ = re.findall(r'\'([^\']*)\'', ss)
    274. str_ = list(set(str_))
    275. str_ = [v.rstrip(' \n') for v in str_]
    276. for i, fun in enumerate(result1):
    277. flag = 0
    278. for item in fun:
    279. if any(item.strip() in v for v in str_) or any(item.strip() == v for v in self.t_funcs):
    280. break
    281. flag += 1
    282. if flag == 2 and result1[i] not in result1_n:
    283. result1_n.append(result1[i])
    284. result2_n.append(result2[i])
    285. res1_ops.append(res_ops[i])
    286. adj = result1[i][0] in self.time_funcs or result1[i][0] in self.time_funcs
    287. res1_adj.append(adj)
    288. if result1_n:
    289. z = zip(result1_n, result2_n, res1_ops, res1_adj)
    290. z1 = sorted(z, key=lambda x: len(x[1].replace(' ', '')), reverse=True)
    291. result1_n, result2_n, res1_ops, res1_adj = zip(*z1)
    292. return result1_n, result2_n, res1_ops, res1_adj
    293. def mysplit(self, s):
    294. """
    295. 分割字段
    296. :param s:
    297. :return:
    298. """
    299. s = s.strip().replace(')', '').replace('(', '')
    300. b = ['+', '-', '*', '/']
    301. res = [s]
    302. result = []
    303. for op in b:
    304. n_res = []
    305. for item in res:
    306. n_res.extend(item.split(op))
    307. res = n_res
    308. for item in res:
    309. if ' as ' not in item:
    310. result.append(re.findall(r'^[\w+_*]+$', item.replace(' ', ''))[0])
    311. result = list(set(res))
    312. return result
    313. def extact_func(self, s, func_name):
    314. res = []
    315. s = s[:-1].replace(f'{func_name}(', '', 1)
    316. com_idx = [i for i, v in enumerate(s) if v == ',']
    317. jd_com_idx = []
    318. for i in com_idx:
    319. s1 = s[0:i]
    320. if s1.count('(') == s1.count(')'):
    321. jd_com_idx.append(i)
    322. jd_com_idx.append(len(s))
    323. jd_com_idx.insert(0, -1)
    324. for i in range(1, len(jd_com_idx)):
    325. res.append(s[jd_com_idx[i - 1] + 1:jd_com_idx[i]])
    326. return res
    327. def sort_funcs(self, li):
    328. li = sorted(li, key=lambda x: x.count('('), reverse=True)
    329. li_n = []
    330. for l in li:
    331. li_n.append(l)
    332. return li_n
    333. def delete_text(self, event):
    334. self.entry.delete(0, END)
    335. self.filename = ''
    336. def trans(self):
    337. if self.filename:
    338. data = open(self.filename, 'r', encoding='utf-8').readlines()
    339. self.folder_path = os.path.dirname(self.filename)
    340. file_res = self.folder_path + r'\hive转presto_res.sql'
    341. os.startfile(f'{self.folder_path}')
    342. else:
    343. data = self.entry.get().split('\n')
    344. file_res = self.dir + r'\hive转presto_res.sql'
    345. data_n = []
    346. for s in data:
    347. if not s.rstrip(' \n'):
    348. continue
    349. if '”' in s:
    350. s = s.replace('“', '')
    351. if ',' in s:
    352. s = s.replace(',', ',')
    353. if '(' in s:
    354. s = s.replace('(', '(')
    355. if ')' in s:
    356. s = s.replace(')', ')')
    357. if (idx := s.find('--')) == -1:
    358. data_n.append(s + '\n')
    359. else:
    360. data_n.append(s[:idx] + '\n')
    361. data = ''.join(data_n)
    362. res1, res2, ops, adj = self.findvar(data)
    363. for i, ss in enumerate(res1):
    364. s_n = res2[i]
    365. s_n1 = res2[i]
    366. s_n2 = res2[i]
    367. s_t = res2[i]
    368. flag = 0
    369. for elem in ss:
    370. elem1 = elem.replace(' ', '')
    371. if any(c.isalpha() for c in elem1):
    372. if ops[i] in ['=', '!=', '>', '<', '<=', '>=', '<>']:
    373. if adj[i]:
    374. if elem1 not in self.time_funcs:
    375. s_n = re.sub(rf'\b{elem}\b', f'cast(substr({elem1},1,10) as date', s_n)
    376. continue
    377. else:
    378. continue
    379. if any(op in s_t for op in ['+', '-', '*', '/']):
    380. s_n = re.sub(rf'\b{elem}\b', f'cast({elem1} as double)', s_n)
    381. else:
    382. s_n = re.sub(rf'\b{elem}\b', f'cast({elem1} as varchar)', s_n)
    383. else:
    384. if elem.strip() not in ['(', ')']:
    385. s_n = re.sub(rf'\b{elem}\b', f'cast({elem1} as double)', s_n)
    386. flag += 1
    387. data = data.replace(res2[i].strip(), s_n)
    388. if flag == 2:
    389. if any(op in s_t for op in ['+', '-', '*', '/']):
    390. s_n1 = re.sub(rf'\b{ss[0]}\b', f'cast({ss[0]} as double)', s_n)
    391. s_n2 = re.sub(rf'\b{ss[1]}\b', f'cast({ss[1]} as double)', s_n)
    392. else:
    393. s_n1 = re.sub(rf'\b{ss[0]}\b', f'cast({ss[0]} as varchar)', s_n)
    394. s_n2 = re.sub(rf'\b{ss[1]}\b', f'cast({ss[1]} as varchar)', s_n)
    395. data = data.replace(s_n1, s_n)
    396. data = data.replace(s_n2, s_n)
    397. self.error = []
    398. self.result = []
    399. for func_name in self.funcs:
    400. r = [m.start() for m in re.finditer(func_name, data.lower())]
    401. for idx in r:
    402. n = 1
    403. while True:
    404. s = data[idx:idx + n]
    405. if (s.count(')') == s.count('(') and s.count(')') != 0) and idx + n > len(data):
    406. break
    407. n += 1
    408. if s not in self.result and s.rstrip(' \n')[len(func_name)] == '(':
    409. self.result.append(s)
    410. self.result = self.sort_funcs(self.result)
    411. res = self.replace_func(data, self.result)
    412. res_new = []
    413. for r in res.split('\n'):
    414. if r.rstrip(' \n'):
    415. res_new.append(r)
    416. res_new = '\n'.join(res_new)
    417. self.text.delete("1.0", END)
    418. self.text.insert("end", f'{res_new}')
    419. with open(file_res, 'w', encoding='utf-8') as f:
    420. f.write(res_new)
    421. if __name__ == '__main__':
    422. pro = Hive2Presto()
    423. pro.__int__()
    424. pro.main()

    效果如下所示:

  • 相关阅读:
    Numerical Calculation 数值计算
    MySQL约束之check
    Rethinking the Inception Architecture for Computer Vision--Christian Szegedy
    Ubuntu 20.04上安装Datalog Disassembly工具的过程
    python自我学习 二 05 下载图片链接
    Springboot 基本概念
    《Unity3D高级编程之进阶主程》第一章 C#要点技术(五) 排序算法
    关于Synchronized你了解多少?
    MySQL常用语句一章解读数据库的底层原理
    【app篇】写个简单的BLE调试app,练练手,同时为后续调试ESP32 BLE做个支持
  • 原文地址:https://blog.csdn.net/Sanfenpai6/article/details/133804941