讯飞大数据竞赛2022 汽车领域多语种迁移学习打卡博客

import  pandas as pd
import numpy as np
1
2

# 打开数据 （任务一）
1

# Load the Chinese training sheet and preview its first rows.
# NOTE(review): the file name is spelled "trian" here while the other
# training files use "train" — confirm it matches the file on disk.
train_zh= pd.read_excel("data/中文_trian.xlsx")
train_zh.head()
1
2

	原始文本	意图	槽值1	槽值2
0	16.5度	adjust_ac_temperature_to_number	offset:16.5	NaN
1	16度	adjust_ac_temperature_to_number	offset:16	NaN
2	16空调开到16度	adjust_ac_temperature_to_number	offset:16	NaN
3	16温度16度	adjust_ac_temperature_to_number	offset:16	NaN
4	17度	adjust_ac_temperature_to_number	offset:17	NaN

# Load the English training sheet and preview its first rows. The preview
# shows a Chinese-translation column alongside the raw text, the intent
# label and two slot-value columns.
train_en = pd.read_excel("data/英文_train.xlsx")
train_en.head()
1
2

	原始文本	中文翻译	意图	槽值1	槽值2
0	open aircon please	请打开空调	open_ac	NaN	NaN
1	I want to activate the AC	我想打开空调	open_ac	NaN	NaN
2	I want to turn on the air conditioner	我想打开空调	open_ac	NaN	NaN
3	switch on the AC please	请打开空调	open_ac	NaN	NaN
4	Help me open the AC	帮我打开空调	open_ac	NaN	NaN

# Load the Japanese training sheet and preview its first rows. The preview
# shows the same columns as the English sheet: raw text, Chinese
# translation, intent label and two slot-value columns.
train_ja = pd.read_excel("data/日语_train.xlsx")
train_ja.head()
1
2

	原始文本	中文翻译	意图	槽值1	槽值2
0	エアコンのスイッチONに	打开空调开关	open_ac	NaN	NaN
1	エアコン入れる	打开空调	open_ac	NaN	NaN
2	エアコンのスイッチを	打开空调开关	open_ac	NaN	NaN
3	エアコンのスイッチ入れる	打开空调开关	open_ac	NaN	NaN
4	エアコンのスイッチON	打开空调开关	open_ac	NaN	NaN

# Read both test sheets in one pass over the workbook: passing a list to
# sheet_name returns a dict of DataFrames keyed by sheet name, so the file
# is opened and parsed once instead of once per sheet.
test_sheets = pd.read_excel(
    "data/testA.xlsx",
    sheet_name=["日语_testA", "英文_testA"],
)
test_ja = test_sheets["日语_testA"]
test_en = test_sheets["英文_testA"]
1
2

# Preview the Japanese test sheet; the output shows only a raw-text column.
test_ja.head()
1

	原始文本
0	エアコンをつける
1	エアコン付ける
2	エアコンをつけてください
3	エアコンをONに
4	エアコン

# Preview the English test sheet; the output shows only a raw-text column.
test_en.head()
1

	原始文本
0	switch on the AC
1	air conditioner open
2	Turn on the AC please
3	I wanna switch on aircon please
4	Help me switch on aircon

# 分词（任务二）
import jieba
import jieba.posseg as pseg
import nagisa  # 日语分词包
1
2
3
4

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.
1
2
3

# Smoke-test Japanese word segmentation on a sample sentence.
text = 'Pythonで簡単に使えるツールです'
# The tagging result exposes the token list through its .words attribute.
words = nagisa.tagging(text)
print(words.words)
1
2
3
4

['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']
1

# Smoke-test Chinese word segmentation; the printed result is a list of tokens.
words = jieba.lcut("查看训练集和测试集字段类型，并将数据读取代码写到博客；")
print(words)
1
2
3

['查看', '训练', '集', '和', '测试', '集字段', '类型', '，', '并', '将', '数据', '读取', '代码', '写', '到', '博客', '；']
1

# Run the same sentence through the part-of-speech segmenter; each printed
# item is a pair(word, tag).
words = pseg.lcut("查看训练集和测试集字段类型，并将数据读取代码写到博客；")
print(words)
1
2

[pair('查看', 'v'), pair('训练', 'vn'), pair('集', 'q'), pair('和', 'c'), pair('测试', 'vn'), pair('集字段', 'n'), pair('类型', 'n'), pair('，', 'x'), pair('并', 'c'), pair('将', 'd'), pair('数据', 'n'), pair('读取', 'v'), pair('代码', 'n'), pair('写', 'v'), pair('到', 'v'), pair('博客', 'nr'), pair('；', 'x')]
1

# 使用tf_idf 抽取特征 使用逻辑回归进行意图分类
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
1
2
3
4
5

# Build the model input column for the training sets.
# Japanese: segment each utterance with nagisa and re-join the tokens with
# single spaces. English: only lower-case the raw text.
train_ja["words"] = [
    " ".join(nagisa.tagging(utterance).words)
    for utterance in train_ja["原始文本"]
]
train_en["words"] = [utterance.lower() for utterance in train_en["原始文本"]]
1
2

# Build the model input column for the test sets, using the same
# preprocessing as the training data: nagisa tokens joined by spaces for
# Japanese, lower-casing for English.
test_ja["words"] = [
    " ".join(nagisa.tagging(utterance).words)
    for utterance in test_ja["原始文本"]
]
test_en["words"] = [utterance.lower() for utterance in test_en["原始文本"]]
1
2

# Build the model: TF-IDF features fed into a logistic-regression intent
# classifier, trained jointly on the Japanese and English utterances.
# NOTE(review): TfidfVectorizer's default token pattern keeps only tokens of
# two or more word characters, so single-character tokens produced by the
# segmenter are ignored — confirm that is intended.
train_texts = train_ja["words"].to_list() + train_en["words"].to_list()
train_intents = train_ja["意图"].to_list() + train_en["意图"].to_list()

pipline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipline.fit(train_texts, train_intents)
1
2
3
4

# Predict an intent for every test utterance and add the two slot-value
# columns as all-NaN placeholders (no slot values are predicted here).
for frame in (test_ja, test_en):
    frame["意图"] = pipline.predict(frame["words"])
    frame["槽值1"] = np.nan
    frame["槽值2"] = np.nan
1
2
3
4
5
6

# Write the submission workbook with one sheet per language, dropping the
# helper "words" column so only the raw text and predicted fields remain.
# The context manager saves and closes the file on exit, even if a write
# fails; the explicit ExcelWriter.save() call was deprecated in pandas 1.5
# and removed in 2.0, so it must not be called.
with pd.ExcelWriter("submit_LR.xlsx") as writer:
    test_en.drop(columns=["words"]).to_excel(
        writer, sheet_name="英文_testA", index=False
    )
    test_ja.drop(columns=["words"]).to_excel(
        writer, sheet_name="日语_testA", index=False
    )
1
2
3
4
5

提交结果：
（此处原为提交结果截图，图片未随文本保留）

相关阅读:
【前端】判断是否为对象
javaEE - 1（9000字详解多线程第一篇）
mysql中的几种排名函数
【无标题】
Docker虚拟机no-avx环境安装mxnet-noavx
开源博客项目Blog .NET Core源码学习（18：App.Hosting项目结构分析-6）
sknearl-7处理文本数据
QFile（文件）
【redis】7.6 安装与配置Redis - （docker-compose）
Spring自定义启动图标（个性化启动图标）

原文地址：https://blog.csdn.net/hfutzhouyonghang/article/details/125491772

讯飞大数据竞赛2022 汽车领域多语种迁移学习 打卡博客

讯飞大数据竞赛2022 汽车领域多语种迁移学习打卡博客