目录
原始数据集中,数据的标注是以word为单位,且计数从1开始的。如下图:

如何修改为以char为单位的,从0开始计数的数据格式:

- # Convert the English dataset, originally stored with per-word annotations, to per-character annotations so that it has the same format as the Chinese data
- path = '/home/qtxu/Sentiment-SPN/data/Camera-COQE/dev.txt'  # input file, opened for reading by the conversion loop
- path_w = '/home/qtxu/Sentiment-SPN/data/Camera-COQE/dev_char.txt'  # output file, opened for writing by the conversion loop
-
- from pdb import set_trace as stop
-
def obtain_index(cur_ele):
    """Parse one word-level annotation element into 0-based indices and text.

    An element is a bracketed, space-separated list of ``index&&word``
    tokens whose indices count from 1, for example::

        '[10&&would 11&&not]'
        '[10&&would 11&&not , 17&&difference]'
        "[13&&did 14&&n't , 20&&as 21&&well 22&&as]"

    When the element holds several groups separated by `` , ``, only the
    last group is kept (for two groups this is the part after the comma).

    Args:
        cur_ele: The bracketed element string; ``'[]'`` means "no span".

    Returns:
        A tuple ``(index_list, span_str)``: the 0-based word indices of the
        kept group and its words joined by single spaces. An empty element
        yields ``([], '')``.

    Raises:
        ValueError: If the part of a token before ``&&`` is not an integer.
    """
    # The group separator is " ," (space before the comma).  Searching for
    # that, rather than for a bare ",", keeps a literal "," word token such
    # as "3&&," from being mistaken for the separator.
    separator_pos = cur_ele.rfind(' ,')
    if separator_pos != -1:
        # Skip " , " (three characters) and restore the opening bracket.
        cur_ele = '[' + cur_ele[separator_pos + 3:]

    tokens = cur_ele[1:-1].split()
    if not tokens:
        return [], ''

    index_list = []
    words = []
    for token in tokens:
        # partition() splits on the first "&&" only, so a word that itself
        # contains "&&" is kept intact.
        number, _, word = token.partition('&&')
        index_list.append(int(number) - 1)  # 1-based -> 0-based
        words.append(word)
    return index_list, ' '.join(words)
-
-
def word_to_char(sentence, span, span_index):
    """Re-express a word-level span as a character-level annotation.

    Args:
        sentence: The sentence text; words are separated by single spaces.
        span: The span text (words joined by single spaces), or an empty
            string when there is no span.
        span_index: 0-based word indices of the span; only the first entry
            is used, to locate where the span starts in ``sentence``.

    Returns:
        ``'[]'`` for an empty span, otherwise a bracketed string of
        ``offset&&char`` items separated by single spaces, where ``offset``
        is the 0-based character position in ``sentence``. Every character
        of ``span`` gets an item, including the spaces between its words.
    """
    if not span:
        return '[]'

    first_word = span_index[0]
    preceding_text = ' '.join(sentence.split(' ')[:first_word])
    # The span starts right after the preceding words plus the single space
    # that separates them from the span; there is no such space when the
    # span starts the sentence.
    span_offset = len(preceding_text) + (1 if first_word != 0 else 0)

    items = [
        f"{position}&&{char}"
        for position, char in enumerate(span, start=span_offset)
    ]
    # rstrip() mirrors the original trailing-space removal.
    return '[' + ' '.join(items).rstrip() + ']'
-
# Convert the file line by line.  Three kinds of lines are handled:
#   * "sentence<TAB>label" lines are copied unchanged, and the sentence is
#     remembered for the quintuple lines that follow it;
#   * blank lines and the empty quintuple "[[];[];[];[];[]]" are copied
#     unchanged;
#   * any other line is a quintuple "[sub;obj;asp;op;polarity]" whose first
#     four elements are rewritten from word-level to character-level.
# NOTE(review): a word token that itself contains ';' would break the
# split on ';' below; the original code had the same limitation.
with open(path, 'r', encoding='utf-8') as fr, \
        open(path_w, 'w', encoding='utf-8') as fw:
    sent = None  # most recently seen sentence
    for line_no, line in enumerate(fr, start=1):
        stripped = line.strip()
        fields = stripped.split('\t')

        if len(fields) == 2:
            # Sentence line: keep it as is and remember the sentence text.
            sent = fields[0]
            fw.write(line)
            continue

        if not stripped or '[[];[];[];[];[]]' in line:
            fw.write(line)
            continue

        if sent is None:
            raise ValueError(
                f"line {line_no}: quintuple found before any sentence line"
            )

        parts = stripped[1:-1].split(';')
        if len(parts) != 5:
            raise ValueError(
                f"line {line_no}: expected 5 ';'-separated elements, "
                f"got {len(parts)}: {stripped!r}"
            )
        *elements, polarity = parts

        # sub, obj, asp and op are all converted the same way; polarity is
        # carried over untouched.
        char_elements = []
        for element in elements:
            word_indices, span_text = obtain_index(element)
            char_elements.append(word_to_char(sent, span_text, word_indices))

        char_quintuple = '[' + ';'.join(char_elements + [polarity]) + ']'
        fw.write(char_quintuple + '\n')