使用编辑距离实现英语单词纠错-面向对象实现

任务：

使用编辑距离实现英语单词纠错。
实现：
1、先面向过程实现即可
2、后面向对象实现即可

面向对象设计

准备工作：准备一个英语词典

功能：纠错–>纠错器/纠错工具
英语单词纠错器：WordsCorrector
类的组成要素：WordsCorrector

成员变量

1、词典
访问类型：private
类型：Set<String>（代码中使用 HashSet 存放词典，便于快速判断单词是否存在）
成员方法/成员函数

2、构造函数/初始化函数
功能：做准备工作，读取单词文件，初始化成员变量
访问类型：public
形参：string(文件路径)
返回值：无（构造函数没有返回值）

3、纠错函数
访问类型：public
形参：string
返回值：string

4、单词拆分函数
访问类型：private
形参：string
返回值：List

5、编辑距离函数
访问类型：private
形参：string，string
返回值：int

备注：
输入：用户待纠错的句子
输出：纠正后的句子

package entity;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 单词表
 * @author admin
 *
 */
/**
 * English word corrector based on edit distance.
 *
 * <p>A dictionary (one word per line) is loaded at construction time. {@link #Correct(String)}
 * splits a sentence into tokens, replaces every alphabetic token that is not in the dictionary
 * with the dictionary entry at the smallest edit distance, and joins the tokens back together.
 *
 * <p>All tokens are lower-cased during tokenisation, so the corrected sentence is lower-case.
 */
public class WordsCorrector {
	/** A token consisting only of ASCII letters is treated as a word. */
	private static final Pattern WORD_PATTERN = Pattern.compile("^[a-zA-Z]+$");

	/** Delimiters used for tokenisation: ASCII / full-width punctuation and the space character. */
	private static final Pattern DELIMITER_PATTERN =
			Pattern.compile(",|\\.|\\?|!|:|;|~|，|：|。|！|；|？| ");

	/** English dictionary; never {@code null}, empty when the dictionary file could not be read. */
	private final Set<String> englishDict;

	/**
	 * Loads the dictionary from a UTF-8 text file containing one word per line.
	 *
	 * <p>Lines are trimmed and blank lines are ignored. The constructor never throws on I/O
	 * problems: if the file is missing or unreadable a message is written to {@code System.err}
	 * and the dictionary holds whatever was read so far (possibly nothing).
	 *
	 * @param path path of the dictionary file
	 */
	public WordsCorrector(String path){
		Set<String> wordSet = new HashSet<String>();
		File file = (path == null) ? null : new File(path);
		if (file != null && file.isFile()) {
			// try-with-resources closes the whole reader chain even when reading fails half-way.
			try (BufferedReader br = new BufferedReader(new InputStreamReader(
					new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
				String line;
				while ((line = br.readLine()) != null) {
					String word = line.trim();
					if (!word.isEmpty()) {
						wordSet.add(word);
					}
				}
			} catch (java.io.IOException e) {
				System.err.println("WordsCorrector: failed to read dictionary '" + path + "': " + e);
			}
		} else {
			System.err.println("WordsCorrector: dictionary file not found: " + path);
		}
		this.englishDict = wordSet;
	}

	/**
	 * Corrects every misspelled word of a sentence.
	 *
	 * <p>Spacing rules when re-joining tokens: a space is written after a token unless it is the
	 * last token, or it is a word directly followed by a non-word token (so punctuation stays
	 * attached to the preceding word).
	 *
	 * @param sentence sentence to correct; {@code null} is treated like an empty sentence
	 * @return the corrected, lower-cased sentence ({@code ""} for {@code null} or empty input)
	 */
	public String Correct(String sentence){
		if (sentence == null || sentence.isEmpty()) {
			return "";
		}
		// Tokenise.
		List<String> words = getEnglishCharacter(sentence);

		// Replace unknown alphabetic tokens by their closest dictionary entry.
		for (int i = 0; i < words.size(); i++) {
			String token = words.get(i);
			if (isWord(token) && !this.englishDict.contains(token)) {
				words.set(i, getCorrectWord(token));
			}
		}

		// Re-assemble the sentence.
		StringBuilder correctRes = new StringBuilder();
		int lastIndex = words.size() - 1;
		for (int i = 0; i <= lastIndex; i++) {
			String token = words.get(i);
			correctRes.append(token);
			// Only look at the following token when there is one; the last token never gets
			// a trailing space.
			if (i < lastIndex && (!isWord(token) || isWord(words.get(i + 1)))) {
				correctRes.append(' ');
			}
		}
		return correctRes.toString();
	}

	/** Returns whether the token consists solely of ASCII letters. */
	private boolean isWord(String token){
		return WORD_PATTERN.matcher(token).matches();
	}

	/**
	 * Finds the dictionary entry with the smallest edit distance to {@code str}.
	 *
	 * <p>On ties the entry met first while iterating the dictionary set wins.
	 *
	 * @param str the (misspelled) word
	 * @return the closest dictionary entry, or {@code str} itself when the dictionary is empty
	 */
	private String getCorrectWord(String str){
		String res = str;
		int nMin = Integer.MAX_VALUE;
		for (String cidian : this.englishDict) {
			int nEdit = getEditInstance(cidian, str);
			if (nEdit < nMin) {
				res = cidian;
				nMin = nEdit;
			}
		}
		return res;
	}

	/**
	 * Computes the edit distance between two strings with the classic dynamic-programming
	 * table; insertion, deletion and substitution each cost 1.
	 *
	 * @param cidian dictionary word
	 * @param str word to compare against
	 * @return number of single-character edits needed to turn one string into the other
	 */
	private int getEditInstance(String cidian, String str){
		int rows = cidian.length();
		int cols = str.length();
		// c[i][j] = distance between the first i chars of cidian and the first j chars of str.
		int[][] c = new int[rows + 1][cols + 1];
		for (int i = 0; i <= rows; i++) {
			c[i][0] = i;
		}
		for (int j = 0; j <= cols; j++) {
			c[0][j] = j;
		}

		for (int i = 1; i <= rows; i++) {
			for (int j = 1; j <= cols; j++) {
				// Substitution is free when the two characters are equal.
				int diff = (cidian.charAt(i - 1) == str.charAt(j - 1)) ? 0 : 1;
				// c[i-1][j] + 1 and c[i][j-1] + 1 are the deletion / insertion candidates.
				int insertOrDelete = Math.min(c[i - 1][j] + 1, c[i][j - 1] + 1);
				c[i][j] = Math.min(insertOrDelete, c[i - 1][j - 1] + diff);
			}
		}
		return c[rows][cols];
	}

	/**
	 * Splits a sentence into lower-cased tokens.
	 *
	 * <p>The sentence is split on the delimiter pattern and the i-th delimiter found in the
	 * sentence is appended to the i-th piece. After trimming, a purely alphabetic chunk becomes
	 * one token; any other chunk is split into "everything but its last character" and
	 * "its last character" (normally the word and its trailing punctuation mark).
	 *
	 * @param str sentence to tokenise
	 * @return tokens in their original order
	 */
	private List<String> getEnglishCharacter(String str) {
		List<String> charList = new ArrayList<>();
		String[] pieces = DELIMITER_PATTERN.split(str);
		Matcher delimiters = DELIMITER_PATTERN.matcher(str);

		for (String piece : pieces) {
			// Re-attach the delimiter that followed this piece, if there is one left.
			String chunk = delimiters.find() ? piece + delimiters.group() : piece;
			chunk = chunk.trim();
			if (chunk.isEmpty()) {
				continue;
			}
			if (isWord(chunk)) {
				charList.add(chunk.toLowerCase(java.util.Locale.ROOT));
			} else {
				int tail = chunk.length() - 1;
				charList.add(chunk.substring(0, tail).toLowerCase(java.util.Locale.ROOT));
				// The trailing character (punctuation) keeps its original form.
				charList.add(chunk.substring(tail));
			}
		}
		return charList;
	}
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180

相关阅读:
响应式驱动与双向数据绑定
 一起Talk Android吧（第三百六十四回：多线程之同步块）
IDEA上MySQL的jar包导入教程
 STM32——汉字显示
 MyBatis批量更新
 黄州科目三
 朴素贝叶斯分类算法流程——python
【COMSE 2022已检索】2023年材料科学与工程国际会议（COMSE 2023）火热征稿中！
NV040D语音芯片：智能破壁机，悦享美好生活
 最新SQL注入漏洞修复建议
原文地址：https://blog.csdn.net/u010212101/article/details/126040982