使用编辑距离实现英语单词纠错。
实现:
1、先面向过程实现即可
2、后面向对象实现即可
准备工作:准备一个英语词典
功能:纠错–>纠错器/纠错工具
英语单词纠错器:WordsCorrector
类的组成要素:WordsCorrector
1、词典
访问类型:private
类型:Set<String>
成员方法/成员函数
2、构造函数/初始化函数
功能:做准备工作,读取单词文件,初始化成员变量
访问类型:public
形参:string(文件路径)
返回值:无(构造函数没有返回类型)
3、纠错函数
访问类型:public
形参:string
返回值:string
4、单词拆分函数
访问类型:private
形参:string
返回值:List
5、编辑距离函数
访问类型:private
形参:string,string
返回值:int
备注:
输入:用户待纠错的句子
输出:纠正后的句子
package entity;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 单词表
* @author admin
*
*/
public class WordsCorrector {
// english词典
private Set<String> englishDict;
// 构造函数/初始化
public WordsCorrector(String path){
Set<String> wordSet = null;
// 要读取的文件路径,这里自行更改
File file = new File(path);
try {
// 读取文件输入流
InputStreamReader read = new InputStreamReader(new FileInputStream(
file), "utf-8");
// 文件是否是文件 和 是否存在
if (file.isFile() && file.exists()) {
wordSet = new HashSet<String>();
//字符缓存输入流
BufferedReader br = new BufferedReader(read);
String txt = null;
// 读取文件,将文件内容放入到set中
while ((txt = br.readLine()) != null) {
wordSet.add(txt);
}
br.close();
}
// 关闭文件流
read.close();
} catch (Exception e) {
e.printStackTrace();
}
this.englishDict = wordSet;
}
// 纠错函数
public String Correct(String sentence){
String correctRes = "";
//分词
List<String> words = getEnglishCharacter(sentence);
//纠错
int i=0;
String regex = "^[a-zA-Z]+$";
for(String str : words){
if(str.matches(regex)){
if(!this.englishDict.contains(str)){
words.set(i, getCorrectWord(str));
}
}
i++;
}
//纠错结果组合
int next_count = 0;
for(String str:words){
next_count++;
if(str.matches(regex) && words.get(next_count).matches(regex)){
//System.out.print(str+" ");
correctRes += str+" ";
}
else{
if(!str.matches(regex) && next_count != words.size()){
//System.out.print(str + " ");
correctRes += str+" ";
}else{
//System.out.print(str);
correctRes += str;
}
}
}
return correctRes;
}
private String getCorrectWord(String str){
String res = "";
int nMax = 10000;
// Set readWordFile = readWordFile();
int count = 0;
for(String cidian : this.englishDict){
count++;
int nEdit = getEditInstance(cidian, str);
if (nEdit < nMax){
res = cidian;
nMax = nEdit;
//System.out.println(str + "-->" + cidian + ": " + nMax);
}
}
return res;
}
private int getEditInstance(String cidian, String str){
int c[][]=new int[cidian.length()+1][str.length()+1];
for(int i = 0; i <= cidian.length(); i++){
c[i][0] = i;
}
for(int i = 0; i <= str.length(); i++){
c[0][i] = i;
}
for (int i = 1; i <=cidian.length(); i++) {
for (int j = 1; j <=str.length(); j++) {
int diff;
if(cidian.charAt(i-1)==str.charAt(j-1)){
diff=0;
}else{
diff=1; //当新增序列不想等时,进行互换,新增,删除的操作
}
int temp = Math.min(c[i-1][j]+1,c[i][j-1]+1); //c[i][j-1] 为插入一个字符
c[i][j] = Math.min(temp, c[i-1][j-1]+diff);
}
}
return c[cidian.length()][str.length()];
}
//分词
private List<String> getEnglishCharacter(String str) {
List<String> charList = new ArrayList<>();
/*正则表达式:句子结束符*/
String regEx=",|\\.|\\?|!|:|;|~|,|:|。|!|;|?| ";
Pattern p =Pattern.compile(regEx);
Matcher m = p.matcher(str);
/*按照句子结束符分割句子*/
String[] words = p.split(str); //trim()
String[] chars=new String[words.length];
/*将句子结束符连接到相应的句子后*/
if(words.length > 0)
{
int count = 0;
while(count < words.length)
{
if(m.find())
{
words[count] += m.group();
}
count++;
}
}
/*输出结果*/
int nValidCount = 0;
for (int i = 0; i < words.length; i++) {
String tmp = words[i].trim();
//System.out.println("split res 1: "+tmp);
if(tmp.length() == 0) continue;
chars[nValidCount]=tmp;
//System.out.println("split res: "+tmp);
nValidCount++;
}
String regex = "^[a-zA-Z]+$";
for (int i = 0; i < nValidCount/*chars.length*/; i++) {
//System.out.println("out of range:"+chars[i]);
if(!chars[i].matches(regex)){
charList.add(chars[i].substring(0,chars[i].length()-1).toLowerCase());
charList.add(chars[i].substring(chars[i].length()-1));//.toLowerCase());
}else{
charList.add(chars[i].toLowerCase());
}
}
return charList;
}
}