项目需要对用户提交的附件、文章、搜索框内容等做热词分析。如下图:
公司有大数据团队,但本着能不麻烦别人就不麻烦别人的原则,自己写了一款简易的热词记录工具。原理也简单:手工在业务代码中插入锚点,用分词器分好词,排除字母、数字、符号、敏感词,再将词汇按年度累加记录到数据库中即可。代码如下:
@Component
public class HotWordHelper {
private static HotWordMapper hotWordMapper;
static List<Character> FILTER_CHARS = new ArrayList<>();
static {
String number = "123456789abcdefghijklnmopqrstuvwxyzABCDEFGHIJKLNMOPQRSTUVWXYZ";
char[] chars = number.toCharArray();
for (char aChar : chars) {
FILTER_CHARS.add(aChar);
}
}
@Autowired
public ZYHotWordHelper(HotWordMapper hotWordMapper) {
ZYHotWordHelper.hotWordMapper = hotWordMapper;
}
public static List<HotWord> loaderHotWordTen(String moduleCode) {
LocalDate now = LocalDate.now();
int year = now.getYear();
return loaderHotWord(year, 10, moduleCode);
}
public static List<HotWord> loaderHotWord(int top, String moduleCode) {
LocalDate now = LocalDate.now();
int year = now.getYear();
return loaderHotWord(year, top, moduleCode);
}
public static List<HotWord> loaderHotWord(int year, int top, String moduleCode) {
LambdaQueryWrapper<HotWord> wrapper = Wrappers.lambdaQuery();
wrapper.eq(HotWord::getRecordYear, year);
wrapper.eq(HotWord::getModuleCode, moduleCode);
return hotWordMapper.selectTop(wrapper, HotWord::getAppearTimes, top);
}
// 直接词汇,如字典之类的。
public static void putDirectHotWord(String text, String moduleCode) {
// 为不影响主业务速度,改成异常
Runnable runnable = () -> putHotWord(true, text, moduleCode);
AsyncExecutor.execute(runnable);
}
// 分析词汇
public static void putAnalyzeHotWord(String text, String moduleCode) {
// 为不影响主业务速度,改成异常
Runnable runnable = () -> putHotWord(false, text, moduleCode);
AsyncExecutor.execute(runnable);
}
// 附件
public static void putAttachmentAsync(StringsField attachmentIds, String moduleCode) {
if (ZYListUtils.isEmptyList(attachmentIds)) {
return;
}
Runnable runnable = () -> doPutAttachmentAsync(attachmentIds, moduleCode);
AsyncExecutor.execute(runnable);
}
// 解析附件
private static void doPutAttachmentAsync(StringsField attachmentIds, String moduleCode) {
FileInfoMapper fileInfoMapper = SpringUtils.getBean(FileInfoMapper.class);
List<FileInfo> fileInfos = fileInfoMapper.selectBatchIds(attachmentIds);
if (ZYListUtils.isEmptyList(fileInfos)) {
return;
}
FileStoreService storeService= ZYSpringUtils.getBean(FileStoreService.class);
List<FileWrapper> fileWrappers = ZYListUtils.list2list(fileInfos, FileInfo::toFileWrapper);
for (FileWrapper fileWrapper : fileWrappers) {
try (InputStream objectStream = storeService.getObjectStream(fileWrapper)) {
String text = IOUtils.toString(objectStream, StandardCharsets.UTF_8);
putAnalyzeHotWord(text, moduleCode);
} catch (Exception e) {
return;
}
}
}
private static void putHotWord(boolean isDirect, String text, String moduleCode) {
if (ZYStrUtils.isAnyNull(text, moduleCode)) {
return;
}
List<String> words = analyzerWords(isDirect, text);
if (ZYListUtils.isEmptyList(words)) {
return;
}
// List smallWordCompare = new ArrayList<>(words);
words.removeIf(w -> {
if (!matchLength(w)) {
return true;
}
char[] chars = w.toCharArray();
for (char aChar : chars) {
// 不要数字字母
if (FILTER_CHARS.contains(aChar)) {
return true;
}
}
// 存在误判,还是不用这段代码
/* for (String compareWord : smallWordCompare) {
if (!w.equals(compareWord) && compareWord.contains(w)) {
return true;
}
}
*/
return false;
});
Map<String, Integer> wordCount = ZYMapUtils.countField(words, w -> w);
LocalDate now = LocalDate.now();
int year = now.getYear();
LambdaQueryWrapper<HotWord> wrapper = Wrappers.lambdaQuery();
wrapper.in(HotWord::getHotWord, words);
wrapper.eq(HotWord::getRecordYear, year);
wrapper.eq(HotWord::getModuleCode, moduleCode);
List<HotWord> existsWords = hotWordMapper.selectList(wrapper);
Map<String, HotWord> wordIdContainer = ZYListUtils.groupModel(existsWords, HotWord::getHotWord);
List<HotWord> addHotWords = new ArrayList<>();
List<HotWord> editHotWords = new ArrayList<>();
wordCount.forEach((w, times) -> {
HotWord hotWord = wordIdContainer.get(w);
if (null != hotWord) {
Integer appearTimes = hotWord.getAppearTimes();
appearTimes += times;
hotWord.setAppearTimes(appearTimes);
editHotWords.add(hotWord);
} else {
HotWord newHotWord = new HotWord();
newHotWord.setRecordYear(year);
newHotWord.setAppearTimes(times);
newHotWord.setHotWord(w);
newHotWord.setModuleCode(moduleCode);
addHotWords.add(newHotWord);
}
});
if (ZYListUtils.isNotEmptyList(addHotWords)) {
hotWordMapper.insertBatch(addHotWords);
}
if (ZYListUtils.isNotEmptyList(editHotWords)) {
for (HotWord editHotWord : editHotWords) {
hotWordMapper.updateById(editHotWord);
}
}
}
private static List<String> analyzerWords(boolean isDirect, String text) {
List<String> words;
if (isDirect) {
words = Collections.singletonList(text);
} else {
words = ZYDirtyWordHelper.analyze(text);
if (matchLength(text) && !words.contains(text)) {
words.add(text);
}
}
return words;
}
private static boolean matchLength(String text) {
int length = text.length();
return length > 1 && length < 6;
}
}
下面是记录表中的效果图。实际效果还可以,实时性和记录速度都非常快。