通过正则批量提取PDF中文本信息
/**
 * Interactively extracts every regex match from the text of one PDF file and
 * writes the matches, one per line, to a {@code .txt} file.
 *
 * <p>Flow: the user picks a preset regex from an option dialog (or, if that
 * dialog is closed, types one into an input dialog), then picks a PDF through
 * a file chooser. The PDF text is extracted with PDFBox, scanned with the
 * regex, and each match is written to {@code <pdf base name>.txt}. The output
 * path is relative, so the file is created in the current working directory.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the PDF cannot be read or the output file cannot be written
 */
public static void main(String[] args) throws IOException {
    // Preset regexes; different presets can extract different content
    // (phone numbers, e-mail addresses, and so on). The last two entries are
    // placeholders that still need real patterns.
    String[] options = {"([0-9]{6}.+,00)", "待设定(正则)", "待设定(正则)"};

    // Option dialog: returns the index of the chosen preset, or CLOSED_OPTION.
    int selectedOption = JOptionPane.showOptionDialog(null, "请选择一个选项", "选项框", JOptionPane.DEFAULT_OPTION, JOptionPane.QUESTION_MESSAGE, null, options, options[0]);

    String selectedValue;
    if (selectedOption != JOptionPane.CLOSED_OPTION) {
        selectedValue = options[selectedOption];
    } else {
        // No preset chosen: let the user type a regex by hand.
        selectedValue = JOptionPane.showInputDialog(null, "请输入您的文本:", "([0-9]{6}.+,00)");
    }

    // showInputDialog returns null when cancelled; an empty pattern would match
    // at every position and only produce blank lines. Neither is usable.
    if (selectedValue == null || selectedValue.isEmpty()) {
        JOptionPane.showMessageDialog(null, "No regular expression provided.");
        return;
    }

    // Validate the regex before asking for a file so a typo is reported early
    // instead of surfacing as an uncaught exception after the PDF was opened.
    Pattern pattern;
    try {
        pattern = Pattern.compile(selectedValue);
    } catch (java.util.regex.PatternSyntaxException e) {
        JOptionPane.showMessageDialog(null, "Invalid regular expression: " + e.getMessage());
        return;
    }

    // Let the user pick the PDF; the filter limits the default view to *.pdf.
    JFileChooser chooser = new JFileChooser();
    FileNameExtensionFilter filter = new FileNameExtensionFilter("PDF Files", "pdf");
    chooser.setFileFilter(filter);
    int returnVal = chooser.showOpenDialog(null);
    if (returnVal != JFileChooser.APPROVE_OPTION) {
        JOptionPane.showMessageDialog(null, "File selection cancelled.");
        return;
    }
    File selectedFile = chooser.getSelectedFile();

    // Extract the text; try-with-resources closes the document even if
    // text extraction throws.
    String text;
    try (PDDocument document = PDDocument.load(selectedFile)) {
        PDFTextStripper stripper = new PDFTextStripper();
        text = stripper.getText(document);
    }

    // Derive "<base name>.txt". The chooser's "All files" filter lets the user
    // pick a file whose name has no dot, so guard the substring call.
    String fileName = selectedFile.getName();
    int dotIndex = fileName.lastIndexOf('.');
    String baseName = dotIndex >= 0 ? fileName.substring(0, dotIndex) : fileName;
    String outputFileName = baseName + ".txt";

    // Write every match on its own line; the writer is closed (and flushed)
    // even if a write fails part-way through.
    Matcher matcher = pattern.matcher(text);
    try (FileWriter writer = new FileWriter(outputFileName)) {
        while (matcher.find()) {
            writer.write(matcher.group());
            writer.write("\n");
        }
    }
}

- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48