poi解析大文件可能出现oom,通样大小文件,xlsx会oom,xls不会,所以使用流式的方式改造解析xlsx文件的代码。
我的需求是提取每一页的表头,所以值提取第一行,如果需要全部内容,把行数循环地方的if改成while就行。
产生Workbook给的入参都是给的File,没用Inputstream,原因见后两章
可能不全,按需引入更多poi相关依赖
<dependency>
<groupId>org.apache.poigroupId>
<artifactId>poiartifactId>
<version>4.1.2version>
dependency>
<dependency>
<groupId>com.monitorjblgroupId>
<artifactId>xlsx-streamerartifactId>
<version>2.2.0version>
dependency>
//newExcel true表示xlsx,false表示xls
private static String excel(File file, boolean newExcel) throws Exception {
StringBuilder sb = new StringBuilder();
Workbook workbook = null;
try {
if (newExcel) {
//我只取第一行,你们按需调大
workbook = StreamingReader.builder()
.rowCacheSize(1)
.bufferSize(10240)
.open(file);
} else {
//xls的还是普通方式
workbook = WorkbookFactory.create(file);
}
getSheet(sb, workbook);
} finally {
if (workbook != null) {
workbook.close();
}
}
return sb.toString();
}
private static void getSheet(StringBuilder sb, Workbook workbook) throws IOException {
try {
sb.append(LEFT_MIDDLE_BRACKET);
Iterator<Sheet> sheetIterator = workbook.iterator();
while (sheetIterator.hasNext()) {
Sheet sheet = sheetIterator.next();
String sheetName = sheet.getSheetName();
if (sheetName.startsWith(BTXA_MASK)) {
continue;
}
sb.append(sheetName).append(C_COLON);
Iterator<Row> rowIterator = sheet.iterator();
//想取所有行的人把if改成while
if (rowIterator.hasNext()) {
Row row = rowIterator.next();
firstLine(sb, row);
if (sheetIterator.hasNext()) {
sb.append(C_SEMICOLON);
}
}
}
sb.append(RIGHT_MIDDLE_BRACKET);
} finally {
workbook.close();
}
}
private static void firstLine(StringBuilder sb, Row row) {
try {
int numOfColumns = row.getLastCellNum();
for (int j = 0; j < numOfColumns; j++) {
Cell cell = row.getCell(j, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
sb.append(getCell(cell));
if (j != numOfColumns - 1) {
sb.append(C_COMMA);
}
}
} catch (Exception e) {
log.error("异常列", e);
sb.append("异常列");
}
}
private static String getCell(Cell cell) {
// 获取单元格类型
CellType cellType = cell.getCellType();
// 根据单元格类型处理值
String cellValue;
switch (cellType) {
case STRING:
cellValue = cell.getStringCellValue();
break;
case NUMERIC:
cellValue = String.valueOf(cell.getNumericCellValue());
break;
case BOOLEAN:
cellValue = String.valueOf(cell.getBooleanCellValue());
break;
case FORMULA:
try {
DataFormatter formatter = new DataFormatter();
cellValue = formatter.formatCellValue(cell);
} catch (IllegalStateException | NumberFormatException e) {
cellValue = cell.getCellFormula();
}
break;
case BLANK:
// 空单元格
cellValue = "";
break;
case ERROR:
// 错误值
cellValue = "ERROR (" + ErrorEval.getText(cell.getErrorCellValue()) + ")";
break;
default:
cellValue = "特殊类型";
}
return cellValue;
}
open方法使用InputStream还需要创建临时文件,如果有本地文件,直接给file
/**
* Reads a given {@code InputStream} and returns a new
* instance of {@code Workbook}. Due to Apache POI
* limitations, a temporary file must be written in order
* to create a streaming iterator. This process will use
* the same buffer size as specified in {@link #bufferSize(int)}.
*
* @param is input stream to read in
* @return A {@link Workbook} that can be read from
* @throws com.monitorjbl.xlsx.exceptions.ReadException if there is an issue reading the stream
*/
public Workbook open(InputStream is) {
StreamingWorkbookReader workbook = new StreamingWorkbookReader(this);
workbook.init(is);
return new StreamingWorkbook(workbook);
}
/**
* Reads a given {@code File} and returns a new instance
* of {@code Workbook}.
*
* @param file file to read in
* @return built streaming reader instance
* @throws com.monitorjbl.xlsx.exceptions.OpenException if there is an issue opening the file
* @throws com.monitorjbl.xlsx.exceptions.ReadException if there is an issue reading the file
*/
public Workbook open(File file) {
StreamingWorkbookReader workbook = new StreamingWorkbookReader(this);
workbook.init(file);
return new StreamingWorkbook(workbook);
}
create方法说明InputStream比file更占内存
/**
* Creates the appropriate HSSFWorkbook / XSSFWorkbook from
* the given InputStream.
*
* Your input stream MUST either support mark/reset, or
* be wrapped as a {@link BufferedInputStream}!
* Note that using an {@link InputStream} has a higher memory footprint
* than using a {@link File}.
*
* Note that in order to properly release resources the
* Workbook should be closed after use. Note also that loading
* from an InputStream requires more memory than loading
* from a File, so prefer {@link #create(File)} where possible.
*
* @param inp The {@link InputStream} to read data from.
*
* @return The created Workbook
*
* @throws IOException if an error occurs while reading the data
* @throws EncryptedDocumentException If the Workbook given is password protected
*/
public static Workbook create(InputStream inp) throws IOException, EncryptedDocumentException {
return create(inp, null);
}
/**
* Creates the appropriate HSSFWorkbook / XSSFWorkbook from
* the given File, which must exist and be readable.
* Note that in order to properly release resources the
* Workbook should be closed after use.
*
* @param file The file to read data from.
*
* @return The created Workbook
*
* @throws IOException if an error occurs while reading the data
* @throws EncryptedDocumentException If the Workbook given is password protected
*/
public static Workbook create(File file) throws IOException, EncryptedDocumentException {
return create(file, null);
}
赠送一个csv方法
private static String csv(File file) throws Exception {
String encode = KkFileUtils.getFileEncode(file);
log.info("编码检测:{}", encode);
if ("Big5".equalsIgnoreCase(encode)) {
encode = "GB2312";
}
String res = "";
try (FileInputStream fo = new FileInputStream(file);
InputStreamReader isr = new InputStreamReader(fo, encode);
BufferedReader br = new BufferedReader(isr)) {
String line = br.readLine();
if (StringUtils.isNotBlank(line)) {
res = line;
}
}
return res;
}