Skip to content

Commit 786a850

Browse files
committed
read large excel file using xml StAX api
1 parent 9ca1c06 commit 786a850

File tree

10 files changed

+325
-57
lines changed

10 files changed

+325
-57
lines changed

common/src/main/java/com/robin/comm/util/xls/ExcelBaseOper.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,10 @@ private static Cell createCell(Row row, int column, CellStyle cellStyle, Creatio
383383

384384
private static Cell createFormulaCell(Row row, int column, CellStyle cellStyle, String formula) {
385385
Cell cell = row.createCell(column);
386+
FormulaEvaluator evaluator = cell.getSheet().getWorkbook().getCreationHelper().createFormulaEvaluator();
386387
cell.setCellFormula(formula);
387388
cell.setCellStyle(cellStyle);
389+
cell.setCellValue(evaluator.evaluate(cell).getNumberValue());
388390
return cell;
389391
}
390392

common/src/main/java/com/robin/core/fileaccess/iterator/AbstractFileIterator.java

Lines changed: 42 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -114,51 +114,52 @@ public void beforeProcess() {
114114
}else{
115115
this.instream=accessUtil.getInResourceByStream(ResourceUtil.getProcessPath(colmeta.getPath()));
116116
}
117-
if(useOrderBy || useGroupBy){
118-
//pool all record through OffHeap
119-
//ByteBuffer buffer=ByteBuffer.allocate(512);
120-
pullNext();
121-
StringBuilder builder=new StringBuilder();
122-
while (!CollectionUtils.isEmpty(cachedValue)){
123-
while (!CollectionUtils.isEmpty(cachedValue) && useFilter && !CommRecordGenerator.doesRecordAcceptable(segment, cachedValue)) {
124-
pullNext();
125-
}
126-
if (segment != null && (!segment.isIncludeAllOriginColumn() && !CollectionUtils.isEmpty(segment.getSelectColumns()))) {
127-
newRecord.clear();
128-
CommRecordGenerator.doAsyncCalculator(segment, cachedValue, newRecord);
117+
} catch (Exception ex) {
118+
logger.error("{}", ex.getMessage());
119+
}
120+
}
121+
protected void groupOrderByInit() throws Exception{
122+
if(useOrderBy || useGroupBy){
123+
//pool all record through OffHeap
124+
//ByteBuffer buffer=ByteBuffer.allocate(512);
125+
pullNext();
126+
StringBuilder builder=new StringBuilder();
127+
while (!CollectionUtils.isEmpty(cachedValue)){
128+
while (!CollectionUtils.isEmpty(cachedValue) && useFilter && !CommRecordGenerator.doesRecordAcceptable(segment, cachedValue)) {
129+
pullNext();
130+
}
131+
if (segment != null && (!segment.isIncludeAllOriginColumn() && !CollectionUtils.isEmpty(segment.getSelectColumns()))) {
132+
newRecord.clear();
133+
CommRecordGenerator.doAsyncCalculator(segment, cachedValue, newRecord);
134+
}
135+
//get group by column
136+
if(!CollectionUtils.isEmpty(segment.getGroupBy())){
137+
if(builder.length()>0){
138+
builder.delete(0,builder.length());
129139
}
130-
//get group by column
131-
if(!CollectionUtils.isEmpty(segment.getGroupBy())){
132-
if(builder.length()>0){
133-
builder.delete(0,builder.length());
134-
}
135-
for(SqlNode tnode:segment.getGroupBy()) {
136-
String columnName=((SqlIdentifier)tnode).getSimple();
137-
if (!ObjectUtils.isEmpty(newRecord.get(columnName))) {
138-
appendByType(builder,newRecord.get(columnName));
139-
}
140+
for(SqlNode tnode:segment.getGroupBy()) {
141+
String columnName=((SqlIdentifier)tnode).getSimple();
142+
if (!ObjectUtils.isEmpty(newRecord.get(columnName))) {
143+
appendByType(builder,newRecord.get(columnName));
140144
}
141-
doGroupAgg(builder.toString());//ByteBufferUtils.getContent(buffer)
142145
}
143-
pullNext();
146+
doGroupAgg(builder.toString());//ByteBufferUtils.getContent(buffer)
144147
}
145-
//calculate avg
146-
for(CommSqlParser.ValueParts parts:segment.getSelectColumns()){
147-
if("avg".equalsIgnoreCase(parts.getFunctionName())){
148-
groupByMap.entrySet().forEach(entry->{
149-
if(!ObjectUtils.isEmpty(entry.getValue().get(parts.getAliasName())) &&
150-
!ObjectUtils.isEmpty(entry.getValue().get(parts.getAliasName()+"cou"))){
151-
entry.getValue().put(parts.getAliasName(),(Double)entry.getValue().get(parts.getAliasName())/(Integer)entry.getValue().get(parts.getAliasName()+"cou"));
152-
entry.getValue().remove(parts.getAliasName()+"cou");
153-
}
154-
});
155-
}
148+
pullNext();
149+
}
150+
//calculate avg
151+
for(CommSqlParser.ValueParts parts:segment.getSelectColumns()){
152+
if("avg".equalsIgnoreCase(parts.getFunctionName())){
153+
groupByMap.entrySet().forEach(entry->{
154+
if(!ObjectUtils.isEmpty(entry.getValue().get(parts.getAliasName())) &&
155+
!ObjectUtils.isEmpty(entry.getValue().get(parts.getAliasName()+"cou"))){
156+
entry.getValue().put(parts.getAliasName(),(Double)entry.getValue().get(parts.getAliasName())/(Integer)entry.getValue().get(parts.getAliasName()+"cou"));
157+
entry.getValue().remove(parts.getAliasName()+"cou");
158+
}
159+
});
156160
}
157-
groupIter=groupByMap.entrySet().iterator();
158161
}
159-
160-
} catch (Exception ex) {
161-
logger.error("{}", ex.getMessage());
162+
groupIter=groupByMap.entrySet().iterator();
162163
}
163164
}
164165
private void doGroupAgg(String key){
@@ -254,6 +255,7 @@ public boolean hasNext() {
254255
try {
255256
// no order by
256257
if(!useOrderBy && !useGroupBy) {
258+
groupOrderByInit();
257259
pullNext();
258260
while (!CollectionUtils.isEmpty(cachedValue) && useFilter && !CommRecordGenerator.doesRecordAcceptable(segment, cachedValue)) {
259261
pullNext();
@@ -312,7 +314,7 @@ private String getHavingColumnName(){
312314
return aliasName;
313315
}
314316

315-
protected abstract void pullNext();
317+
protected abstract void pullNext() throws Exception;
316318
public DataCollectionMeta getCollectionMeta(){
317319
return colmeta;
318320
}
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
package com.robin.core.fileaccess.iterator;
2+
3+
import com.google.common.collect.Sets;
4+
import com.robin.core.base.util.Const;
5+
import com.robin.core.convert.util.ConvertUtil;
6+
import com.robin.core.fileaccess.fs.AbstractFileSystemAccessor;
7+
import com.robin.core.fileaccess.meta.DataCollectionMeta;
8+
import com.robin.core.fileaccess.meta.DataSetColumnMeta;
9+
import com.robin.core.fileaccess.util.PolandNotationUtil;
10+
import org.apache.poi.openxml4j.opc.OPCPackage;
11+
import org.apache.poi.ss.usermodel.DateUtil;
12+
import org.apache.poi.xssf.eventusermodel.XSSFReader;
13+
import org.apache.poi.xssf.model.SharedStrings;
14+
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
15+
import org.springframework.util.CollectionUtils;
16+
import org.springframework.util.ObjectUtils;
17+
18+
import javax.xml.stream.XMLInputFactory;
19+
import javax.xml.stream.XMLStreamConstants;
20+
import javax.xml.stream.XMLStreamReader;
21+
import java.io.IOException;
22+
import java.io.InputStream;
23+
import java.sql.Timestamp;
24+
import java.util.*;
25+
/**
26+
* read xlsx with StAX api
27+
*/
28+
public class XlsxFileIterator extends AbstractFileIterator{
29+
private XMLInputFactory factory;
30+
private XMLStreamReader streamReader;
31+
private OPCPackage opcPackage;
32+
private Iterator<InputStream> sheetStreams;
33+
private InputStream readStreams;
34+
private XSSFReader xssfReader;
35+
private SharedStrings sharedStrings;
36+
private static final char charA='A';
37+
private static boolean hasHeader=true;
38+
private Map<String,String> formulaStrMap=new HashMap<>();
39+
private Map<String, Queue<String>> formulaMap=new HashMap<>();
40+
private static final Set<Character> formulaSets= Sets.newHashSet('(',')','+','-','*','/');
41+
42+
public XlsxFileIterator(){
43+
identifier= Const.FILEFORMATSTR.XLSX.getValue();
44+
}
45+
46+
public XlsxFileIterator(DataCollectionMeta meta) {
47+
super(meta);
48+
identifier= Const.FILEFORMATSTR.XLSX.getValue();
49+
}
50+
public XlsxFileIterator(DataCollectionMeta meta, AbstractFileSystemAccessor accessor) {
51+
super(meta,accessor);
52+
identifier= Const.FILEFORMATSTR.XLSX.getValue();
53+
if(meta.getResourceCfgMap().containsKey(Const.COLUMN_XLSX_HASHEADER) && Const.FALSE.equalsIgnoreCase(meta.getResourceCfgMap().get(Const.COLUMN_XLSX_HASHEADER).toString())){
54+
hasHeader=false;
55+
}
56+
}
57+
58+
@Override
59+
public void beforeProcess() {
60+
super.beforeProcess();
61+
try {
62+
factory = XMLInputFactory.newFactory();
63+
factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
64+
factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
65+
opcPackage=OPCPackage.open(instream);
66+
xssfReader=new XSSFReader(opcPackage);
67+
sheetStreams=xssfReader.getSheetsData();
68+
sharedStrings=xssfReader.getSharedStringsTable();
69+
if(sheetStreams.hasNext()){
70+
readStreams=sheetStreams.next();
71+
streamReader=factory.createXMLStreamReader(readStreams,colmeta.getEncode());
72+
String nodeName;
73+
while(streamReader.hasNext()){
74+
if(streamReader.getEventType()== XMLStreamConstants.START_ELEMENT) {
75+
nodeName = streamReader.getLocalName();
76+
if("row".equalsIgnoreCase(nodeName)){
77+
break;
78+
}
79+
}
80+
streamReader.next();
81+
}
82+
}
83+
if(hasHeader){
84+
readNext();
85+
}
86+
}catch (Exception ex){
87+
88+
}
89+
}
90+
private void readNext() throws Exception{
91+
String nodeName;
92+
int rowNum;
93+
String rowNumStr;
94+
int columnPos=0;
95+
String value;
96+
boolean sharedValue=false;
97+
boolean breakTag=false;
98+
Object targetValue=null;
99+
100+
101+
while (streamReader.hasNext() && !breakTag){
102+
if(streamReader.getEventType()== XMLStreamConstants.START_ELEMENT){
103+
nodeName=streamReader.getLocalName();
104+
switch (nodeName.toLowerCase()){
105+
case "row":
106+
rowNumStr = streamReader.getAttributeValue("", "r");
107+
rowNum = Integer.parseInt(rowNumStr);
108+
break;
109+
case "c":
110+
columnPos=getColumnPos(streamReader.getAttributeValue("","r"))-1;
111+
String cellType=streamReader.getAttributeValue("","t");
112+
if("s".equals(cellType)){
113+
sharedValue=true;
114+
}
115+
break;
116+
case "v":
117+
streamReader.next();
118+
value=streamReader.getText();
119+
if(sharedValue){
120+
targetValue=new XSSFRichTextString(sharedStrings.getItemAt(Integer.parseInt(value)).getString()).toString();
121+
sharedValue=false;
122+
}else{
123+
targetValue=parseValue(value, colmeta.getColumnList().get(columnPos));
124+
}
125+
break;
126+
case "t":
127+
streamReader.next();
128+
targetValue=streamReader.getText();
129+
break;
130+
case "f":
131+
streamReader.next();
132+
if(streamReader.getEventType()==XMLStreamConstants.CHARACTERS) {
133+
value = streamReader.getText();
134+
135+
}
136+
break;
137+
}
138+
139+
}else if(streamReader.getEventType()==XMLStreamConstants.END_ELEMENT){
140+
nodeName=streamReader.getLocalName();
141+
switch (nodeName.toLowerCase()) {
142+
case "row":
143+
breakTag=true;
144+
break;
145+
case "t":
146+
case "v":
147+
cachedValue.put(colmeta.getColumnList().get(columnPos).getColumnName(),targetValue);
148+
break;
149+
case "f":
150+
/*if(!formulaStrMap.containsKey(colmeta.getColumnList().get(columnPos).getColumnName()) || !formulaStrMap.get(colmeta.getColumnList().get(columnPos).getColumnName()).equals(value)){
151+
formulaStrMap.put(colmeta.getColumnList().get(columnPos).getColumnName(),value);
152+
Queue<String> queue= PolandNotationUtil.parsePre(getFormula(value));
153+
if(!CollectionUtils.isEmpty(queue)){
154+
formulaMap.put(colmeta.getColumnList().get(columnPos).getColumnName(),queue);
155+
}
156+
}*/
157+
break;
158+
159+
}
160+
}
161+
streamReader.next();
162+
}
163+
}
164+
private Object parseValue(String value, DataSetColumnMeta columnMeta){
165+
if(!columnMeta.getColumnType().equals(Const.META_TYPE_TIMESTAMP)){
166+
return ConvertUtil.convertStringToTargetObject(value, columnMeta, formatter);
167+
}else{
168+
double dValue=Double.valueOf(value);
169+
Date date = DateUtil.getJavaDate(dValue);
170+
return new Timestamp(date.getTime());
171+
}
172+
}
173+
private static int getColumnPos(String columnName){
174+
StringBuilder builder=new StringBuilder();
175+
for(int i=0;i<columnName.length();i++){
176+
if(!Character.isAlphabetic(columnName.charAt(i))){
177+
builder.append(columnName.substring(0,i));
178+
break;
179+
}
180+
}
181+
int columnPos=0;
182+
if(builder.length()>0){
183+
for(int i=0;i<builder.length();i++){
184+
int val=(builder.charAt(i)-charA)+1;
185+
if(i>0){
186+
columnPos=columnPos*26+val;
187+
}else{
188+
columnPos=val;
189+
}
190+
}
191+
}
192+
return columnPos;
193+
}
194+
private String getFormula(String formula){
195+
StringBuilder builder=new StringBuilder();
196+
StringBuilder outBuilder=new StringBuilder();
197+
for(int i=0;i<formula.length();i++){
198+
if(formulaSets.contains(formula.charAt(i))){
199+
if(builder.length()>0){
200+
String columnName=builder.toString();
201+
outBuilder.append(colmeta.getColumnList().get(getColumnPos(columnName)).getColumnName());
202+
builder.delete(0,builder.length());
203+
}
204+
outBuilder.append(formula.charAt(i));
205+
}else{
206+
builder.append(formula.charAt(i));
207+
}
208+
}
209+
if(formulaSets.contains(builder.charAt(0))){
210+
outBuilder.append(builder.charAt(0));
211+
}else{
212+
String columnName=builder.toString();
213+
outBuilder.append(colmeta.getColumnList().get(getColumnPos(columnName)).getColumnName());
214+
}
215+
return outBuilder.toString();
216+
}
217+
218+
@Override
219+
protected void pullNext() throws Exception {
220+
cachedValue.clear();
221+
readNext();
222+
if(CollectionUtils.isEmpty(cachedValue)){
223+
if(sheetStreams.hasNext()){
224+
readStreams.close();
225+
streamReader.close();
226+
readStreams=sheetStreams.next();
227+
streamReader=factory.createXMLStreamReader(readStreams,colmeta.getEncode());
228+
}
229+
readNext();
230+
}
231+
//calculate formula column
232+
if(!formulaMap.isEmpty()){
233+
formulaMap.forEach((k,v)->{
234+
if(ObjectUtils.isEmpty(cachedValue.get(k))){
235+
cachedValue.put(k,PolandNotationUtil.computeResult(v,cachedValue));
236+
}
237+
});
238+
}
239+
}
240+
241+
@Override
242+
public void close() throws IOException {
243+
super.close();
244+
if(readStreams!=null){
245+
readStreams.close();
246+
}
247+
if(opcPackage!=null){
248+
opcPackage.close();
249+
}
250+
}
251+
}

common/src/main/resources/META-INF/services/com.robin.core.fileaccess.iterator.IResourceIterator

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ com.robin.core.fileaccess.iterator.PlainTextFileIterator
33
com.robin.core.fileaccess.iterator.ArffFileIterator
44
com.robin.core.fileaccess.iterator.XmlFileIterator
55
com.robin.core.resaccess.iterator.JdbcResIterator
6+
com.robin.core.fileaccess.iterator.XlsxFileIterator

core/src/main/java/com/robin/core/base/util/Const.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ public class Const {
9797
public static final Integer COLUMN_INVALID = 0;
9898
public static final String ITERATOR_PROCESSID="$processId";
9999
public static final String SUCCESS="success";
100+
public static final String COLUMN_XLSX_HASHEADER="xlsx.hasHeader";
100101

101102

102103
public enum FILEFORMAT {
@@ -437,7 +438,8 @@ public enum FILEFORMATSTR {
437438
CSV("csv"),
438439
PROTOBUF("proto"),
439440
ARROW("arrow"),
440-
ARFF("arff");
441+
ARFF("arff"),
442+
XLSX("xlsx");
441443
private String value;
442444
FILEFORMATSTR(String value){
443445
this.value=value;

0 commit comments

Comments
 (0)