Skip to content

Commit 3216284

Browse files
authored
Merge pull request #93 from kermitt2/line-number
Line number and finalize 0.3
2 parents 9cde96a + 12a6488 commit 3216284

8 files changed

+837
-359
lines changed

.gitmodules

-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
[submodule "xpdf-4.00"]
22
path = xpdf-4.00
33
url = https://github.com/kermitt2/xpdf-4.00
4-
branch = nonumericchanamesmapping

Readme.md

+29-23
Original file line numberDiff line numberDiff line change
@@ -21,29 +21,29 @@ The latest stable version is *0.2*. Working version (master) is *0.3*.
2121
General usage is as follows:
2222

2323
```
24-
pdfalto [options] <PDF-file> [<xml-file>]
25-
-f <int> : first page to convert
26-
-l <int> : last page to convert
27-
-verbose : display pdf attributes
28-
-noText : do not extract textual objects
29-
-noImage : do not extract Images (Bitmap and Vectorial)
30-
-noImageInline : do not include images inline in the stream
31-
-outline : create an outline file xml (i.e. a table of content) as additional file
32-
-annotation : create an annotations file xml as additional file
33-
-blocks : add blocks informations whithin the structure
34-
-readingOrder : blocks follow the reading order
35-
-fullFontName : fonts names are not normalized
36-
-nsURI <string> : add the specified namespace URI
37-
-opw <string> : owner password (for encrypted files)
38-
-upw <string> : user password (for encrypted files)
39-
-filesLimit <int> : limit of asset files be extracted to the value specified
40-
-q : don't print any messages or errors
41-
-v : print version info
42-
-h : print usage information
43-
-help : print usage information
44-
--help : print usage information
45-
-? : print usage information
46-
--saveconf <string> : save all command line parameters in the specified XML <file>
24+
Usage: pdfalto [options] <PDF-file> [<xml-file>]
25+
-f <int> : first page to convert
26+
-l <int> : last page to convert
27+
-verbose : display pdf attributes
28+
-noImage : do not extract Images (Bitmap and Vectorial)
29+
-noImageInline : do not include images inline in the stream
30+
-outline : create an outline file xml
31+
-annotation : create an annotations file xml
32+
-noLineNumbers : do not output line numbers added in manuscript-style textual documents
33+
-readingOrder : blocks follow the reading order
34+
-noText : do not extract textual objects (might be useful, but non-valid ALTO)
35+
-charReadingOrderAttr : include TYPE attribute to String elements to indicate right-to-left reading order (might be useful, but non-valid ALTO)
36+
-fullFontName : fonts names are not normalized
37+
-nsURI <string> : add the specified namespace URI
38+
-opw <string> : owner password (for encrypted files)
39+
-upw <string> : user password (for encrypted files)
40+
-filesLimit <int> : limit of asset files to be extracted
41+
-q : don't print any messages or errors
42+
-v : print version info
43+
-h : print usage information
44+
-help : print usage information
45+
--help : print usage information
46+
-? : print usage information
4747
```
4848

4949
In addition to the [ALTO](https://github.com/altoxml/documentation/wiki) file describing the PDF content, the following files are generated:
@@ -93,6 +93,12 @@ The executable `pdfalto` is generated in the root directory. Additionally, this
9393

9494
# Changes
9595

96+
New in version 0.3 (apart from various bug fixes):
97+
98+
- line number detection: line numbers (typically added for review in manuscripts/preprints) are now specifically identified and no longer mixed with the rest of the text content; they are grouped in a separate block or, optionally, omitted from the ALTO file (`-noLineNumbers` option)
99+
100+
- removal of the `-blocks` option: block information is now always returned to ensure ALTO validation (`<TextBlock>` element)
101+
96102
New in version 0.2 (apart from various bug fixes):
97103

98104
- support Unicode composition of characters

install_deps.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ DEP_INSTALL_DIR=install
1818

1919
LIBXML_URI=http://xmlsoft.org/sources/libxml2-2.9.8.tar.gz
2020
FREETYPE_URI=https://download.savannah.gnu.org/releases/freetype/freetype-2.9.tar.gz
21-
ICU_URI=http://download.icu-project.org/files/icu4c/62.1/icu4c-62_1-src.tgz
21+
#ICU_URI=http://download.icu-project.org/files/icu4c/62.1/icu4c-62_1-src.tgz
22+
ICU_URI=https://github.com/unicode-org/icu/releases/download/release-62-2/icu4c-62_2-src.tgz
23+
#ICU_URI=https://github.com/unicode-org/icu/releases/download/release-66-1/icu4c-66_1-src.tgz
2224

2325
mkdir -p $DEP_INSTALL_DIR
2426

src/Parameters.cc

+8-23
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ void Parameters::setDisplayText(GBool text) {
3434
unlockGlobalParams;
3535
}
3636

37-
void Parameters::setDisplayBlocks(GBool block) {
37+
/*void Parameters::setDisplayBlocks(GBool block) {
3838
lockGlobalParams;
3939
displayBlocks = block;
4040
unlockGlobalParams;
41-
}
41+
}*/
4242

4343
void Parameters::setDisplayOutline(GBool outl) {
4444
lockGlobalParams;
@@ -83,6 +83,12 @@ void Parameters::setOcr(GBool ocrA) {
8383
unlockGlobalParams;
8484
}
8585

86+
void Parameters::setNoLineNumbers(GBool noLineNumberAttrs) {
87+
lockGlobalParams;
88+
noLineNumbers = noLineNumberAttrs;
89+
unlockGlobalParams;
90+
}
91+
8692
void Parameters::saveToXML(const char *fileName,int firstPage,int lastPage){
8793
char* tmp;
8894
tmp=(char*)malloc(10*sizeof(char));
@@ -109,27 +115,6 @@ void Parameters::saveToXML(const char *fileName,int firstPage,int lastPage){
109115
xmlAddChild(tool,version);
110116
xmlAddChild(tool,desc);
111117

112-
// * -f <int> : first page to convert<br/>
113-
// * -l <int> : last page to convert<br/>
114-
// * -verbose : display pdf attributes<br/>
115-
// * -noText : do not extract textual objects<br/>
116-
// * -noImage : do not extract images (Bitmap and Vectorial)<br/>
117-
// * -noImageInline : do not include images inline in the stream<br/>
118-
// * -outline : create an outline file xml<br/>
119-
// * -annots : create an annotaitons file xml<br/>
120-
// * -cutPages : cut all pages in separately files<br/>
121-
// * -blocks : add blocks informations whithin the structure<br/>
122-
// * -readingOrder : blocks follow the reading order<br/>
123-
// * -fullFontName : fonts names are not normalized<br/>
124-
// * -nsURI : add the specified namespace URI<br/>
125-
// * -q : don't print any messages or errors<br/>
126-
// * -v : print copyright and version information<br/>
127-
// * -h : print usage information<br/>
128-
// * -help : print usage information<br/>
129-
// * --help : print usage information<br/>
130-
// * -? : print usage information<br/>
131-
132-
133118
param = xmlNewNode(NULL,(const xmlChar*)TAG_PAR_PARAM);
134119
xmlNewProp(param,(const xmlChar*)"name",(const xmlChar*)"first page");
135120
xmlNewProp(param,(const xmlChar*)"form",(const xmlChar*)"-f");

src/Parameters.h

+19-4
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class Parameters {
3131
/** Destructor */
3232
~Parameters();
3333

34+
// getters
35+
3436
/** Return a boolean which inform if the text is displayed
3537
* @return <code>true</code> if the toText option is selected, <code>false</code> otherwise
3638
*/
@@ -39,7 +41,7 @@ class Parameters {
3941
/** Return a boolean which inform if blocks informations are diplayed
4042
* @return <code>true</code> if the blocks option is selected, <code>false</code> otherwise
4143
*/
42-
GBool getDisplayBlocks() { return displayBlocks;};
44+
//GBool getDisplayBlocks() { return displayBlocks;};
4345

4446
/** Return a boolean which inform if the images are displayed
4547
* @return <code>true</code> if the noImage option is not selected, <code>false</code> otherwise
@@ -88,6 +90,13 @@ class Parameters {
8890
*/
8991
int getFilesCountLimit() {return filesCountLimit;}
9092

93+
/** Return a boolean which informs whether line number tokens are omitted from the output
94+
* @return <code>true</code> if the noLineNumbers option is selected, <code>false</code> otherwise
95+
*/
96+
GBool getNoLineNumbers() { return noLineNumbers;};
97+
98+
// setters
99+
91100
/** Modify the boolean which inform if the images are displayed
92101
* @param noImage <code>true</code> if the noImage option is not selected, <code>false</code> otherwise
93102
*/
@@ -101,7 +110,7 @@ class Parameters {
101110
/** Modify the boolean which inform if blocks informations are diplayed
102111
* @param noblock <code>true</code> if the blocks option is selected, <code>false</code> otherwise
103112
*/
104-
void setDisplayBlocks(GBool noblock);
113+
//void setDisplayBlocks(GBool noblock);
105114

106115
/** Modify the boolean which inform if the bookmark is displayed
107116
* @param outline <code>true</code> if the outline option is selected, <code>false</code> otherwise
@@ -140,6 +149,11 @@ class Parameters {
140149
void setOcr(GBool ocrA);
141150

142151
void setFilesCountLimit(int count);
152+
153+
/** Modify the boolean which informs whether line number tokens are omitted from the output
154+
* @param noLineNumberAttrs <code>true</code> if the noLineNumbers option is selected, <code>false</code> otherwise
155+
*/
156+
void setNoLineNumbers(GBool noLineNumberAttrs);
143157

144158
void saveToXML(const char *fileName,int firstPage,int lastPage);
145159

@@ -150,7 +164,7 @@ class Parameters {
150164
/** The value of the noText option */
151165
GBool displayText;
152166
/** The value of the blocks option */
153-
GBool displayBlocks;
167+
//GBool displayBlocks;
154168
/** The value of the outline option */
155169
GBool displayOutline;
156170
/** The value of the cutPages option */
@@ -167,7 +181,8 @@ class Parameters {
167181
GBool ocr;
168182
/** the count limit of files */
169183
int filesCountLimit;
170-
184+
/** PL: the value of the noLineNumbers option*/
185+
GBool noLineNumbers;
171186
};
172187

173188
#endif /*PARAMETERS_H_*/

0 commit comments

Comments
 (0)