Skip to content

Commit 75aa42c

Browse files
committed
- example-extracttext now outputs unicode
1 parent e3da8ad commit 75aa42c

File tree

1 file changed

+8
-10
lines changed

1 file changed

+8
-10
lines changed

example-extracttext.py

+8-10
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,19 @@
1212
if __name__ == '__main__':
1313
try:
1414
document = opendocx(sys.argv[1])
15+
newfile = open(sys.argv[2],'w')
1516
except:
16-
print('Please supply a filename. For example:')
17-
print(''' example-extracttext.py 'My Office 2007 document.docx' ''')
17+
print('Please supply an input and output file. For example:')
18+
print(''' example-extracttext.py 'My Office 2007 document.docx' 'outputfile.txt' ''')
1819
exit()
1920
## Fetch all the text out of the document we just opened
2021
paratextlist = getdocumenttext(document)
2122

22-
# Note that if using shell redirection (&>, 1>, 2>, etc.) Python tries to
23-
# change the unicode into ASCII and fails - even with a UTF-8 $LANG
24-
# As a workaround, create our own ASCII copy of the list.
25-
asciiparatextlist = []
23+
# Make explicit unicode version
24+
newparatextlist = []
2625
for paratext in paratextlist:
27-
asciiparatextlist.append(paratext.encode("ascii", "backslashreplace"))
26+
newparatextlist.append(paratext.encode("utf-8"))
2827

2928
## Print our document's text with two newlines under each paragraph
30-
print '\n\n'.join(paratextlist)
31-
32-
29+
newfile.write('\n\n'.join(newparatextlist))
30+
#print '\n\n'.join(newparatextlist)

0 commit comments

Comments
 (0)