-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathstriprtf.py
143 lines (134 loc) · 6.44 KB
/
striprtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
"""
FROM: https://gist.github.com/miku/e714a94cd72de3d6bb3dd9efc4662431
From: http://stackoverflow.com/questions/188545/regular-expression-for-extracting-text-from-an-rtf-string
$ python striprtf.py fixtures/field1.rtf



386r-389v: , 

(386r) Inc. ...
$ unrtf --text fixtures/field1.rtf
b;;;;;;;;;;;;;;;;;;


386r-389v: Grammatikalische Quaestio, 

(386r) Inc. QVERITVR VTRUM ADIECTIVVM POSSET SVBstantiuari [?]. Et arguitur primo, quod ...
"""
import fileinput
import re
from six import unichr
def striprtf(text):
pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
# control words which specify a "destionation".
destinations = frozenset((
'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
'listoverridetable','listpicture','liststylename','listtable','listtext',
'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
'svb','tc','template','themedata','title','txe','ud','upr','userprops',
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
'xmlopen',
))
# Translation of some special characters.
specialchars = {
'par': '\n',
'sect': '\n\n',
'page': '\n\n',
'line': '\n',
'tab': '\t',
'emdash': u'\u2014',
'endash': u'\u2013',
'emspace': u'\u2003',
'enspace': u'\u2002',
'qmspace': u'\u2005',
'bullet': u'\u2022',
'lquote': u'\u2018',
'rquote': u'\u2019',
'ldblquote': u'\201C',
'rdblquote': u'\u201D',
}
stack = []
ignorable = False # Whether this group (and all inside it) are "ignorable".
ucskip = 1 # Number of ASCII characters to skip after a unicode character.
curskip = 0 # Number of ASCII characters left to skip
out = [] # Output buffer.
for match in pattern.finditer(text):
word,arg,hex,char,brace,tchar = match.groups()
if brace:
curskip = 0
if brace == '{':
# Push state
stack.append((ucskip,ignorable))
elif brace == '}':
# Pop state
ucskip,ignorable = stack.pop()
elif char: # \x (not a letter)
curskip = 0
if char == '~':
if not ignorable:
out.append(u'\xA0')
elif char in '{}\\':
if not ignorable:
out.append(char)
elif char == '*':
ignorable = True
elif word: # \foo
curskip = 0
if word in destinations:
ignorable = True
elif ignorable:
pass
elif word in specialchars:
out.append(specialchars[word])
elif word == 'uc':
ucskip = int(arg)
elif word == 'u':
c = int(arg)
if c < 0: c += 0x10000
if c > 127: out.append(unichr(c))
else: out.append(chr(c))
curskip = ucskip
elif hex: # \'xx
if curskip > 0:
curskip -= 1
elif not ignorable:
c = int(hex,16)
if c > 127: out.append(unichr(c))
else: out.append(chr(c))
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
out.append(tchar)
return ''.join(out)
if __name__ == '__main__':
cleaned = striprtf(''.join((line.decode('utf-8') for line in fileinput.input())))
print(cleaned.encode('utf-8'))