-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_util.cfm
277 lines (247 loc) · 10.4 KB
/
text_util.cfm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
/*
Description: A collection of common utilities to clean up user entered text
This is provided as-is under a basic Creative Commons license.
No warranty is made or implied as to the effectiveness or applicability of this code to any use.
Original Source: https://github.com/doover/cf_text_util
*/
/** stripWord
*
* Swap the "smart" typography that MS Word inserts for plain ASCII stand-ins,
* then drop every character that is still outside the 7-bit ASCII range.
*
* cleanText() calls this first, but it is exposed separately so it can also be
* applied on its own (for example while saving user input).
*
* Unicode code points replaced:
* 8220 - left double quote   -> "
* 8221 - right double quote  -> "
* 8216 - left single quote   -> '
* 8217 - right single quote  -> '
* 8211 - en dash             -> -
* 8212 - em dash             -> -
* 8226 - bullet              -> *
* 8230 - ellipsis            -> ...
*
* @text string to strip
* @returns The trimmed string containing only ASCII characters
*/
string function stripWord(required string text) {
    // [code point, ASCII stand-in] pairs, applied in this order
    var substitutions = [
        [8220, '"'], // left quotes
        [8221, '"'], // right quotes
        [8216, ''''], // left '
        [8217, ''''], // right '
        [8211, '-'], // en dash
        [8212, '-'], // em dash
        [8226, '*'], // bullet
        [8230, '...'] // ellipsis
    ];
    var cleaned = trim(arguments.text);
    for (var pair in substitutions) {
        cleaned = replace(cleaned, chr(pair[1]), pair[2], 'all');
    }
    // whatever is still outside ASCII 0 - 127 is removed outright
    cleaned = reReplace(cleaned, '[^\x00-\x7F]', '', 'all');
    // trim again: removing characters can expose leading/trailing whitespace
    return trim(cleaned);
} // stripWord
/**
* cleanText
*
* Clean and format a user-entered string for display on an HTML page.
*
* Processing order:
* 1. stripWord() replaces MS Word typography and removes non-ASCII characters.
* 2. If maxLength is greater than 0 and the text is longer than maxLength, the
*    text is cut to maxLength - 1 characters and an ellipsis is appended at the
*    very end of processing.
* 3. encodeForHTML(string, true) escapes HTML and other special characters, so
*    the string contains only escaped, screen-ready text.
* 4. Selected escaped sequences are turned back into real markup so the user
*    keeps some formatting: tabs, non-breaking spaces, line breaks (as BR tags)
*    and the whitelisted tags p, strong, em, u, s, sup, sub, blockquote, ol, ul
*    and li.
* 5. BR tags that directly follow a ul, ol, li or blockquote tag are removed.
* 6. If links_ok is true, escaped URLs and escaped anchor tags are un-escaped
*    and bare URLs are wrapped in anchor tags.
* 7. If the text was cut in step 2, the HTML entity for an ellipsis is appended.
*
* @text String to clean
* @maxLength If greater than 0, cut the text to fit this length and append an ellipsis
* @links_ok If true, links will not be escaped - BE VERY CAREFUL WITH THIS ON USER GENERATED CONTENT!!!
* @returns Encoded and selectively re-coded string
*/
string function cleanText(required string text, numeric maxLength = 0, boolean links_ok = false) {
    var returnValue = trim(arguments.text);
    var dotrimlength = false;

    // strip out Word characters for ", ' and others
    returnValue = stripWord(returnValue);

    // cut the string before further processing; remember that the ellipsis is still owed
    if (arguments.maxLength GT 0 AND len(returnValue) GT arguments.maxLength) {
        // one character of the budget is reserved for the ellipsis.
        // left() is documented to need a positive count, so maxLength = 1 keeps no text at all.
        returnValue = arguments.maxLength GT 1 ? left(returnValue, arguments.maxLength - 1) : '';
        dotrimlength = true;
    }

    // now "clean" the string: everything below works on the ESCAPED text
    returnValue = encodeForHTML(returnValue, true);

    // start decoding select strings
    returnValue = reReplace(returnValue, '&##x9;', ' ', 'all'); // escaped tab -> space
    // escaped NBSP entity (the encoder turns "&" into "&amp;" and ";" into "&#x3b;") -> real NBSP entity
    returnValue = reReplace(returnValue, '&amp;nbsp&##x3b;', '&nbsp;', 'all');

    // replace line endings (&#xd;&#xa; = \r\n, &#xa; = \n) with a BR tag plus a real line break.
    // The closing quote sits at column 0 on purpose: anything before it would become part of the string.
    returnValue = reReplace(
        returnValue,
        '(&##xd;&##xa;|&##xa;)',
        '<br />
',
        'all'
    );

    // line breaks are now BR tags, so drop any BR tags the user typed (they arrive escaped)
    returnValue = reReplaceNoCase(returnValue, '&lt;br ?&##x2f;&gt;', '', 'all');

    // whitelist: turn these escaped tags back into real tags
    returnValue = findAndReplaceEscapedTag(returnValue, 'p'); // paragraph
    returnValue = findAndReplaceEscapedTag(returnValue, 'strong'); // bolding
    returnValue = findAndReplaceEscapedTag(returnValue, 'em'); // italics
    returnValue = findAndReplaceEscapedTag(returnValue, 'u'); // underline
    returnValue = findAndReplaceEscapedTag(returnValue, 's'); // strikethru
    returnValue = findAndReplaceEscapedTag(returnValue, 'sup'); // superscript
    returnValue = findAndReplaceEscapedTag(returnValue, 'sub'); // subscript
    returnValue = findAndReplaceEscapedTag(returnValue, 'blockquote'); // blockquote
    returnValue = findAndReplaceEscapedTag(returnValue, 'ol'); // OL
    returnValue = findAndReplaceEscapedTag(returnValue, 'ul'); // UL
    returnValue = findAndReplaceEscapedTag(returnValue, 'li'); // LI

    /*
    The earlier \n -> <br /> conversion puts a BR after block elements, because
    text usually has a line feed after them. That shows up as an unwanted extra
    blank line, so remove a BR that directly follows one of these tags.
    The patterns start at the tag name ("ul>") so they match both <ul> and </ul>.
    */
    returnValue = reReplace(returnValue, 'ul><br />', 'ul>', 'all');
    returnValue = reReplace(returnValue, 'ol><br />', 'ol>', 'all');
    returnValue = reReplace(returnValue, 'li><br />', 'li>', 'all');
    returnValue = reReplace(returnValue, 'blockquote><br />', 'blockquote>', 'all');

    // do we need to unescape links?
    if (arguments.links_ok) {
        /*
        NOTE(review): this branch restores quotes, "=" and angle brackets inside
        user-supplied anchor tags and does not filter attributes or URL schemes.
        Only enable links_ok for content you trust.

        NOTE(review): both callbacks hand transform.matches to deCanonicalize(),
        which is declared to take a string - confirm the shape of "transform"
        on the CFML engine in use.
        */

        // First de-canonicalize any URL: starts with escaped http(s)://, and
        // (non-greedy) stops before an escaped ", an escaped <, an escaped ',
        // whitespace or the end of the string.
        returnValue = reReplaceNoCase(
            returnValue,
            'https?&##x3a;&##x2f;&##x2f;(.*?(?=(&quot;|&lt;|&##x27;|\s|$)))',
            function(transform, position, original, count) {
                return deCanonicalize(transform.matches);
            },
            'all'
        );

        // Then un-escape whole anchors: escaped "<a href", anything up to the
        // first escaped ">", link text up to the next escaped "<", and the
        // escaped "</a>". Non-greedy so only one anchor is taken per match.
        returnValue = reReplaceNoCase(
            returnValue,
            '&lt;a href(.*?(?=&gt;))&gt;(.*?(?=&lt;))&lt;&##x2f;a&gt;',
            function(transform, position, original, count) {
                return deCanonicalize(transform.matches);
            },
            'all'
        );

        // Finally wrap "bare" URLs in an anchor. The URL must start the string or
        // follow whitespace (so the href="..." values produced above are skipped);
        // the preceding character is captured and put back.
        returnValue = reReplaceNoCase(
            returnValue,
            '(^|\s)(https?://[^\s<]+)',
            '\1<a href="\2">\2</a>',
            'all'
        );
    } // links ok?

    // if we cut the text earlier, add the trailing ellipsis as an ASCII-safe entity
    if (dotrimlength) {
        returnValue = returnValue & '&hellip;';
    }
    return returnValue;
} // cleanText
/**
* findAndReplaceEscapedTag
*
* private function used by cleanText
* Given an HTML tag name, turn every escaped opening and closing form of that
* tag (as produced by encodeForHTML) back into a real tag, then append closing
* tags if there are more opening tags than closing tags.
*
* Only the bare forms are restored: an opening tag with attributes stays escaped.
* testElement is placed into the regular expressions as-is, so it must be a
* plain tag name.
*
* @testString String to fix (already HTML-encoded)
* @testElement tag name to look for in testString, e.g. "p" or "strong"
* @returns testString with the escaped tag restored and any unclosed tags closed
* @CFLintIgnore VAR_IS_TEMPORARY,ARGUMENT_IS_TEMPORARY
*/
private string function findAndReplaceEscapedTag(required string testString, required string testElement) {
    var openTag = '<#arguments.testElement#>';
    var closeTag = '</#arguments.testElement#>';

    // escaped opening tags: &lt;tag&gt; -> <tag>
    var result = reReplaceNoCase(
        arguments.testString,
        '&lt;#arguments.testElement#&gt;',
        openTag,
        'all'
    );
    // escaped closing tags: &lt;&#x2f;tag&gt; -> </tag>   (&#x2f; = /)
    result = reReplaceNoCase(
        result,
        '&lt;&##x2f;#arguments.testElement#&gt;',
        closeTag,
        'all'
    );

    // make sure every opening tag is closed.
    // The opening tag must be followed by a space or ">" so that "s" does not
    // also count "<strong>" or "<sup>".
    var unclosed = arrayLen(reMatch('<#arguments.testElement#[ >]', result))
        - arrayLen(reMatch(closeTag, result));
    if (unclosed GT 0) {
        // on a mismatch, append the missing closing tags at the end
        result = result & repeatString(closeTag, unclosed);
    }
    return result;
} // private findAndReplaceEscapedTag
/**
* deCanonicalize
*
* private function used by cleanText
* Given an HTML-encoded text string, de-canonicalize it to some extent: turn a
* fixed set of escaped characters back into the real characters.
* Focused on cleaning up <a href="URL">text</a> type situations.
*
* Restored: &lt; &gt; &quot; and the hex entities for ' / : = ?
* Everything else (including &amp;) is left escaped.
*
* NOTE(review): the result contains live markup characters; callers must only
* pass text they are willing to emit as HTML.
*
* @text text to work on
* @returns text with the listed entities replaced by their characters
*/
private string function deCanonicalize(required string text) {
    var working = arguments.text;
    working = replace(working, '&lt;', '<', 'all'); // replace <
    working = replace(working, '&gt;', '>', 'all'); // replace >
    working = replace(working, '&quot;', '"', 'all'); // replace "
    working = replace(working, '&##x27;', '''', 'all'); // replace '
    working = replace(working, '&##x2f;', '/', 'all'); // replace slashes
    working = replace(working, '&##x3a;', ':', 'all'); // replace colon
    working = replace(working, '&##x3d;', '=', 'all'); // replace =
    working = replace(working, '&##x3f;', '?', 'all'); // replace ?
    return working;
} // deCanonicalize