text_util.cfm

/*
    Description: A collection of common utilities to clean up user entered text

    This is provided as-is under a basic Creative Commons license.
    No warranty is made or implied as to the effectiveness or applicability of this code to any use.

    Original Source: https://github.com/doover/cf_text_util
*/

/** stripWord
 *
 * Given a string, replace the typographic characters that MS Word likes to
 * insert with plain ASCII equivalents, then delete every character that is
 * still outside the 7-bit ASCII range.
 *
 * This function is used mostly by the cleanText() function, but is provided
 * as a separate function for optional use during saves.
 *
 * Characters replaced (decimal Unicode code points, as passed to chr()):
 *      160  - non-breaking space   -> regular space
 *      8220 - left double quote    -> "
 *      8221 - right double quote   -> "
 *      8216 - left single quote    -> '
 *      8217 - right single quote   -> '
 *      8211 - en dash              -> -
 *      8212 - em dash              -> -
 *      8226 - bullet               -> *
 *      8230 - ellipsis             -> ...
 *
 * Anything else above code point 127 (accented letters, symbols, emoji, ...)
 * is removed, not transliterated.
 *
 * @text	string to strip
 * @returns	Trimmed string containing only ASCII characters
 */
string function stripWord(required string text) {
    var returnValue = trim(arguments.text);

    // [code point, ASCII replacement] pairs, applied in the order listed.
    // The non-breaking space must be mapped to a real space here: otherwise the
    // ASCII filter below deletes it and the two words around it run together.
    var replacements = [
        [160, ' '], // non-breaking space
        [8220, '"'], // left quotes
        [8221, '"'], // right quotes
        [8216, "'"], // left '
        [8217, "'"], // right '
        [8211, '-'], // en dash
        [8212, '-'], // em dash
        [8226, '*'], // bullet
        [8230, '...'] // ellipsis
    ];

    for (var pair in replacements) {
        returnValue = replace(returnValue, chr(pair[1]), pair[2], 'all');
    }

    // now strip everything outside of the "normal ASCII" range (code points 0 - 127)
    returnValue = reReplace(returnValue, '[^\x00-\x7F]', '', 'all');

    // trim again: a replacement or a removal may have exposed leading/trailing whitespace
    return trim(returnValue);
}; // stripWord


/**
 * cleanText
 *
 * Clean and format a string for display on the page.
 *
 *  1. Run stripWord() to convert MS Word characters and drop non-ASCII characters.
 *
 *  2. If maxLength is greater than 0 and the string is longer than maxLength, cut it
 *     to maxLength - 1 characters and remember that an ellipsis must be appended.
 *     The cut is made on the raw text, before any encoding.
 *
 *  3. Run the CF function encodeForHTML() so HTML and other special characters are
 *     replaced with their escaped values. After this step the string contains only
 *     screen-ready clean text and escaped special characters.
 *
 *  4. Selectively undo the escaping to give the user some formatting options:
 *       - escaped tabs become four spaces, escaped non-breaking-space entities are restored
 *       - line breaks (CRLF or LF) become BR tags; BR tags typed by the user are removed
 *       - escaped p, strong, em, u, s, sup, sub, blockquote, ol, ul and li tags are
 *         replaced with real HTML (via findAndReplaceEscapedTag)
 *       - BR tags generated directly after ul, ol, li and blockquote tags are removed
 *
 *  5. If links_ok is true, un-escape URLs and anchor tags and turn bare URLs into links.
 *
 *  6. If the string was cut in step 2, append an HTML ellipsis entity to the end.
 *
 * 	@text		String to clean
 *  @maxLength	If greater than 0, cut the text to fit this many characters and append an ellipsis
 *  @links_ok   If true, links will not be escaped - BE VERY CAREFUL WITH THIS ON USER GENERATED CONTENT!!!
 *  @returns	Encoded and re-coded string
 */
string function cleanText(required string text, numeric maxLength = 0, boolean links_ok = false) {
    var returnValue = trim(arguments.text);
    var dotrimlength = false;

    // strip out word characters for ", ' and others
    returnValue = stripWord(returnValue);

    // trim string before further processing. make note we need to add the ellipsis later
    if (
        arguments.maxLength GT 0
        AND len(returnValue) GT arguments.maxLength
    ) {
        // one character is reserved for the ellipsis appended at the end
        var keepLength = arguments.maxLength - 1;
        // left() requires a positive count, so a maxLength of 1 keeps no text at all
        returnValue = keepLength GTE 1 ? left(returnValue, keepLength) : '';
        dotrimlength = true;
    }

    // now "clean" the string
    returnValue = encodeForHTML(returnValue, true); // encode

    // start decoding select strings
    returnValue = reReplace(returnValue, '&##x9;', '    ', 'all'); // replace escaped tabs
    returnValue = reReplace(
        returnValue,
        '&amp;nbsp&##x3b;',
        '&nbsp;',
        'all'
    ); // replace the escaped NBSP with unescaped NBSP

    // replace character returns (escaped \r\n or a lone escaped \n) with BRs.
    // The replacement string deliberately ends with a real line break.
    returnValue = reReplace(
        returnValue,
        '(&##xd;&##xa;|&##xa;)',
        '<br />
',
        'all'
    ); // replace character returns

    // We replaced \n above, so remove any dangling escaped BRs typed by the user.
    // The slash is optional so that <br>, <br/> and <br /> are all removed.
    returnValue = reReplaceNoCase(
        returnValue,
        '&lt;br ?(&##x2f;)?&gt;',
        '',
        'all'
    );

    // Replace tags escaped with unescaped versions of the tag
    // whitelist tags
    returnValue = findAndReplaceEscapedTag(returnValue, 'p'); // paragraph
    returnValue = findAndReplaceEscapedTag(returnValue, 'strong'); // bolding
    returnValue = findAndReplaceEscapedTag(returnValue, 'em'); // italics
    returnValue = findAndReplaceEscapedTag(returnValue, 'u'); // underline
    returnValue = findAndReplaceEscapedTag(returnValue, 's'); // strikethru
    returnValue = findAndReplaceEscapedTag(returnValue, 'sup'); // superscript
    returnValue = findAndReplaceEscapedTag(returnValue, 'sub'); // subscript
    returnValue = findAndReplaceEscapedTag(returnValue, 'blockquote'); // blockquote
    returnValue = findAndReplaceEscapedTag(returnValue, 'ol'); // OL
    returnValue = findAndReplaceEscapedTag(returnValue, 'ul'); // UL
    returnValue = findAndReplaceEscapedTag(returnValue, 'li'); // LI

    // clean up weird cases
    /* 	The earlier \n -> <br /> creates weird effects
    In most text blocks, after a block element (UL, OL, LI, BLOCKQUOTE), the
    text will include a line feed.

    But the \n -> <br /> will cause a BR to be put after a block element
    causing a usually unwanted extra line-feed.

    So we need to go back and remove spurious BRs after block elements.

    code hint: the patterns start at the tag name (XX>) so they match both <xx> and </xx>
    */
    returnValue = reReplace(returnValue, 'ul><br />', 'ul>', 'all');
    returnValue = reReplace(returnValue, 'ol><br />', 'ol>', 'all');
    returnValue = reReplace(returnValue, 'li><br />', 'li>', 'all');
    returnValue = reReplace(
        returnValue,
        'blockquote><br />',
        'blockquote>',
        'all'
    );

    // do we need to unescape links?
    // SECURITY: everything between "<a href" and ">" is un-escaped as-is, so
    // attributes and URL schemes supplied by the author end up in the page.
    // Only enable links_ok for trusted content.
    if (arguments.links_ok) {
        /*
        first decanonicalize any URLs
        Find anything that starts with http(s) and decanonicalize it
        Be careful to not be greedy, stop at an escaped ", an escaped <, an escaped ',
        whitespace or the end of the string

        NOTE(review): the callbacks below read transform.matches; confirm that this is
        the shape of the first callback argument on the CFML engine in use.
        */
        returnValue = reReplaceNoCase(
            returnValue,
            'https?&##x3a;&##x2f;&##x2f;(.*?(?=(&quot;|&lt;|&##x27;|\s|$)))',
            function(transform, position, original, count) {
                return deCanonicalize(transform.matches);
            },
            'all'
        );

        /*
        find everything starting with <a href and ending with </a>
        be careful to not be greedy so it only gets the actual <a....>...</a> and nothing more

        start with searching for strings that start with an escaped <a href
        then lazily take everything up to the first escaped >
        then lazily take the link text up to the next escaped <
        which must be the start of an escaped </a>
        */
        returnValue = reReplaceNoCase(
            returnValue,
            '&lt;a href(.*?(?=&gt;))&gt;(.*?(?=&lt;))&lt;&##x2f;a&gt;',
            function(transform, position, original, count) {
                return deCanonicalize(transform.matches);
            },
            'all'
        );

        /*
        Now look for "bare" URLs. The URL must either start the string or have whitespace
        before it (so the href="" values of the anchors restored above are not matched)
        if found, replace with a formatted URL
        (make sure to grab the preceding character and put it back)
        */
        returnValue = reReplaceNoCase(
            returnValue,
            '(^|\s)(https?://[^\s<]+)',
            '\1<a href="\2">\2</a>',
            'all'
        );
    }; // links ok?

    // if we trimmed earlier, add the trailing ellipsis
    if (dotrimlength) {
        returnValue = returnValue & '&hellip;';
    }

    return returnValue;
}; // cleanText


/**
 *  findAndReplaceEscapedTag
 *
 *	private function used by cleanText
 *  Given an HTML tag name, turn every escaped, attribute-less opening and closing
 *  occurrence of that tag in the string back into real HTML (matched case-insensitively,
 *  written back in the case of testElement). If the result then holds more opening
 *  tags than closing tags, the missing closing tags are appended to the end of the string.
 *
 * @testString   String to fix
 * @testElement  element to look for in testString
 * @CFLintIgnore VAR_IS_TEMPORARY,ARGUMENT_IS_TEMPORARY
 */
private string function findAndReplaceEscapedTag(required string testString, required string testElement) {
    var tagName = arguments.testElement;
    var openTag = '<#tagName#>';
    var closeTag = '</#tagName#>';

    // un-escape the opening tags first, then the closing tags (whose slash is escaped too)
    var fixed = reReplaceNoCase(arguments.testString, '&lt;#tagName#&gt;', openTag, 'all');
    fixed = reReplaceNoCase(fixed, '&lt;&##x2f;#tagName#&gt;', closeTag, 'all');

    // count real opening tags; the name must be followed by a space or ">" so
    // that a short name such as "s" is not counted inside "<strong>" or "<sup>"
    var openCount = arrayLen(reMatch('<#tagName#[ >]', fixed));
    var closeCount = arrayLen(reMatch(closeTag, fixed));
    var unclosed = openCount - closeCount;

    // balanced (or more closers than openers): nothing to add
    if (unclosed LTE 0) {
        return fixed;
    }

    // close every opening tag that was left open
    return fixed & repeatString(closeTag, unclosed);
}; // private findAndReplaceEscapedTag

/**
 * Given a text string, turn a fixed set of escaped characters back into the
 * literal characters: < > " ' / : = and ?
 * Focused on cleaning up <a href="URL">text</a> type situations.
 *
 * Matching is case-sensitive and limited to the exact spellings listed in the
 * function body; any other entity (for example an escaped ampersand) is left untouched.
 *
 * @text text to work on
 */
private string function deCanonicalize(required string text) {
    // [escaped form, literal character] pairs, applied in this order
    var entityMap = [
        ['&lt;', '<'],
        ['&gt;', '>'],
        ['&quot;', '"'],
        ['&##x27;', "'"],
        ['&##x2f;', '/'],
        ['&##x3a;', ':'],
        ['&##x3d;', '='],
        ['&##x3f;', '?']
    ];
    var decoded = arguments.text;

    for (var entry in entityMap) {
        decoded = replace(decoded, entry[1], entry[2], 'all');
    }

    return decoded;
}; // deCanonicalize