Skip to content

Commit

Permalink
updated the no links version to use lynx as well
Browse files Browse the repository at this point in the history
  • Loading branch information
uriel1998 committed Oct 5, 2024
1 parent f219095 commit bc8131a
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 11 deletions.
55 changes: 45 additions & 10 deletions renderer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,62 @@
##############################################################################


## may consider having it output as gfm and then having rich do it with -m --hyperlinks

#!/bin/sh

##############################################################################
#
# My renderer for newsboat/newsbeuter/mutt
# (c) Steven Saus 2024
# Licensed under the MIT license
#
##############################################################################

# For reference:

# using pup to clean it up, then to select divs that are supposed to be hidden and storing them in a variable.
# then taking the input, and removing those divs (yay grep)
# removing empty divs
# removing images (seriously, it's a MESS otherwise -- perhaps show those links at the bottom later?)
# changing em and strong tags to a UTF character for highlighting regex matches in neomutt/newsbeuter. (It doesn't work multiline, but hey)
# (also, regular markdown characters don't work -- noted here so I don't forget AGAIN and try them AGAIN)
# adding a br after each table row so it doesn't become collapsed and we have SOME whitespace
# considering -nonumbers to clean up the body text, but...

if [ $# -eq 0 ]; then
# no arguments passed, use stdin
input=$(cat)
echo "${input}" | sed -e 's/<img[^>]*>//g' | sed -e 's/<div[^>]*>//g' | sed -e 's/<!-- -->//g'| sed -e 's/<em[^>]*>/⬞/g' | sed -e 's/<\/em>/⬞/g' | sed -e 's/<strong[^>]*>/⬞/g' | sed -e 's/<\/strong>/⬞/g' | hxclean | hxnormalize -e -L -s 2>/dev/null | hxunent | iconv -t utf-8//TRANSLIT - | elinks -dump -no-references -no-numbering -dump-charset UTF-8 -dump-width 140 | grep -v "READ MORE:"
# echo "<html><body>" > /home/steven/reference_article.txt
# echo $input | sed -e 's/<img[^>]*>//g' | sed -e 's/<div[^>]*>//g' | sed -e 's/<!-- -->//g' | hxunent | iconv -t utf-8//TRANSLIT - >> /home/steven/reference_article.txt
# echo "</body></html>" >> /home/steven/reference_article.txt
else
antimatch=""
antimatch=$(echo "${input}" | pup 'div[style*="display: none;"],div[style*="display:none;"], div[style*="visibility: hidden;"], div[style*="overflow: hidden;"]')
if [ "$antimatch" != "" ];then
echo " "
echo "${input}" | pup | grep -vF "${antimatch}" | sed -e 's/<div[^>]*>//g' | sed 's/<img[^>]\+>//g' | sed -e 's/<!-- -->//g'| sed -e 's/<em[^>]*>/⬞/g' | sed -e 's/<\/em>/⬞/g' | sed -e 's/<strong[^>]*>/⬞/g' | sed -e 's/<\/strong>/⬞/g' | sed -e 's/<\/tr>/<\/tr><br \/>/g'| hxclean | hxnormalize -e -L -s 2>/dev/null | hxunent | lynx -dump -stdin -assume_charset=UTF-8 -force_empty_hrefless_a -hiddenlinks=ignore -html5_charsets -dont_wrap_pre -nolist -width=140 -collapse_br_tags | grep -v "READ MORE:"
else
echo " "
echo "${input}" | pup | sed -e 's/<div[^>]*>//g' | sed 's/<img[^>]\+>//g' | sed -e 's/<!-- -->//g'| sed -e 's/<em[^>]*>/⬞/g' | sed -e 's/<\/em>/⬞/g' | sed -e 's/<strong[^>]*>/⬞/g' | sed -e 's/<\/strong>/⬞/g' | sed -e 's/<\/tr>/<\/tr><br \/>/g'| hxclean | hxnormalize -e -L -s 2>/dev/null | hxunent | lynx -dump -stdin -assume_charset=UTF-8 -force_empty_hrefless_a -hiddenlinks=ignore -html5_charsets -dont_wrap_pre -nolist -width=140 -collapse_br_tags | grep -v "READ MORE:"
fi
else
# it's a URL, pass to elinks directly
if [ $(echo "${1}" | grep -c http) -gt 0 ];then
elinks "${1}" -dump -no-references -no-numbering -dump-charset UTF-8 -dump-width 140
elinks "${1}" -dump -dump-charset UTF-8 -dump-width 140
exit
fi
# it's a file, parse it this way
if [ -f "${1}" ];then
input=$(cat "${1}")
echo $input | sed -e 's/<img[^>]*>//g' | sed -e 's/<div[^>]*>//g' | hxclean | hxnormalize -e -L -s 2>/dev/null | tidy -quiet -omit -clean 2>/dev/null | hxunent | iconv -t utf-8//TRANSLIT - | elinks -dump -no-references -no-numbering -dump-charset UTF-8 -dump-width 140
# this is to deal with stupid hidden divs in HTML emails
# should only be invoked when hidden divs are actually present; otherwise it craps out the whole thing.
echo "${input}" > /home/steven/tmp/shit.txt
antimatch=""
antimatch=$(echo "${input}" | pup 'div[style*="display: none;"],div[style*="display:none;"],div[style*="visibility: hidden;"],div[style*="overflow: hidden;"]')
if [ "$antimatch" != "" ];then
echo " "
echo "${input}" | pup | grep -vF "${antimatch}" | sed -e 's/<div[^>]*>//g' | sed 's/<img[^>]\+>//g' | sed -e 's/<!-- -->//g'| sed -e 's/<em[^>]*>/⬞/g' | sed -e 's/<\/em>/⬞/g' | sed -e 's/<strong[^>]*>/⬞/g' | sed -e 's/<\/strong>/⬞/g' | sed -e 's/<\/tr>/<\/tr><br \/>/g' | hxclean | hxnormalize -e -L -s 2>/dev/null | hxunent | lynx -dump -stdin -assume_charset=UTF-8 -force_empty_hrefless_a -underscore -hiddenlinks=ignore -html5_charsets -dont_wrap_pre -nolist -width=140 | grep -v "READ MORE:"
else
echo " "
echo "${input}" | pup | sed -e 's/<div[^>]*>//g' | sed 's/<img[^>]\+>//g' | sed -e 's/<!-- -->//g'| sed -e 's/<em[^>]*>/⬞/g' | sed -e 's/<\/em>/⬞/g' | sed -e 's/<strong[^>]*>/⬞/g' | sed -e 's/<\/strong>/⬞/g' | sed -e 's/<\/tr>/<\/tr><br \/>/g'| hxclean | hxnormalize -e -L -s 2>/dev/null | hxunent | lynx -dump -stdin -assume_charset=UTF-8 -force_empty_hrefless_a -hiddenlinks=ignore -html5_charsets -dont_wrap_pre -nolist -width=140 -collapse_br_tags | grep -v "READ MORE:"
fi
fi

fi



1 change: 0 additions & 1 deletion renderer_links.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ if [ $# -eq 0 ]; then
echo "${input}" | pup | sed -e 's/<div[^>]*>//g' | sed 's/<img[^>]\+>//g' | sed -e 's/<!-- -->//g'| sed -e 's/<em[^>]*>/⬞/g' | sed -e 's/<\/em>/⬞/g' | sed -e 's/<strong[^>]*>/⬞/g' | sed -e 's/<\/strong>/⬞/g' | sed -e 's/<\/tr>/<\/tr><br \/>/g'| hxclean | hxnormalize -e -L -s 2>/dev/null | hxunent | lynx -dump -stdin -assume_charset=UTF-8 -force_empty_hrefless_a -hiddenlinks=ignore -html5_charsets -dont_wrap_pre -width=140 -collapse_br_tags | grep -v "READ MORE:"
fi
fi

fi


0 comments on commit bc8131a

Please sign in to comment.