Skip to content

Commit

Permalink
Update RSS Sources and Filters
Browse files Browse the repository at this point in the history
Added new relevant RSS sources, removed obsolete ones, and extended the banned-word filters to improve detection of cyber trends. The 'remove_banned_words' function has been updated to remove domain names, special characters, and version numbers (in the x.x.x and vx.x.x formats).
  • Loading branch information
NEZRI Ygal authored and NEZRI Ygal committed May 3, 2024
1 parent f6f8d67 commit b340d98
Show file tree
Hide file tree
Showing 4 changed files with 294 additions and 65 deletions.
17 changes: 17 additions & 0 deletions Watcher/Watcher/threats_watcher/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,23 @@ def remove_banned_words():
# Remove '/' (regular expressions sometimes don't catch this character)
word = word.replace("/", "")

# Remove domain name
domain_extensions = [
".com", ".org", ".net", ".edu", ".gov", ".mil",
".biz", ".info", ".name", ".pro", ".coop", ".museum", ".aero", ".int", ".jobs", ".mobi", ".tel", ".travel",
".fr", ".uk", ".de", ".jp", ".cn", ".it", ".us", ".es", ".ca", ".au", ".nl", ".ru", ".br", ".pl", ".in", ".be", ".ch", ".se", ".mx", ".at", ".dk", ".no", ".fi", ".ie", ".nz", ".sg", ".hk", ".my", ".za", ".ar", ".tw", ".kr", ".vn", ".tr", ".ua", ".gr", ".pt", ".cz", ".hu", ".cl", ".ro", ".id", ".il", ".co", ".ae", ".th", ".sk", ".bg", ".ph", ".hr", ".lt", ".si", ".lv", ".ee", ".rs", ".is", ".ir", ".sa", ".pe", ".ma", ".by", ".gt", ".do", ".ng", ".cr", ".ve", ".ec", ".py", ".sv", ".hn", ".pa", ".bo", ".kz", ".lu", ".uy", ".dz", ".uz", ".ke", ".np", ".kh", ".zm", ".ug", ".cy", ".mm", ".et", ".ni", ".al", ".kg", ".bd", ".tn", ".np", ".la", ".gh", ".iq", ".bj", ".gm", ".tg", ".lk", ".jo", ".zw", ".sn", ".km", ".mw", ".md", ".mr", ".tn", ".bf", ".bi", ".sc", ".er", ".sl", ".cf", ".ss", ".td", ".cg", ".gq", ".dj", ".rw", ".so", ".ne", ".yt", ".re", ".pm", ".wf", ".tf", ".gs", ".ai", ".aw", ".bb", ".bm", ".vg", ".ky", ".fk", ".fo", ".gl", ".gp", ".gg", ".gi", ".je", ".im", ".mq", ".ms", ".nc", ".pf", ".pn", ".sh", ".sb", ".gs", ".tc", ".tk", ".vg", ".vi", ".um", ".cx", ".cc", ".ac", ".eu", ".ad", ".ax", ".gg", ".gi", ".im", ".je", ".mc", ".me", ".sm", ".va", ".rs", ".ps", ".asia", ".cat", ".coop", ".jobs", ".mobi", ".tel", ".travel" # Domaines de premier niveau géographiques (ccTLD)
]
if any(word.endswith(ext) for ext in domain_extensions):
word = ""

# Remove special characters
word = word.encode("latin1", errors="ignore").decode("utf-8", errors="ignore")

# Remove version numbers in the format x.x.x
word = re.sub(r"\b\d+(?:\.\d+){2,}\b", "", word)
# Remove version numbers in the format vx.x.x
word = re.sub(r"v\d+(?:\.\d+){2,}", "", word)

if word:
posts_without_banned[word] = count

Expand Down
59 changes: 58 additions & 1 deletion Watcher/Watcher/threats_watcher/datas/banned_words.csv
Original file line number Diff line number Diff line change
Expand Up @@ -796,4 +796,61 @@ xvjornadasccncert
yourself
zammad
zataz
zero-day
zero-day
importance
contracts
general
penetration
helps
premier
defense
lance
trust
strategy
tactics
benefits
practices
essential
identity
participating
controls
enhancing
weekly
episode
launches
utils
health
safety
landscape
collaboration
trends
steps
leaders
awards
managed
principal
spotlight
explained
comprehensive
performance
complete
prevention
publie
consultation
culture
essentials
basics
alerte
secteur
introducing
coffee
vistainfosec
requirement
requirements
ciblant
taosecurity
akamai
upguard
kaspersky
veracode
invicti
Loading

0 comments on commit b340d98

Please sign in to comment.