# crawler.yaml
# Case-insensitive ((?i)) pattern; judging by the key name it selects the
# domains the crawler refuses to visit.
# Written single-quoted so every backslash is literal: the parsed value is
# exactly the same as the double-quoted form that spelled each "\." as "\\.".
# NOTE(review): `(^en)` is a group holding a start-of-input anchor followed by
# the literal "en" - it is not a negation. Confirm which Wikipedia hosts are
# meant to match.
# NOTE(review): "etherscan" appears twice (inside the .com/.org group and as a
# bare token); the bare token already covers the grouped one.
regexBannedDomains: '(?i)((facebook|twitter|reddit|instagram|google|youtube|etherscan|urldefense|thesexyouwant)\.(com|org)|archive\.org|repubblica\.it|(^en)\.wikipedia\.org|anchor\.fm|doi\.org|github\.com|blockscout|etherscan|polygonscan|qoinfaucet|pinterest)'
# Case-insensitive ((?i)) pattern; judging by the key name, a link containing
# any of these tokens is rejected: a few words (login, signup, search, pdf)
# or a dot followed by one of the listed file extensions.
# Written single-quoted so the backslash is literal: the parsed value is
# exactly the same as the double-quoted form that spelled "\." as "\\.".
# NOTE(review): the pattern has no anchors, so if the consumer does a
# substring search, `\.ps`, `\.rs` or `\.doc` also hit longer strings that
# merely start with those characters - confirm this is acceptable.
# NOTE(review): the bare `pdf` token already covers `\.pdf`.
regexLinkBannedTokens: '(?i)(login|signup|search|pdf|\.(pdf|ps|xls|ods|csv|json|png|jpg|gif|zip|tar|gz|iso|rar|mp3|wav|avi|mpeg|mpg|mp4|mov|docx|exe|7z|ppt|doc|rs|tgz))'
# Pipe-separated alternation of common English words; the list does not
# contain "the" (presumably what "WOthe" in the key name stands for).
# The body is indented under the key because a literal block scalar's content
# must sit deeper than its parent key; the common indent is stripped on load,
# so the value itself is unchanged.
# NOTE(review): with the `|` header the line breaks between the rows and the
# final newline are part of the value - presumably the consumer strips
# whitespace before using it; verify.
engStopWordsWOthe: |
  a|and|be|have|i|in|of|that|to|with|from|is|on|up|for|should|even|why|by|during|we|could|but|about|as|or|this|at|not|all|other
  |if|can|how|may|who|an|no|our|what|use|get|will|has|their|was|than|which|these|also|been|when|through|were|under|there|those|out|after|such|any|before
  |here|only|some|its|where|into|like|would|against|between|most|so|over|because|now|while|since|however|non|without|among|both|another|still|just|way|very
  |good|around|every|each|his|her|then|much|less|few|same|within|per|whether|cannot|doesn|isn|aren|across|ongoing|pre|anti|onset|don|it|you|are
# Pipe-separated alternation of words that, per the key name, carry little
# relevancy. Assuming the value is used as a regular expression, a trailing
# `s*` makes the plural "s" optional (e.g. `pages*` covers "page" and "pages").
# The body is indented under the key because a literal block scalar's content
# must sit deeper than its parent key; the common indent is stripped on load.
# Fixed the misspelt entry `requiremets*` to `requirements*`; the old spelling
# could never match "requirement" or "requirements".
# NOTE(review): `files` is listed in addition to `files*`, and `it` / `don`
# also appear in engStopWordsWOthe - harmless for an alternation, but confirm
# the duplicates are not hiding a different intended word.
# NOTE(review): with the `|` header the line breaks between the rows and the
# final newline are part of the value - presumably the consumer strips
# whitespace before using it; verify.
engLowRelevancyWords: |
  articles*|publications*|questions*|times|data|source|people|information|news*|search|content|home|sites*|best|well|pdf|files
  |uploads|programs*|support|help|default|files*|available|please|including|websites*|related|work|number|days*|using|two|ref|first|daily|public|cases*|high|possible
  |system|review|based|provide|results|additional|include|current|important|week|group|full|different|person|take|continue|national|needs*|millions*|requirements*|working
  |your|more|says|read|make|made|see|does|due|she|one|said|being|had|need|them|many|used|must|do|they|it|he|twitter|facebook|date|time|pages*|topics*|example
  |things|real|wiki|early|year|currently|higher|specific|state|resources*|social|study|guidance|local|leave|online|centers*|email|blog|don|according|updates*d*|world
  |cookies|javascript|google|internet|webinar|color|challenges*|click|faqs*|fig|cite|preprints*|note|isbn
# Pipe-separated alternation of non-linguistic tokens, one row per group:
# URL/file fragments (`https*` covers "http" and "https" if used as a regex),
# English month names, and domain suffixes.
# The body is indented under the key because a literal block scalar's content
# must sit deeper than its parent key; the common indent is stripped on load,
# so the value itself is unchanged.
# NOTE(review): "may" is also present in engStopWordsWOthe.
# NOTE(review): with the `|` header the line breaks between the rows and the
# final newline are part of the value - presumably the consumer strips
# whitespace before using it; verify.
specialStopwords: |
  https*|www|php|aspx|index|en|html
  |january|february|march|april|may|june|july|august|september|october|november|december
  |com|org|gov|uk|edu|net|us|co|gob|au|ca
# Timeout applied to downloads.
# NOTE(review): the unit is not stated anywhere in this file - presumably
# seconds; confirm against the code that reads this key.
downloadTimeout: 8
# Search engine to feed with new URLs (the addUrl key below is commented out).
# addUrl: https://search.aibull.io/addUrlPost.php?sender=goCrawler&url=