This repository has been archived by the owner on Feb 3, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #258 from maarten-boot/development
convert file with supported tld's to Dict
Showing 15 changed files with 3,793 additions and 918 deletions.
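For reference, the conversion this commit performs turns the per-tld module-level variables of whois/tld_regexpr.py into entries of a single ZZ dict. A minimal sketch of the resulting shape, with a made-up tld and server; the field names shown are taken from the test scripts further down in this diff, and a leading underscore marks meta keys rather than field regexes:

# hypothetical entry, for illustration only
ZZ["example"] = {
    "extend": "com",  # inherit the regex set of another entry
    "_server": "whois.example-registry.example",  # "_" prefix: meta key, not a field regex
    "domain_name": r"domain:\s+(.+)",
    "name_servers": r"nserver:\s*(.+)",
}
# second-level domains get a dot in the key, e.g. ZZ["com.au"]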
@@ -72,3 +72,4 @@ typescript
test.out
diff.out
tmp/
@@ -1,5 +1,5 @@
TODO

-# pt is difficult it often gives no data, it works in aws frankfurt through
+# pt is difficult it often gives no data, it works in aws frankfurt though
ERROR: output; missing nameserver 'ns1.dnscpanel.com.' for tld: pt
ERROR: output; missing nameserver 'ns2.dnscpanel.com.' for tld: pt
@@ -0,0 +1,58 @@
#! /usr/bin/env python3

import sys
import re
from typing import (
    # Optional,
    # List,
    Dict,
)

# most likely we can now introduce a trailing whitespace trim on all lines from whois,
# and simplify the trailing whitespace rules,
# as \r is already gone now and that was the most distinct line ending;
# occasionally we need to detect \n\s+ for groups that belong together,
# mostly with indented blocks of nameservers

# import whois
from whois.tld_regexpr import ZZ


def buildRegCollection(zz: Dict):
    regCollection = {}
    # collect all regexes, grouped by field name
    for name in zz:
        # print(name)
        z = zz[name]
        for key in z:
            if key is None:
                continue

            # keys starting with "_" are meta keys, not field regexes
            if key.startswith("_"):
                continue

            if key in ["extend"]:
                continue

            if key not in regCollection:
                regCollection[key] = {}

            reg = z[key]
            if reg is None:
                continue

            # non-string values are stored uncompiled; plain strings are compiled case-insensitively
            regCollection[key][reg] = None
            if isinstance(reg, str):
                regCollection[key][reg] = re.compile(reg, flags=re.IGNORECASE)

    return regCollection


if __name__ == "__main__":
    regCollection = buildRegCollection(ZZ)

    for name in sorted(regCollection.keys()):
        print(f"## {name}", file=sys.stderr)
        for key in sorted(regCollection[name].keys()):
            if key:
                print(f"{name}: {key}")
@@ -0,0 +1,101 @@
#! /usr/bin/env python3

# clone https://github.com/jophy/iana_tld_list in ./tmp

import urllib.request

from tmp.iana_tld_list.iana import IANA

import whois
from whois._1_query import _do_whois_query

# allow verbose messages during testing (all on stderr)
verbose = False

# by default the all-tld file will be refreshed every 24 hours,
# but you can also force a new download at any time
forceDownloadTld = False

# do you want to overwrite the results file?
overwrite = True

# do you want interactive questions if files will be re-written?
interactive = False

# if autoProcessAll is True: all tld's will be processed (initial run > 20 minutes)
autoProcessAll = False

with_test_original = True

dirName = "/tmp/iana_data"

i = IANA(
    dirName=dirName,
    verbose=verbose,
    overwrite=overwrite,
    interactive=interactive,
    autoProcessAll=autoProcessAll,
    forceDownloadTld=forceDownloadTld,
)

# get the tld's and second-level domains known to python whois
known = sorted(whois.validTlds())

# get iana data
URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
response = urllib.request.urlopen(URL)
data = response.read().decode("utf-8").lower()
dataList = sorted(data.splitlines())

# filter out known names and try to detect names not known by iana
for name in known:
    if name in dataList:
        continue
    if "." in name:
        continue
    if name not in dataList:
        print(f"{name} tld name from python_whois is not known in IANA list")
        continue

dataList2 = []
for name in dataList:
    if name in known:
        continue
    dataList2.append(name)

# try to auto-detect new domains via IANA and some known common regex lists like .com
found = {}
for tld in dataList2:
    data, status = i.getInfoOnOneTld(tld)

    xtest = data and ("whois" in data) and (data["whois"]) and (data["whois"] != "NULL")
    if not xtest:
        print(f"no whois info for tld: {tld} {data}")
        continue

    wh = data["whois"]
    if wh.endswith(f".{tld}"):
        dd = wh.split(".")[-2:]
    else:
        dd = ["meta", tld]

    print(f"try: {tld}")
    zz = _do_whois_query(
        dd,
        ignore_returncode=False,
        server=wh,
    )

    pp = {"_server": wh, "extend": "com"}
    aDictToTestOverride = {tld: pp}

    whois.mergeExternalDictWithRegex(aDictToTestOverride)
    try:
        d = whois.query(".".join(dd))
        if d:
            print(d.__dict__)
            if len(d.name_servers) > 0:
                found[tld] = pp
                print(f"## ZZ['{tld}'] = {found[tld]} # auto-detected via IANA tld")
    except Exception as e:
        print(e)
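The "## ZZ[...]" lines this script prints are meant as candidates to paste into whois/tld_regexpr.py. A sketch of the output format, with a hypothetical tld and whois server:

## ZZ['example'] = {'_server': 'whois.nic.example', 'extend': 'com'} # auto-detected via IANA tld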
@@ -0,0 +1,14 @@
#! /usr/bin/env bash

FILE="whois/tld_regexpr.py"
FILE2="whois/tld_regexpr2.py"

cat "$FILE" |
perl -np -e '
    # translate each tld variable into a ZZ dict entry; a "_" in the name becomes a "." in the key
    s/^([a-z]+)_([a-z]+)\s+=/ZZ["$1.$2"] =/;
    s/^([a-z]+)\s+=/ZZ["$1"] =/;
    # if we refer to another tld in "extend", also change "_" to "."
    s/"extend":\s+"(\w+)_(\w+)"/"extend": "$1.$2"/;
' |
tee "$FILE2"
@@ -0,0 +1,120 @@
#!/usr/bin/python3
import whois

Verbose = True

"""
initial testing had errors for these
we DONT have xn--3ds443g 在线 (online)
we DONT have xn--45q11c 八卦 (gossip)
we DONT have xn--czru2d 商城 (mall)
we DONT have xn--fiq228c5hs 中文网 (website)
we DONT have xn--hxt814e 网店 (webshop)
"""


def t1(domain: str, text: str):
    print(f"{text}: {domain}")
    try:
        d = whois.query(domain)
        if d:
            print(d.__dict__)
        else:
            print(d)
    except Exception as e:
        print(domain, e)


def xMain():
    aDictToTestOverride = {
        "si": {  # changing an existing one
            "domain_name": r"domain:\s+(.+)",
            "status": r"status:\s+(.+)",
            "registrar": r"registrar:\s+(.+)",
            "name_servers": r"nameserver:\s*(.+)",
            "creation_date": r"created:\s+(.+)",
            "expiration_date": None,
            "updated_date": None,
            "registrant_country": None,
        },
        "mk": {  # defining a non-existent one; meanwhile .mk is supported, so this test is now moot
            "extend": "com",
            "domain_name": r"domain:\s+(.+)",
            "status": r"status:\s+(.+)",
            "registrar": r"registrar:\s+(.+)",
            "name_servers": r"nserver:\s*(.+)",
            "creation_date": r"registered:\s+(.+)",
            "expiration_date": r"expire:\s+(.+)",
            "updated_date": r"changed:\s+(.+)",
            "registrant_country": None,
            "registrant": r"registrant:\s+(.+)",
        },
    }

    domains = [
        "google.si",
        "google.mk",
    ]
    for domain in domains:
        t1(domain, "BEFORE")

    whois.mergeExternalDictWithRegex(aDictToTestOverride)

    for domain in domains:
        t1(domain, "AFTER")


xMain()

"""
% Domain Information over Whois protocol
%
% Whoisd Server Version: 3.9.0
% Timestamp: Fri Nov 25 16:49:33 2022
domain: google.mk
registrant: UNET-R11
admin-c: UNET-C12
nsset: UNET-NS191
registrar: UNET-REG
registered: 13.05.2008 14:00:00
changed: 17.04.2014 12:50:32
expire: 13.05.2023
contact: UNET-R11
org: Google LLC
name: Google LLC
address: Amphiteatre Parkway 1600
address: Mountain View
address: 94043
address: US
phone: +1.6502530000
fax-no: +1.6502530000
e-mail: ccops@markmonitor.com
registrar: UNET-REG
created: 25.03.2014 11:48:02
changed: 29.09.2021 16:26:23
contact: UNET-C12
name: Mark Monitor Inc.
address: 3540 East Longwing Lane Suite 300
address: Meridian
address: 83646
address: US
phone: +1.2083895740
e-mail: ccops@markmonitor.com
registrar: UNET-REG
created: 25.03.2014 11:48:00
changed: 19.11.2019 16:47:01
nsset: UNET-NS191
nserver: ns2.google.com
nserver: ns1.google.com
tech-c: UNET-C12
registrar: UNET-REG
created: 17.04.2014 12:50:22
changed: 17.04.2014 21:02:14
"""
This file was deleted.
This file was deleted.