This repository has been archived by the owner on Feb 3, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #258 from maarten-boot/development
convert file with supported tld's to Dict
Showing 15 changed files with 3,793 additions and 918 deletions.
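For reference, the conversion this commit performs turns the per-tld module-level variables of whois/tld_regexpr.py into entries of a single ZZ dict. A minimal sketch of the resulting shape, with a made-up tld and server; the field names shown are taken from the test scripts further down in this diff, and a leading underscore marks meta keys rather than field regexes:

# hypothetical entry, for illustration only
ZZ["example"] = {
    "extend": "com",  # inherit the regex set of another entry
    "_server": "whois.example-registry.example",  # "_" prefix: meta key, not a field regex
    "domain_name": r"domain:\s+(.+)",
    "name_servers": r"nserver:\s*(.+)",
}
# second-level domains get a dot in the key, e.g. ZZ["com.au"]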
@@ -72,3 +72,4 @@ typescript
test.out
diff.out
tmp/
@@ -1,5 +1,5 @@
TODO

-# pt is difficult it often gives no data, it works in aws frankfurt through
+# pt is difficult it often gives no data, it works in aws frankfurt though
ERROR: output; missing nameserver 'ns1.dnscpanel.com.' for tld: pt
ERROR: output; missing nameserver 'ns2.dnscpanel.com.' for tld: pt
@@ -0,0 +1,58 @@
#! /usr/bin/env python3

import sys
import re
from typing import (
    # Optional,
    # List,
    Dict,
)

# most likely we can now introduce a trailing whitespace trim on all lines from whois,
# and simplify the trailing whitespace rules,
# as \r is already gone now and that was the most distinct line ending;
# occasionally we need to detect \n\s+ for groups that belong together,
# mostly with indented blocks of nameservers

# import whois
from whois.tld_regexpr import ZZ


def buildRegCollection(zz: Dict):
    regCollection = {}
    # collect all regexes, grouped by field name
    for name in zz:
        # print(name)
        z = zz[name]
        for key in z:
            if key is None:
                continue

            # keys starting with "_" are meta keys, not field regexes
            if key.startswith("_"):
                continue

            if key in ["extend"]:
                continue

            if key not in regCollection:
                regCollection[key] = {}

            reg = z[key]
            if reg is None:
                continue

            # non-string values are stored uncompiled; plain strings are compiled case-insensitively
            regCollection[key][reg] = None
            if isinstance(reg, str):
                regCollection[key][reg] = re.compile(reg, flags=re.IGNORECASE)

    return regCollection


if __name__ == "__main__":
    regCollection = buildRegCollection(ZZ)

    for name in sorted(regCollection.keys()):
        print(f"## {name}", file=sys.stderr)
        for key in sorted(regCollection[name].keys()):
            if key:
                print(f"{name}: {key}")
@@ -0,0 +1,101 @@
#! /usr/bin/env python3

# clone https://github.com/jophy/iana_tld_list in ./tmp

import urllib.request

from tmp.iana_tld_list.iana import IANA

import whois
from whois._1_query import _do_whois_query

# allow verbose messages during testing (all on stderr)
verbose = False

# by default the all-tld file will be refreshed every 24 hours,
# but you can also force a new download at any time
forceDownloadTld = False

# do you want to overwrite the results file?
overwrite = True

# do you want interactive questions if files will be re-written?
interactive = False

# if autoProcessAll is True: all tld's will be processed (initial run > 20 minutes)
autoProcessAll = False

with_test_original = True

dirName = "/tmp/iana_data"

i = IANA(
    dirName=dirName,
    verbose=verbose,
    overwrite=overwrite,
    interactive=interactive,
    autoProcessAll=autoProcessAll,
    forceDownloadTld=forceDownloadTld,
)

# get the tld's and second-level domains known to python whois
known = sorted(whois.validTlds())

# get iana data
URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
response = urllib.request.urlopen(URL)
data = response.read().decode("utf-8").lower()
dataList = sorted(data.splitlines())

# filter out known names and try to detect names not known by iana
for name in known:
    if name in dataList:
        continue
    if "." in name:
        continue
    if name not in dataList:
        print(f"{name} tld name from python_whois is not known in IANA list")
        continue

dataList2 = []
for name in dataList:
    if name in known:
        continue
    dataList2.append(name)

# try to auto-detect new domains via IANA and some known common regex lists like .com
found = {}
for tld in dataList2:
    data, status = i.getInfoOnOneTld(tld)

    xtest = data and ("whois" in data) and (data["whois"]) and (data["whois"] != "NULL")
    if not xtest:
        print(f"no whois info for tld: {tld} {data}")
        continue

    wh = data["whois"]
    if wh.endswith(f".{tld}"):
        dd = wh.split(".")[-2:]
    else:
        dd = ["meta", tld]

    print(f"try: {tld}")
    zz = _do_whois_query(
        dd,
        ignore_returncode=False,
        server=wh,
    )

    pp = {"_server": wh, "extend": "com"}
    aDictToTestOverride = {tld: pp}

    whois.mergeExternalDictWithRegex(aDictToTestOverride)
    try:
        d = whois.query(".".join(dd))
        if d:
            print(d.__dict__)
            if len(d.name_servers) > 0:
                found[tld] = pp
                print(f"## ZZ['{tld}'] = {found[tld]} # auto-detected via IANA tld")
    except Exception as e:
        print(e)
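The "## ZZ[...]" lines this script prints are meant as candidates to paste into whois/tld_regexpr.py. A sketch of the output format, with a hypothetical tld and whois server:

## ZZ['example'] = {'_server': 'whois.nic.example', 'extend': 'com'} # auto-detected via IANA tld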
@@ -0,0 +1,14 @@
#! /usr/bin/env bash

FILE="whois/tld_regexpr.py"
FILE2="whois/tld_regexpr2.py"

cat "$FILE" |
perl -np -e '
    # translate each tld variable into a ZZ dict entry; a "_" in the name becomes a "." in the key
    s/^([a-z]+)_([a-z]+)\s+=/ZZ["$1.$2"] =/;
    s/^([a-z]+)\s+=/ZZ["$1"] =/;
    # if we refer to another tld in "extend", also change "_" to "."
    s/"extend":\s+"(\w+)_(\w+)"/"extend": "$1.$2"/;
' |
tee "$FILE2"
@@ -0,0 +1,120 @@
#!/usr/bin/python3
import whois

Verbose = True

"""
initial testing had errors for these
we DONT have xn--3ds443g 在线 (online)
we DONT have xn--45q11c 八卦 (gossip)
we DONT have xn--czru2d 商城 (mall)
we DONT have xn--fiq228c5hs 中文网 (website)
we DONT have xn--hxt814e 网店 (webshop)
"""


def t1(domain: str, text: str):
    print(f"{text}: {domain}")
    try:
        d = whois.query(domain)
        if d:
            print(d.__dict__)
        else:
            print(d)
    except Exception as e:
        print(domain, e)


def xMain():
    aDictToTestOverride = {
        "si": {  # changing an existing one
            "domain_name": r"domain:\s+(.+)",
            "status": r"status:\s+(.+)",
            "registrar": r"registrar:\s+(.+)",
            "name_servers": r"nameserver:\s*(.+)",
            "creation_date": r"created:\s+(.+)",
            "expiration_date": None,
            "updated_date": None,
            "registrant_country": None,
        },
        "mk": {  # defining a non-existent one; meanwhile .mk is supported, so this test is now moot
            "extend": "com",
            "domain_name": r"domain:\s+(.+)",
            "status": r"status:\s+(.+)",
            "registrar": r"registrar:\s+(.+)",
            "name_servers": r"nserver:\s*(.+)",
            "creation_date": r"registered:\s+(.+)",
            "expiration_date": r"expire:\s+(.+)",
            "updated_date": r"changed:\s+(.+)",
            "registrant_country": None,
            "registrant": r"registrant:\s+(.+)",
        },
    }

    domains = [
        "google.si",
        "google.mk",
    ]
    for domain in domains:
        t1(domain, "BEFORE")

    whois.mergeExternalDictWithRegex(aDictToTestOverride)

    for domain in domains:
        t1(domain, "AFTER")


xMain()

"""
% Domain Information over Whois protocol
%
% Whoisd Server Version: 3.9.0
% Timestamp: Fri Nov 25 16:49:33 2022
domain: google.mk
registrant: UNET-R11
admin-c: UNET-C12
nsset: UNET-NS191
registrar: UNET-REG
registered: 13.05.2008 14:00:00
changed: 17.04.2014 12:50:32
expire: 13.05.2023
contact: UNET-R11
org: Google LLC
name: Google LLC
address: Amphiteatre Parkway 1600
address: Mountain View
address: 94043
address: US
phone: +1.6502530000
fax-no: +1.6502530000
e-mail: ccops@markmonitor.com
registrar: UNET-REG
created: 25.03.2014 11:48:02
changed: 29.09.2021 16:26:23
contact: UNET-C12
name: Mark Monitor Inc.
address: 3540 East Longwing Lane Suite 300
address: Meridian
address: 83646
address: US
phone: +1.2083895740
e-mail: ccops@markmonitor.com
registrar: UNET-REG
created: 25.03.2014 11:48:00
changed: 19.11.2019 16:47:01
nsset: UNET-NS191
nserver: ns2.google.com
nserver: ns1.google.com
tech-c: UNET-C12
registrar: UNET-REG
created: 17.04.2014 12:50:22
changed: 17.04.2014 21:02:14
"""
This file was deleted.
This file was deleted.