From 93fd71c449a7bf398268470259bd657a11626bfa Mon Sep 17 00:00:00 2001 From: Janto Dreijer Date: Sun, 15 Nov 2020 18:11:39 +0200 Subject: [PATCH] Fix issue #138 : Thu not recognised by regex --- datefinder/constants.py | 2 +- tests/test_extract_date_strings.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/datefinder/constants.py b/datefinder/constants.py index 19ad5dc..814c2dc 100644 --- a/datefinder/constants.py +++ b/datefinder/constants.py @@ -4,7 +4,7 @@ POSITIONNAL_TOKENS = r"next|last" DIGITS_PATTERN = r"\d+" DIGITS_SUFFIXES = r"st|th|rd|nd" -DAYS_PATTERN = "monday|tuesday|wednesday|thursday|friday|saturday|sunday|mandag|tirsdag|onsdag|torsdag|fredag|lørdag|søndag|mon|tue|tues|wed|thur|thurs|fri|sat|sun|man|tir|tirs|ons|tor|tors|fre|lør|søn" +DAYS_PATTERN = "monday|tuesday|wednesday|thursday|friday|saturday|sunday|mandag|tirsdag|onsdag|torsdag|fredag|lørdag|søndag|mon|tue|tues|wed|thu|thur|thurs|fri|sat|sun|man|tir|tirs|ons|tor|tors|fre|lør|søn" MONTHS_PATTERN = r"january|february|march|april|may|june|july|august|september|october|november|december|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januar|februar|marts|april|maj|juni|juli|august|september|oktober|november|december|jan\.?|ene\.?|feb\.?|mar\.?|apr\.?|abr\.?|may\.?|maj\.?|jun\.?|jul\.?|aug\.?|ago\.?|sep\.?|sept\.?|oct\.?|okt\.?|nov\.?|dec\.?|dic\.?" TIMEZONES_PATTERN = "ACDT|ACST|ACT|ACWDT|ACWST|ADDT|ADMT|ADT|AEDT|AEST|AFT|AHDT|AHST|AKDT|AKST|AKTST|AKTT|ALMST|ALMT|AMST|AMT|ANAST|ANAT|ANT|APT|AQTST|AQTT|ARST|ART|ASHST|ASHT|AST|AWDT|AWST|AWT|AZOMT|AZOST|AZOT|AZST|AZT|BAKST|BAKT|BDST|BDT|BEAT|BEAUT|BIOT|BMT|BNT|BORT|BOST|BOT|BRST|BRT|BST|BTT|BURT|CANT|CAPT|CAST|CAT|CAWT|CCT|CDDT|CDT|CEDT|CEMT|CEST|CET|CGST|CGT|CHADT|CHAST|CHDT|CHOST|CHOT|CIST|CKHST|CKT|CLST|CLT|CMT|COST|COT|CPT|CST|CUT|CVST|CVT|CWT|CXT|ChST|DACT|DAVT|DDUT|DFT|DMT|DUSST|DUST|EASST|EAST|EAT|ECT|EDDT|EDT|EEDT|EEST|EET|EGST|EGT|EHDT|EMT|EPT|EST|ET|EWT|FET|FFMT|FJST|FJT|FKST|FKT|FMT|FNST|FNT|FORT|FRUST|FRUT|GALT|GAMT|GBGT|GEST|GET|GFT|GHST|GILT|GIT|GMT|GST|GYT|HAA|HAC|HADT|HAE|HAP|HAR|HAST|HAT|HAY|HDT|HKST|HKT|HLV|HMT|HNA|HNC|HNE|HNP|HNR|HNT|HNY|HOVST|HOVT|HST|ICT|IDDT|IDT|IHST|IMT|IOT|IRDT|IRKST|IRKT|IRST|ISST|IST|JAVT|JCST|JDT|JMT|JST|JWST|KART|KDT|KGST|KGT|KIZST|KIZT|KMT|KOST|KRAST|KRAT|KST|KUYST|KUYT|KWAT|LHDT|LHST|LINT|LKT|LMT|LMT|LMT|LMT|LRT|LST|MADMT|MADST|MADT|MAGST|MAGT|MALST|MALT|MART|MAWT|MDDT|MDST|MDT|MEST|MET|MHT|MIST|MIT|MMT|MOST|MOT|MPT|MSD|MSK|MSM|MST|MUST|MUT|MVT|MWT|MYT|NCST|NCT|NDDT|NDT|NEGT|NEST|NET|NFT|NMT|NOVST|NOVT|NPT|NRT|NST|NT|NUT|NWT|NZDT|NZMT|NZST|OMSST|OMST|ORAST|ORAT|PDDT|PDT|PEST|PET|PETST|PETT|PGT|PHOT|PHST|PHT|PKST|PKT|PLMT|PMDT|PMMT|PMST|PMT|PNT|PONT|PPMT|PPT|PST|PT|PWT|PYST|PYT|QMT|QYZST|QYZT|RET|RMT|ROTT|SAKST|SAKT|SAMT|SAST|SBT|SCT|SDMT|SDT|SET|SGT|SHEST|SHET|SJMT|SLT|SMT|SRET|SRT|SST|STAT|SVEST|SVET|SWAT|SYOT|TAHT|TASST|TAST|TBIST|TBIT|TBMT|TFT|THA|TJT|TKT|TLT|TMT|TOST|TOT|TRST|TRT|TSAT|TVT|ULAST|ULAT|URAST|URAT|UTC|UYHST|UYST|UYT|UZST|UZT|VET|VLAST|VLAT|VOLST|VOLT|VOST|VUST|VUT|WARST|WART|WAST|WAT|WDT|WEDT|WEMT|WEST|WET|WFT|WGST|WGT|WIB|WIT|WITA|WMT|WSDT|WSST|WST|WT|XJT|YAKST|YAKT|YAPT|YDDT|YDT|YEKST|YEKST|YEKT|YEKT|YERST|YERT|YPT|YST|YWT|zzz" ## explicit north american timezones that get replaced diff --git a/tests/test_extract_date_strings.py b/tests/test_extract_date_strings.py index b3e9e4b..70a70ec 100644 --- a/tests/test_extract_date_strings.py +++ b/tests/test_extract_date_strings.py @@ -11,14 +11,18 @@ ['March 20, 2015 3:30 pm ACWDT in the parking lot', 'March 20, 2015 3:30 pm ACWDT'], ['blah blah March 20, 2015 3pm MADMT for some thing', 'March 20, 2015 3pm MADMT'], ['we need it back on Friday 2p.m. central standard time', 'on Friday 2p.m. central standard time'], - ['the big fight at 2p.m. mountain standard time on ufc.com', 'at 2p.m. mountain standard time on'] + ['the big fight at 2p.m. mountain standard time on ufc.com', 'at 2p.m. mountain standard time on'], + + # issue: Thu not recognised by regex #138 + ['starting Thursday 2020-11-05 13:50 GMT', 'Thursday 2020-11-05 13:50 GMT'], + ['starting Thu 2020-11-05 13:50 GMT', 'Thu 2020-11-05 13:50 GMT'], ]) def test_extract_date_strings(date_string, expected_match_date_string): dt = datefinder.DateFinder() for actual_date_string, indexes, captures in dt.extract_date_strings(date_string): logger.debug("actual={} expected={}".format(actual_date_string, expected_match_date_string)) assert actual_date_string == expected_match_date_string - assert len(captures.get('timezones',[])) > 0 + assert len(captures.get('timezones',[])) > 0, "timezone expected in result" # TODO: 'May 20th 2015 is nowhere near the other date' was not recognized as # a date string: this string produced no result, but there was no error