Skip to content

Commit

Permalink
Add options for ambiguous integer dates
Browse files Browse the repository at this point in the history
  • Loading branch information
akoumjian committed Aug 2, 2020
1 parent 66d5fe7 commit 9ee6ef9
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 185 deletions.
58 changes: 42 additions & 16 deletions datefinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@ class DateFinder(object):
Locates dates in a text
"""

def __init__(self, base_date=None):
def __init__(self, base_date=None, first="month"):
self.base_date = base_date
self.dayfirst = False
self.yearfirst = False
if first == "day":
self.dayfirst = True
if first == "year":
self.yearfirst = True

def find_dates(self, text, source=False, index=False, strict=False):

Expand Down Expand Up @@ -99,7 +105,12 @@ def parse_date_string(self, date_string, captures):
# For well formatted string, we can already let dateutils parse them
# otherwise self._find_and_replace method might corrupt them
try:
as_dt = parser.parse(date_string, default=self.base_date)
as_dt = parser.parse(
date_string,
default=self.base_date,
dayfirst=self.dayfirst,
yearfirst=self.yearfirst,
)
except (ValueError, OverflowError):
# replace tokens that are problematic for dateutil
date_string, tz_string = self._find_and_replace(date_string, captures)
Expand All @@ -113,7 +124,12 @@ def parse_date_string(self, date_string, captures):

try:
logger.debug("Parsing {0} with dateutil".format(date_string))
as_dt = parser.parse(date_string, default=self.base_date)
as_dt = parser.parse(
date_string,
default=self.base_date,
dayfirst=self.dayfirst,
yearfirst=self.yearfirst,
)
except Exception as e:
logger.debug(e)
as_dt = None
Expand All @@ -139,9 +155,11 @@ def extract_date_strings_inner(self, text, text_start=0, strict=False):
if rng and len(rng) > 1:
range_strings = []
for range_str in rng:
range_strings.extend(self.extract_date_strings_inner(range_str[0],
text_start=range_str[1][0],
strict=strict))
range_strings.extend(
self.extract_date_strings_inner(
range_str[0], text_start=range_str[1][0], strict=strict
)
)
for range_string in range_strings:
yield range_string
return
Expand Down Expand Up @@ -169,7 +187,7 @@ def extract_date_strings_inner(self, text, text_start=0, strict=False):
if len(digits) == 3: # 12-05-2015
complete = True
elif (len(months) == 1) and (
len(digits) == 2
len(digits) == 2
): # 19 February 2013 year 09:10
complete = True

Expand All @@ -185,12 +203,12 @@ def extract_date_strings_inner(self, text, text_start=0, strict=False):
yield match_str, indices, captures

def tokenize_string(self, text):
'''
"""
Get matches from source text. Method merge_tokens will later compose
potential date strings out of these matches.
:param text: source text like 'the big fight at 2p.m. mountain standard time on ufc.com'
:return: [(match_text, match_group, {match.capturesdict()}), ...]
'''
"""
items = []

last_index = 0
Expand All @@ -202,19 +220,19 @@ def tokenize_string(self, text):
group = self.get_token_group(captures)

if indices[0] > last_index:
items.append((text[last_index:indices[0]], '', {}))
items.append((text[last_index : indices[0]], "", {}))
items.append((match_str, group, captures))
last_index = indices[1]
if last_index < len(text):
items.append((text[last_index:len(text)], '', {}))
items.append((text[last_index : len(text)], "", {}))
return items

def merge_tokens(self, tokens):
'''
"""
Makes potential date strings out of matches, got from tokenize_string method.
:param tokens: [(match_text, match_group, {match.capturesdict()}), ...]
:return: potential date strings
'''
"""
MIN_MATCHES = 3
fragments = []
frag = DateFragment()
Expand Down Expand Up @@ -264,7 +282,7 @@ def get_token_group(captures):
lst = captures.get(gr)
if lst and len(lst) > 0:
return gr
return ''
return ""

@staticmethod
def split_date_range(text):
Expand All @@ -284,7 +302,9 @@ def split_date_range(text):
return parts


def find_dates(text, source=False, index=False, strict=False, base_date=None):
def find_dates(
text, source=False, index=False, strict=False, base_date=None, first="month"
):
"""
Extract datetime strings from text
Expand All @@ -306,9 +326,15 @@ def find_dates(text, source=False, index=False, strict=False, base_date=None):
:param base_date:
Set a default base datetime when parsing incomplete dates
:type base_date: datetime
:param first:
Whether to interpret the first value in an ambiguous 3-integer date
(01/02/03) as the month, day, or year. Values can be `month`, `day`, `year`.
Default is `month`.
:type first: str|unicode
:return: Returns a generator that produces :mod:`datetime.datetime` objects,
or a tuple with the source text and index, if requested
"""
date_finder = DateFinder(base_date=base_date)
date_finder = DateFinder(base_date=base_date, first=first)
return date_finder.find_dates(text, source=source, index=index, strict=strict)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version="0.7.1",
version="0.7.2.dev0",
description="Extract datetime objects from strings",
long_description=long_description,
# The project's main homepage.
Expand Down
166 changes: 103 additions & 63 deletions tests/test_find_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,76 +4,116 @@
import pytz
import sys
import logging

logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logger = logging.getLogger(__name__)

today = datetime.today()


@pytest.mark.parametrize('input_text, expected_date', [
## English Dates
#('[Sept] 04, 2014.', datetime(2014, 9, 4)),
('Tuesday Jul 22, 2014', datetime(2014, 7, 22)),
#('10:04am EDT', datetime(2012, 11, 13, 14, 4)),
#('Friday', datetime(2012, 11, 9)),
#('November 19, 2014 at noon', datetime(2014, 11, 19, 12, 0)),
('December 13, 2014 at midnight', datetime(2014, 12, 13, 0, 0)),
#('Nov 25 2014 10:17 pm EST', datetime(2014, 11, 26, 3, 17)),
#('Wed Aug 05 12:00:00 EDT 2015', datetime(2015, 8, 5, 16, 0)),
('April 9, 2013 at 6:11 a.m.', datetime(2013, 4, 9, 6, 11)),
('Aug. 9, 2012 at 2:57 p.m.', datetime(2012, 8, 9, 14, 57)),
('December 10, 2014, 11:02:21 pm', datetime(2014, 12, 10, 23, 2, 21)),
('8:25 a.m. Dec. 12, 2014', datetime(2014, 12, 12, 8, 25)),
('2:21 p.m., December 11, 2014', datetime(2014, 12, 11, 14, 21)),
('Fri, 12 Dec 2014 10:55:50', datetime(2014, 12, 12, 10, 55, 50)),
#('20 Mar 2013 10h11', datetime(2013, 3, 20, 10, 11)),
('10:06am Dec 11, 2014', datetime(2014, 12, 11, 10, 6)),
('September 2nd, 1998', datetime(1998, 9, 2)),
('May 5, 2010 to July 10, 2011', [datetime(2010, 5, 5), datetime(2011, 7, 10)]),
#('19 February 2013 year 09:10', datetime(2013, 2, 19, 9, 10)),
# Numeric dates
('06-17-2014', datetime(2014, 6, 17)),
('13/03/2014', datetime(2014, 3, 13)),
('2016-02-04T20:16:26+00:00', datetime(2016, 2, 4, 20, 16, 26, tzinfo=pytz.utc)),
#('11. 12. 2014, 08:45:39', datetime(2014, 11, 12, 8, 45, 39)),
("2017-02-03T09:04:08Z to 2017-02-03T09:04:09Z", [
datetime(2017, 2, 3, 9, 4, 8, tzinfo=pytz.utc),
datetime(2017, 2, 3, 9, 4, 9, tzinfo=pytz.utc)
]),
# dates from issue https://github.com/akoumjian/datefinder/issues/14
("i am looking for a date june 4th 1996 to july 3rd 2013",[
datetime(1996, 6, 4),
datetime(2013, 7, 3)
]),
("october 27 1994 to be put into effect on june 1 1995",[
datetime(1994, 10, 27),
datetime(1995, 6, 1)
]),
# Simple date range
("31/08/2012 to 30/08/2013",[
datetime(2012, 8, 31),
datetime(2013, 8, 30)
]),
# Z dates with and without millis, from https://github.com/akoumjian/datefinder/issues/37
("2017-02-03T09:04:08.001Z", datetime(2017, 2, 3, 9, 4, 8, 1000, tzinfo=pytz.utc)),
("2017-02-03T09:04:08,00123Z", datetime(2017, 2, 3, 9, 4, 8, 1230, tzinfo=pytz.utc)),
("2017-02-03T09:04:08Z", datetime(2017, 2, 3, 9, 4, 8, tzinfo=pytz.utc)),
# Year only strings, from https://github.com/akoumjian/datefinder/issues/96
("Dutta is the recipient of Femina Miss India Universe title in 2004.", datetime(2004, today.month, today.day)),
("she said that she hit depression after being traumatized on the sets of \"Horn OK\" in 2008.", datetime(2008, today.month, today.day)),
# https://github.com/akoumjian/datefinder/issues/63
("12th day of December, 2001", datetime(2001, 12, 12)),
])
def test_find_date_strings(input_text, expected_date):
if isinstance(expected_date,list):
matches = list(datefinder.find_dates(input_text))
@pytest.mark.parametrize(
("input_text", "expected_date", "first"),
[
## English Dates
# ('[Sept] 04, 2014.', datetime(2014, 9, 4), "month"),
("Tuesday Jul 22, 2014", datetime(2014, 7, 22), "month"),
# ('10:04am EDT', datetime(2012, 11, 13, 14, 4), "month"),
# ('Friday', datetime(2012, 11, 9), "month"),
# ('November 19, 2014 at noon', datetime(2014, 11, 19, 12, 0), "month"),
("December 13, 2014 at midnight", datetime(2014, 12, 13, 0, 0), "month"),
# ('Nov 25 2014 10:17 pm EST', datetime(2014, 11, 26, 3, 17), "month"),
# ('Wed Aug 05 12:00:00 EDT 2015', datetime(2015, 8, 5, 16, 0), "month"),
("April 9, 2013 at 6:11 a.m.", datetime(2013, 4, 9, 6, 11), "month"),
("Aug. 9, 2012 at 2:57 p.m.", datetime(2012, 8, 9, 14, 57), "month"),
("December 10, 2014, 11:02:21 pm", datetime(2014, 12, 10, 23, 2, 21), "month"),
("8:25 a.m. Dec. 12, 2014", datetime(2014, 12, 12, 8, 25), "month"),
("2:21 p.m., December 11, 2014", datetime(2014, 12, 11, 14, 21), "month"),
("Fri, 12 Dec 2014 10:55:50", datetime(2014, 12, 12, 10, 55, 50), "month"),
# ('20 Mar 2013 10h11', datetime(2013, 3, 20, 10, 11), "month"),
("10:06am Dec 11, 2014", datetime(2014, 12, 11, 10, 6), "month"),
("September 2nd, 1998", datetime(1998, 9, 2), "month"),
(
"May 5, 2010 to July 10, 2011",
[datetime(2010, 5, 5), datetime(2011, 7, 10)],
"month",
),
# ('19 February 2013 year 09:10', datetime(2013, 2, 19, 9, 10), "month"),
# Numeric dates
("06-17-2014", datetime(2014, 6, 17), "month"),
("13/03/2014", datetime(2014, 3, 13), "month"),
(
"2016-02-04T20:16:26+00:00",
datetime(2016, 2, 4, 20, 16, 26, tzinfo=pytz.utc),
"month",
),
# ('11. 12. 2014, 08:45:39', datetime(2014, 11, 12, 8, 45, 39), "month"),
(
"2017-02-03T09:04:08Z to 2017-02-03T09:04:09Z",
[
datetime(2017, 2, 3, 9, 4, 8, tzinfo=pytz.utc),
datetime(2017, 2, 3, 9, 4, 9, tzinfo=pytz.utc),
],
"month",
),
# dates from issue https://github.com/akoumjian/datefinder/issues/14
(
"i am looking for a date june 4th 1996 to july 3rd 2013",
[datetime(1996, 6, 4), datetime(2013, 7, 3)],
"month",
),
(
"october 27 1994 to be put into effect on june 1 1995",
[datetime(1994, 10, 27), datetime(1995, 6, 1)],
"month",
),
# Simple date range
(
"31/08/2012 to 30/08/2013",
[datetime(2012, 8, 31), datetime(2013, 8, 30)],
"month",
),
# Z dates with and without millis, from https://github.com/akoumjian/datefinder/issues/37
(
"2017-02-03T09:04:08.001Z",
datetime(2017, 2, 3, 9, 4, 8, 1000, tzinfo=pytz.utc),
"month",
),
(
"2017-02-03T09:04:08,00123Z",
datetime(2017, 2, 3, 9, 4, 8, 1230, tzinfo=pytz.utc),
"month",
),
(
"2017-02-03T09:04:08Z",
datetime(2017, 2, 3, 9, 4, 8, tzinfo=pytz.utc),
"month",
),
# Year only strings, from https://github.com/akoumjian/datefinder/issues/96
(
"Dutta is the recipient of Femina Miss India Universe title in 2004.",
datetime(2004, today.month, today.day),
"month",
),
(
'she said that she hit depression after being traumatized on the sets of "Horn OK" in 2008.',
datetime(2008, today.month, today.day),
"month",
),
# https://github.com/akoumjian/datefinder/issues/63
("12th day of December, 2001", datetime(2001, 12, 12), "month"),
("01/02/03", datetime(2003, 1, 2, 0, 0, 0, 0), "month"),
("01/02/03", datetime(2003, 2, 1, 0, 0, 0, 0), "day"),
("01/02/03", datetime(2001, 2, 3, 0, 0, 0, 0), "year"),
],
)
def test_find_date_strings(input_text, expected_date, first):
if isinstance(expected_date, list):
matches = list(datefinder.find_dates(input_text, first=first))
assert matches == expected_date
else:
return_date = None
for return_date in datefinder.find_dates(input_text):
for return_date in datefinder.find_dates(input_text, first=first):
assert return_date == expected_date
assert return_date is not None, 'Did not find date for test line: "{}"'.format(input_text) # handles dates that were never matched
assert return_date is not None, 'Did not find date for test line: "{}"'.format(
input_text
) # handles dates that were never matched
Loading

0 comments on commit 9ee6ef9

Please sign in to comment.