Skip to content

Commit

Permalink
Merge pull request #9 from INGEOTEC/develop
Browse files Browse the repository at this point in the history
DialectId
  • Loading branch information
mgraffg authored Jun 15, 2024
2 parents 5915cfe + dc56be5 commit 3153dda
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 4 deletions.
3 changes: 2 additions & 1 deletion dialectid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@

__version__ = '0.0.2'

from dialectid.text_repr import BoW
from dialectid.text_repr import BoW
from dialectid.model import DialectId
83 changes: 83 additions & 0 deletions dialectid/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# MIT License

# Copyright (c) 2024 Eric Sadit Tellez Avila, Daniela Alejandra Moctezuma Ochoa, Luis Guillermo Ruiz Velazquez, Mario Graff Guerrero

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/

from typing import Union, List
from dataclasses import dataclass
import importlib
import numpy as np
from dialectid.utils import BOW, load_dialectid

@dataclass
class DialectId:
    """Predict the country (dialect) of a text.

    The text representation and the per-country models are created lazily,
    the first time they are needed, and then cached on the instance in the
    private attributes ``_bow``, ``_weights`` and ``_countries``. These are
    plain attributes, not dataclass fields, so they take no part in the
    generated ``__init__``, ``__repr__`` or ``__eq__``.

    :param lang: Language key used to look up the text representation in
        ``BOW`` and to load the models.
    :param voc_size_exponent: Value forwarded both to the text
        representation and to ``load_dialectid``.
    """
    lang: str = 'es'
    voc_size_exponent: int = 15

    @property
    def bow(self):
        """Text representation for ``lang``, built on first access.

        ``BOW[lang]`` holds a dotted path ``package.module.ClassName``; the
        module is imported dynamically and the class is instantiated with
        ``lang`` and ``voc_size_exponent``.
        """
        try:
            return self._bow
        except AttributeError:
            # Split "pkg.module.Class" into the module path and class name.
            *module_parts, class_name = BOW[self.lang].split('.')
            text_repr = importlib.import_module('.'.join(module_parts))
            klass = getattr(text_repr, class_name)
            self._bow = klass(lang=self.lang,
                              voc_size_exponent=self.voc_size_exponent)
        return self._bow

    @property
    def weights(self):
        """Models returned by ``load_dialectid``, loaded on first access."""
        try:
            return self._weights
        except AttributeError:
            self._weights = load_dialectid(self.lang,
                                           self.voc_size_exponent)
        return self._weights

    @property
    def countries(self) -> np.ndarray:
        """Array with the last label of every model, in model order.

        Position ``i`` is the label that :py:meth:`predict` reports when
        column ``i`` of :py:meth:`decision_function` is the largest.
        """
        try:
            return self._countries
        except AttributeError:
            self._countries = np.array([x.labels[-1] for x in self.weights])
        return self._countries

    def decision_function(self,
                          D: Union[str, dict,
                                   List[Union[dict, list, str]]]
                          ) -> np.ndarray:
        """Score every document against every model.

        :param D: A list of documents, or a single document given as a
            ``str`` or a ``dict``; a single document is wrapped in a list.
        :return: Array with one row per document and one column per model.
        """
        # A bare str or dict is ONE document; without wrapping it would be
        # iterated (characters / keys) as if it were a collection.
        if isinstance(D, (str, dict)):
            D = [D]
        X = self.bow.transform(D)
        hy = [w.decision_function(X) for w in self.weights]
        # hy is (models, documents); transpose to (documents, models).
        return np.array(hy).T

    def predict(self,
                D: Union[str, dict, List[Union[dict, list, str]]]
                ) -> np.ndarray:
        """Return, for each document, the label of the best-scoring model.

        :param D: Same input accepted by :py:meth:`decision_function`.
        :return: Array with one entry of :py:attr:`countries` per document.
        """
        hy = self.decision_function(D)
        return self.countries[hy.argmax(axis=1)]
68 changes: 68 additions & 0 deletions dialectid/tests/test_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# MIT License

# Copyright (c) 2024 Eric Sadit Tellez Avila, Daniela Alejandra Moctezuma Ochoa, Luis Guillermo Ruiz Velazquez, Mario Graff Guerrero

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/


def test_DialectId():
    """Check the default fields of DialectId and the type of its bow."""

    from dialectid import BoW
    from dialectid.model import DialectId

    model = DialectId(voc_size_exponent=15)
    # Both default/explicit fields must hold the expected values.
    assert model.lang == 'es'
    assert model.voc_size_exponent == 15
    text_repr = model.bow
    assert isinstance(text_repr, BoW)


def test_DialectId_df():
    """Check the shape and the top-scoring column of decision_function."""

    from dialectid.model import DialectId

    model = DialectId(voc_size_exponent=15)
    scores = model.decision_function('comiendo tacos')
    # One input text scored against 20 models.
    assert scores.shape == (1, 20)
    best = scores.argmax(axis=1)
    assert best[0] == 0


def test_countries():
    """Check the number of countries and the first country label."""

    from dialectid.model import DialectId

    model = DialectId(voc_size_exponent=15)
    labels = model.countries
    assert len(labels) == 20
    assert labels[0] == 'mx'


def test_predict():
    """Check predict on a single text and on a list of two texts."""

    from dialectid.model import DialectId

    model = DialectId(voc_size_exponent=15)
    # A bare string is treated as a single document.
    single = model.predict('comiendo tacos')
    assert single[0] == 'mx'
    texts = ['comiendo tacos', 'tomando vino']
    batch = model.predict(texts)
    assert batch.shape == (2, )

16 changes: 15 additions & 1 deletion dialectid/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,18 @@ def test_BOW():
path = BOW[lang].split('.')
module = '.'.join(path[:-1])
text_repr = importlib.import_module(module)
instance = getattr(text_repr, path[-1])
instance = getattr(text_repr, path[-1])


def test_load_dialectid():
    """Check the models returned by load_dialectid for Spanish."""

    from dialectid.utils import COUNTRIES
    from EvoMSA.utils import Linear

    loaded = utils.load_dialectid('es', 15)
    assert len(loaded) == 20
    first = loaded[0]
    assert isinstance(first, Linear)
    # The last label of every model must follow the order in COUNTRIES.
    for expected, mdl in zip(COUNTRIES['es'], loaded):
        assert mdl.labels[-1] == expected


17 changes: 15 additions & 2 deletions dialectid/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
# SOFTWARE.
# https://www.cia.gov/the-world-factbook/about/archives/2021/field/languages/

from EvoMSA.utils import Download
from microtc.utils import Counter
from EvoMSA.utils import Download, Linear
from microtc.utils import Counter, tweet_iterator
from os.path import join, dirname, isdir, isfile
import gzip
import os
Expand Down Expand Up @@ -145,3 +145,16 @@ def load(filename):
data['counter'] = Counter(_["dict"], _["update_calls"])
return data


def load_dialectid(lang, dim):
    """Load the dialect-identification models for a language.

    The models are read from ``models/dialectid_{lang}_{dim}.json.gz``
    next to this module. When that file is missing it is first downloaded
    from ``BASEURL``; later calls reuse the local copy.

    :param lang: Language code used in the file name.
    :param dim: Second component of the file name (callers pass the
        vocabulary-size exponent).
    :return: List with one ``Linear`` instance per record in the file.
    """

    diroutput = join(dirname(__file__), 'models')
    # exist_ok avoids the check-then-create race of isdir() + mkdir() when
    # several processes load the models at the same time.
    os.makedirs(diroutput, exist_ok=True)
    filename = f'dialectid_{lang}_{dim}.json.gz'
    output = join(diroutput, filename)
    if not isfile(output):
        Download(f'{BASEURL}/{filename}', output)
    return [Linear(**params) for params in tweet_iterator(output)]

0 comments on commit 3153dda

Please sign in to comment.