You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Thanks for sharing your codes!
I'm trying to test my tokenizers with your codes.
In tokenizer_exploration_utils.py, it seems like you're calculating the same fertility twice in plot_fertility(). And I can find the same part inplot_proportion_continuation(), plot_proportion_unks().
In plot_fertility(), line 332-340 and 342-353 look like they're doing the same calculation as getting language, fertility, and model in a dataframe.
Also, the function only returns the second dataframe. Is there any difference between the two parts?
Thanks :)
def plot_fertility(language_ud_dicts):
sns.set(style="whitegrid")
width = 512.14963
sns.set(
rc={
"axes.spines.bottom": True,
"axes.spines.left": True,
"axes.spines.right": False,
"axes.spines.top": False,
"font.size": 12,
"axes.labelsize": 12,
"axes.grid": False,
"legend.fontsize": 10,
"ytick.left": True,
"xtick.major.size": 8,
"ytick.major.size": 8,
"pgf.texsystem": "lualatex",
"text.latex.preamble": r"\usepackage{xcolor}",
"text.usetex": True,
},
style="whitegrid",
)
colors = ["indianred", "skyblue", "dodgerblue", "royalblue", "navy"]
sns.set_palette(sns.color_palette(colors))
sns.set_context("notebook") # use notebook or talk
titles = ["Mono", "mBERT"]
for i, language_ud_dict in enumerate(language_ud_dicts):
languages = []
values = []
for k, v in language_ud_dict.items():
languages.append(r"\textsc{%s}" % k)
values.append(np.mean(v["split_lengths"]))
d = {"languages": languages, "fertility": values}
df = pd.DataFrame(data=d).sort_values(ascending=True, by="fertility")
d = {"Language": [], "Fertility": [], "Model": []}
for i, language_ud_dict in enumerate(language_ud_dicts):
languages = []
values = []
for k, v in language_ud_dict.items():
languages.append(r"\textsc{%s}" % k)
values.append(np.mean(v["split_lengths"]))
d["Language"] += languages
d["Fertility"] += values
d["Model"] += [titles[i] for _ in values]
df = pd.DataFrame(data=d).sort_values(ascending=True, by="Language")
ax2 = sns.catplot(
kind="bar", x="Language", y="Fertility", hue="Model", data=df, legend=False, height=5, aspect=2.1
)
ax2.set_xlabels("")
ax2.set_ylabels(fontsize=30)
ax2.set_xticklabels(fontsize=30)
ax2.set(yticks=[0.0, 0.5, 1.0, 1.5, 2.0])
ax2.set_yticklabels([0.0, 0.5, 1.0, 1.5, 2.0], fontsize=28)
ax2.savefig("fertility.pdf", bbox_inches="tight")
return df
The text was updated successfully, but these errors were encountered:
Thanks for sharing your codes!
I'm trying to test my tokenizers with your codes.
In tokenizer_exploration_utils.py, it seems like you're calculating the same fertility twice in
plot_fertility()
. And I can find the same part inplot_proportion_continuation()
,plot_proportion_unks()
.In
plot_fertility()
, line 332-340 and 342-353 look like they're doing the same calculation as getting language, fertility, and model in a dataframe.Also, the function only returns the second dataframe. Is there any difference between the two parts?
Thanks :)
The text was updated successfully, but these errors were encountered: