Skip to content

Commit

Permalink
re-order features in basic normalization
Browse files: browse the repository at this point in the history
  • Loading branch information
sepehrsoh committed Feb 12, 2025
1 parent f387ca0 commit 1205ce3
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
6 changes: 3 additions & 3 deletions internal/normalization.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,6 @@ func (n Normalize) BasicNormalizer(input string) string {
s = strings.ReplaceAll(s, nullString, emptyString)
stringInput := strings.ToLower(s)

if n.spaceCombiner {
stringInput = replaceMultiSpace(stringInput)
}
if n.urlRemover {
stringInput = removeURLs(stringInput)
}
Expand All @@ -121,6 +118,9 @@ func (n Normalize) BasicNormalizer(input string) string {
if n.endsWithEndOfLineChar {
stringInput = normalizeEndsWithEndOfLineChar(stringInput)
}
if n.spaceCombiner {
stringInput = replaceMultiSpace(stringInput)
}
if n.outerSpaceRemover { // should be last normalization step
stringInput = removeOuterSpace(stringInput)
}
Expand Down
9 changes: 9 additions & 0 deletions internal/normalization_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ func TestNormalize_BasicNormalizer(t *testing.T) {
},
want: "بیست یک هزار",
},
{
name: "Should combine space after remove special characters",
args: args{
input: "مدرسه - ابتدایی",
spaceCombiner: true,
removeSpecialChars: true,
},
want: "مدرسه ابتدایی",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down

0 comments on commit 1205ce3

Please sign in to comment.