1
0
mirror of https://github.com/AfterShip/email-verifier.git synced 2025-02-06 11:01:17 +00:00

MOD: use go-edlib pkg to calculate string distance

This commit is contained in:
Herbert Lu 2021-04-12 11:02:06 +08:00
parent 5cf81f1568
commit d1d0ada367
9 changed files with 10 additions and 175 deletions

View File

@ -19,6 +19,9 @@ const (
disposableDataURL = "https://raw.githubusercontent.com/disposable/disposable-email-domains/master/domains.json"
gravatarBaseUrl = "https://www.gravatar.com/avatar/"
gravatarDefaultMd5 = "d5fe5cbcc31cff5f8ac010db72eb000c"
domainThreshold float32 = 0.82
secondLevelThreshold float32 = 0.82
topLevelThreshold float32 = 0.6
)

1
go.mod
View File

@ -3,6 +3,7 @@ module github.com/AfterShip/email-verifier
go 1.15
require (
github.com/hbollon/go-edlib v1.3.3
github.com/kr/pretty v0.2.1 // indirect
github.com/stretchr/testify v1.7.0
golang.org/x/net v0.0.0-20201207224615-747e23833adb

2
go.sum
View File

@ -2,6 +2,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/h2non/parth v0.0.0-20190131123155-b4df798d6542 h1:2VTzZjLZBgl62/EtslCrtky5vbi9dd7HrQPQIx6wqiw=
github.com/h2non/parth v0.0.0-20190131123155-b4df798d6542/go.mod h1:Ow0tF8D4Kplbc8s8sSb3V2oUCygFHVp8gC3Dn6U4MNI=
github.com/hbollon/go-edlib v1.3.3 h1:p4mih0w90lLkujQ5coYQ4X9DdE9NhTV/EtOSqdA2DKM=
github.com/hbollon/go-edlib v1.3.3/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=

View File

@ -1,44 +0,0 @@
package emailverifier
// levenshteinDistance returns the Levenshtein edit distance between str1 and
// str2: the minimum number of single-rune insertions, deletions and
// substitutions needed to turn one string into the other.
//
// Both strings are compared rune by rune, so a multi-byte character counts as
// a single edit. The algorithm follows
// https://github.com/hbollon/go-edlib/blob/master/levenshtein.go
func levenshteinDistance(str1, str2 string) int {
	// Identical strings need no edits. Checking before the rune conversion
	// skips both slice allocations on the exact-match path.
	if str1 == str2 {
		return 0
	}

	// Convert string parameters to rune slices to be compatible with non-ASCII.
	runeStr1 := []rune(str1)
	runeStr2 := []rune(str2)
	runeStr1len := len(runeStr1)
	runeStr2len := len(runeStr2)

	// Against an empty string the distance is the other string's length.
	if runeStr1len == 0 {
		return runeStr2len
	}
	if runeStr2len == 0 {
		return runeStr1len
	}

	// column holds one column of the edit-distance matrix; it is overwritten
	// in place for every rune of str2, so only O(len(str1)) memory is used.
	column := make([]int, runeStr1len+1)
	for y := 1; y <= runeStr1len; y++ {
		column[y] = y
	}
	for x := 1; x <= runeStr2len; x++ {
		column[0] = x
		// lastkey is the diagonal neighbour (previous column, previous row).
		lastkey := x - 1
		for y := 1; y <= runeStr1len; y++ {
			oldkey := column[y]

			// Substitution is free when the two runes already match.
			best := lastkey
			if runeStr1[y-1] != runeStr2[x-1] {
				best++
			}
			if ins := column[y] + 1; ins < best { // insert
				best = ins
			}
			if del := column[y-1] + 1; del < best { // delete
				best = del
			}

			column[y] = best
			lastkey = oldkey
		}
	}
	return column[runeStr1len]
}

View File

@ -1,32 +0,0 @@
package emailverifier
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestLevenshteinDistanceOK1 checks that a single substituted character
// yields a distance of 1.
func TestLevenshteinDistanceOK1(t *testing.T) {
	s1, s2 := "gmail.com", "gmaii.com"
	// testify's Equal takes the expected value first, then the actual one.
	assert.Equal(t, 1, levenshteinDistance(s1, s2))
}
// TestLevenshteinDistanceOK2 checks that a single missing character yields a
// distance of 1.
func TestLevenshteinDistanceOK2(t *testing.T) {
	s1, s2 := "gmail.com", "gmai.com"
	// testify's Equal takes the expected value first, then the actual one.
	assert.Equal(t, 1, levenshteinDistance(s1, s2))
}
// TestLevenshteinDistanceOK3 checks that the distance from an empty string is
// the length of the other string.
func TestLevenshteinDistanceOK3(t *testing.T) {
	s1, s2 := "", "abcde"
	// testify's Equal takes the expected value first, then the actual one.
	assert.Equal(t, 5, levenshteinDistance(s1, s2))
}
// TestLevenshteinDistanceOK4 checks that identical strings have a distance
// of 0.
func TestLevenshteinDistanceOK4(t *testing.T) {
	s1, s2 := "abcde", "abcde"
	// testify's Equal takes the expected value first, then the actual one.
	assert.Equal(t, 0, levenshteinDistance(s1, s2))
}
// TestLevenshteinDistanceOK5 checks a pair of words that needs several mixed
// edits.
func TestLevenshteinDistanceOK5(t *testing.T) {
	s1, s2 := "distance", "difference"
	// testify's Equal takes the expected value first, then the actual one.
	assert.Equal(t, 5, levenshteinDistance(s1, s2))
}

View File

@ -2,12 +2,8 @@ package emailverifier
import (
"strings"
)
var (
domainThreshold float32 = 0.82
secondLevelThreshold float32 = 0.82
topLevelThreshold float32 = 0.6
"github.com/hbollon/go-edlib"
)
// SuggestDomain checks if domain has a typo and suggests a similar correct domain from metadata,
@ -74,7 +70,7 @@ func findClosestDomain(domain string, domains map[string]bool, threshold float32
return domain
}
dist := stringsSimilarity(domain, d, levenshteinDistance(domain, d))
dist, _ := edlib.StringsSimilarity(domain, d, edlib.Levenshtein)
if dist > maxDist {
maxDist = dist
closestDomain = d
@ -87,12 +83,3 @@ func findClosestDomain(domain string, domains map[string]bool, threshold float32
return ""
}
// stringsSimilarity returns a similarity index in [0, 1] between str1 and
// str2, derived from an edit distance the caller has already computed.
//
// distance is expected to be counted in runes, so the longer string's length
// is also measured in runes; mixing a rune-based distance with a byte length
// would overstate the similarity of non-ASCII strings. Two empty strings are
// reported as fully similar (1).
func stringsSimilarity(str1 string, str2 string, distance int) float32 {
	// len([]rune(s)) counts runes; the compiler evaluates it without
	// allocating the intermediate slice.
	longest := len([]rune(str1))
	if n := len([]rune(str2)); n > longest {
		longest = n
	}

	// Both strings are empty: they are identical, and dividing would give
	// 0/0 = NaN, which is outside the documented range.
	if longest == 0 {
		return 1
	}

	// Share of the longer string that did not have to be edited.
	return float32(longest-distance) / float32(longest)
}

View File

@ -6,16 +6,6 @@ import (
"github.com/stretchr/testify/assert"
)
func TestStringsSimilaritystr1Longer(t *testing.T) {
s1, s2 := "Automizely", "AfterShip"
assert.Greater(t, stringsSimilarity(s1, s2, 3), float32(0.5))
}
func TestStringsSimilaritystr2Longer(t *testing.T) {
s2, s1 := "Automizely", "AfterShip"
assert.Less(t, stringsSimilarity(s1, s2, 3), float32(0.8))
}
func TestSuggestDomainOK_HitExactDomain(t *testing.T) {
domain := "gmail.com"

21
util.go
View File

@ -67,24 +67,3 @@ func getMD5Hash(str string) (error, string) {
}
return nil, hex.EncodeToString(h.Sum(nil))
}
// equal reports whether the rune slices a and b have the same length and hold
// the same runes in the same order.
func equal(a, b []rune) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for idx := 0; idx < n; idx++ {
		if a[idx] != b[idx] {
			return false
		}
	}
	return true
}
// min returns the smaller of the two integers a and b.
func min(a int, b int) int {
	if a <= b {
		return a
	}
	return b
}

View File

@ -100,54 +100,3 @@ func TestSplitDomainSubDomain(t *testing.T) {
assert.Equal(t, sld, "aftership")
assert.Equal(t, tld, "com")
}
// TestEqualOK checks that two rune slices built from the same string compare
// as equal.
func TestEqualOK(t *testing.T) {
	word := "aftership"
	left := []rune(word)
	right := []rune(word)
	assert.True(t, equal(left, right))
}
// TestEqualNotOK1 checks that a populated rune slice differs from a
// zero-valued slice of the same length.
func TestEqualNotOK1(t *testing.T) {
	word := "aftership"
	filled := []rune(word)
	// Same length as filled, but every element is left at the zero rune.
	zeroed := make([]rune, len(word))
	assert.False(t, equal(filled, zeroed))
}
// TestEqualNotOK2 checks that the comparison is case-sensitive: two slices of
// equal length that differ only in letter case are not equal.
func TestEqualNotOK2(t *testing.T) {
	lower := []rune("aftership")
	mixed := []rune("AfterShip")
	assert.False(t, equal(lower, mixed))
}
// TestMinOK checks that min returns the smaller of its two arguments.
func TestMinOK(t *testing.T) {
	larger, smaller := 10, 9
	got := min(larger, smaller)
	assert.Equal(t, smaller, got)
}
// TestMinNotOK checks that min does not return the larger of its two
// arguments.
func TestMinNotOK(t *testing.T) {
	larger, smaller := 10, 9
	got := min(larger, smaller)
	assert.NotEqual(t, larger, got)
}