mirror of
https://github.com/AfterShip/email-verifier.git
synced 2025-02-06 11:01:17 +00:00
MOD: use go-edlib pkg to calculate string distance
This commit is contained in:
parent
5cf81f1568
commit
d1d0ada367
@ -19,6 +19,9 @@ const (
|
||||
disposableDataURL = "https://raw.githubusercontent.com/disposable/disposable-email-domains/master/domains.json"
|
||||
|
||||
gravatarBaseUrl = "https://www.gravatar.com/avatar/"
|
||||
|
||||
gravatarDefaultMd5 = "d5fe5cbcc31cff5f8ac010db72eb000c"
|
||||
|
||||
domainThreshold float32 = 0.82
|
||||
secondLevelThreshold float32 = 0.82
|
||||
topLevelThreshold float32 = 0.6
|
||||
)
|
||||
|
1
go.mod
1
go.mod
@ -3,6 +3,7 @@ module github.com/AfterShip/email-verifier
|
||||
go 1.15
|
||||
|
||||
require (
|
||||
github.com/hbollon/go-edlib v1.3.3
|
||||
github.com/kr/pretty v0.2.1 // indirect
|
||||
github.com/stretchr/testify v1.7.0
|
||||
golang.org/x/net v0.0.0-20201207224615-747e23833adb
|
||||
|
2
go.sum
2
go.sum
@ -2,6 +2,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/h2non/parth v0.0.0-20190131123155-b4df798d6542 h1:2VTzZjLZBgl62/EtslCrtky5vbi9dd7HrQPQIx6wqiw=
|
||||
github.com/h2non/parth v0.0.0-20190131123155-b4df798d6542/go.mod h1:Ow0tF8D4Kplbc8s8sSb3V2oUCygFHVp8gC3Dn6U4MNI=
|
||||
github.com/hbollon/go-edlib v1.3.3 h1:p4mih0w90lLkujQ5coYQ4X9DdE9NhTV/EtOSqdA2DKM=
|
||||
github.com/hbollon/go-edlib v1.3.3/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
|
||||
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
|
||||
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
|
@ -1,44 +0,0 @@
|
||||
package emailverifier
|
||||
|
||||
// levenshteinDistance returns the Levenshtein edit distance between str1 and
// str2: the minimum number of single-rune insertions, deletions and
// substitutions needed to turn one string into the other.
//
// Both inputs are compared rune by rune, so a multi-byte character counts as
// a single edit. The implementation follows
// https://github.com/hbollon/go-edlib/blob/master/levenshtein.go
func levenshteinDistance(str1, str2 string) int {
	// Identical strings need no edits; skip the conversions and the table.
	if str1 == str2 {
		return 0
	}

	// Convert to rune slices so non-ASCII input is handled per character.
	a := []rune(str1)
	b := []rune(str2)

	// Against an empty string the distance is the other string's length.
	if len(a) == 0 {
		return len(b)
	}
	if len(b) == 0 {
		return len(a)
	}

	// column[y] is the distance between a[:y] and the prefix of b handled so
	// far. Only one column of the full matrix is kept and updated in place.
	column := make([]int, len(a)+1)
	for y := range column {
		column[y] = y
	}

	for x := 1; x <= len(b); x++ {
		column[0] = x
		// diag is the value the cell up-and-left of (x, y) had before this
		// pass overwrote it.
		diag := x - 1
		for y := 1; y <= len(a); y++ {
			prev := column[y]

			cost := 0
			if a[y-1] != b[x-1] {
				cost = 1
			}

			best := diag + cost // substitution
			if v := prev + 1; v < best {
				best = v // insert
			}
			if v := column[y-1] + 1; v < best {
				best = v // delete
			}

			column[y] = best
			diag = prev
		}
	}

	return column[len(a)]
}
|
@ -1,32 +0,0 @@
|
||||
package emailverifier
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestLevenshteinDistanceOK1(t *testing.T) {
|
||||
s1, s2 := "gmail.com", "gmaii.com"
|
||||
assert.Equal(t, levenshteinDistance(s1, s2), 1)
|
||||
}
|
||||
|
||||
func TestLevenshteinDistanceOK2(t *testing.T) {
|
||||
s1, s2 := "gmail.com", "gmai.com"
|
||||
assert.Equal(t, levenshteinDistance(s1, s2), 1)
|
||||
}
|
||||
|
||||
func TestLevenshteinDistanceOK3(t *testing.T) {
|
||||
s1, s2 := "", "abcde"
|
||||
assert.Equal(t, levenshteinDistance(s1, s2), 5)
|
||||
}
|
||||
|
||||
func TestLevenshteinDistanceOK4(t *testing.T) {
|
||||
s1, s2 := "abcde", "abcde"
|
||||
assert.Equal(t, levenshteinDistance(s1, s2), 0)
|
||||
}
|
||||
|
||||
func TestLevenshteinDistanceOK5(t *testing.T) {
|
||||
s1, s2 := "distance", "difference"
|
||||
assert.Equal(t, levenshteinDistance(s1, s2), 5)
|
||||
}
|
@ -2,12 +2,8 @@ package emailverifier
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
domainThreshold float32 = 0.82
|
||||
secondLevelThreshold float32 = 0.82
|
||||
topLevelThreshold float32 = 0.6
|
||||
"github.com/hbollon/go-edlib"
|
||||
)
|
||||
|
||||
// SuggestDomain checks if domain has a typo and suggests a similar correct domain from metadata,
|
||||
@ -74,7 +70,7 @@ func findClosestDomain(domain string, domains map[string]bool, threshold float32
|
||||
return domain
|
||||
}
|
||||
|
||||
dist := stringsSimilarity(domain, d, levenshteinDistance(domain, d))
|
||||
dist, _ := edlib.StringsSimilarity(domain, d, edlib.Levenshtein)
|
||||
if dist > maxDist {
|
||||
maxDist = dist
|
||||
closestDomain = d
|
||||
@ -87,12 +83,3 @@ func findClosestDomain(domain string, domains map[string]bool, threshold float32
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// stringsSimilarity returns a similarity index between str1 and str2 derived
// from an edit distance the caller has already computed for the pair.
//
// The index is (L - distance) / L, where L is the length in runes of the
// longer string: 1 means identical, lower values mean less similar. Lengths
// are counted in runes rather than bytes so that they use the same unit as a
// rune-based edit distance; counting bytes would overstate the similarity of
// strings containing multi-byte characters.
//
// Two empty strings are reported as fully similar (1) instead of dividing
// zero by zero.
func stringsSimilarity(str1 string, str2 string, distance int) float32 {
	// Length of the longer input, in runes.
	longest := len([]rune(str1))
	if n := len([]rune(str2)); n > longest {
		longest = n
	}

	// Both inputs empty: nothing differs, and 0/0 would yield NaN.
	if longest == 0 {
		return 1
	}

	return float32(longest-distance) / float32(longest)
}
|
||||
|
@ -6,16 +6,6 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestStringsSimilaritystr1Longer(t *testing.T) {
|
||||
s1, s2 := "Automizely", "AfterShip"
|
||||
assert.Greater(t, stringsSimilarity(s1, s2, 3), float32(0.5))
|
||||
}
|
||||
|
||||
func TestStringsSimilaritystr2Longer(t *testing.T) {
|
||||
s2, s1 := "Automizely", "AfterShip"
|
||||
assert.Less(t, stringsSimilarity(s1, s2, 3), float32(0.8))
|
||||
}
|
||||
|
||||
func TestSuggestDomainOK_HitExactDomain(t *testing.T) {
|
||||
domain := "gmail.com"
|
||||
|
||||
|
21
util.go
21
util.go
@ -67,24 +67,3 @@ func getMD5Hash(str string) (error, string) {
|
||||
}
|
||||
return nil, hex.EncodeToString(h.Sum(nil))
|
||||
}
|
||||
|
||||
// equal reports whether a and b contain the same runes in the same order.
// Slices of different lengths are never equal; two empty or nil slices are.
func equal(a, b []rune) bool {
	if len(a) != len(b) {
		return false
	}
	// Lengths match, so every index of a is valid for b.
	for i := range a {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
|
||||
|
||||
// min returns the smaller of a and b. When both are equal, a is returned.
func min(a int, b int) int {
	if a <= b {
		return a
	}
	return b
}
|
||||
|
51
util_test.go
51
util_test.go
@ -100,54 +100,3 @@ func TestSplitDomainSubDomain(t *testing.T) {
|
||||
assert.Equal(t, sld, "aftership")
|
||||
assert.Equal(t, tld, "com")
|
||||
}
|
||||
|
||||
func TestEqualOK(t *testing.T) {
|
||||
s := "aftership"
|
||||
|
||||
a := make([]rune, len(s))
|
||||
b := make([]rune, len(s))
|
||||
for i, v := range s {
|
||||
a[i] = v
|
||||
b[i] = v
|
||||
}
|
||||
assert.True(t, equal(a, b))
|
||||
|
||||
}
|
||||
|
||||
func TestEqualNotOK1(t *testing.T) {
|
||||
s := "aftership"
|
||||
|
||||
a := make([]rune, len(s))
|
||||
b := make([]rune, len(s))
|
||||
for i, v := range s {
|
||||
a[i] = v
|
||||
}
|
||||
assert.False(t, equal(a, b))
|
||||
}
|
||||
|
||||
func TestEqualNotOK2(t *testing.T) {
|
||||
s1 := "aftership"
|
||||
s2 := "AfterShip"
|
||||
|
||||
a := make([]rune, len(s1))
|
||||
b := make([]rune, len(s2))
|
||||
for i, v := range s1 {
|
||||
a[i] = v
|
||||
}
|
||||
|
||||
for i, v := range s2 {
|
||||
b[i] = v
|
||||
}
|
||||
assert.False(t, equal(a, b))
|
||||
|
||||
}
|
||||
|
||||
func TestMinOK(t *testing.T) {
|
||||
a, b := 10, 9
|
||||
assert.Equal(t, b, min(a, b))
|
||||
}
|
||||
|
||||
func TestMinNotOK(t *testing.T) {
|
||||
a, b := 10, 9
|
||||
assert.NotEqual(t, a, min(a, b))
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user