D
dataangel
I was just curious whether there are any Python spell-checker modules around
that can guess at what the user meant to type. I wrote up a quick
function that splits a string into bigrams and then checks how many of
those bigrams are identical to the ones in a given word, which I think is
how Google does it. Support for trigrams etc. could be added, so I'm curious
whether anyone out there has done something more. Here's the script:
def StringsSimilarity(str1, str2):
    """Score how alike two strings are by comparing their bigrams.

    Both strings are stripped of surrounding whitespace and lower-cased,
    the shorter one is right-padded with spaces to the length of the longer
    one, and each is cut into consecutive, non-overlapping two-character
    chunks (a trailing odd character forms a one-character chunk).  The
    chunks are then compared position by position.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The fraction of chunk positions holding identical chunks, as a
        float.  Returns 1.0 when the normalised strings are equal, and 0
        when either normalised string is empty (including when both are)
        or when no chunk matches.
    """
    str1 = str1.strip().lower()
    str2 = str2.strip().lower()

    # An empty string is never considered similar to anything.
    if not (str1 and str2):
        return 0
    # Identical strings are a 100% match; no need to build bigrams.
    if str1 == str2:
        return 1.0

    # Pad to a common length so both strings yield the same number of
    # chunks and can be compared position by position.
    width = max(len(str1), len(str2))
    str1 = str1.ljust(width)
    str2 = str2.ljust(width)

    # Step by 2: chunks do not overlap.  Slicing past the end simply
    # yields the final single character when the length is odd.
    bigrams1 = [str1[i:i + 2] for i in range(0, width, 2)]
    bigrams2 = [str2[i:i + 2] for i in range(0, width, 2)]

    matches = sum(1 for a, b in zip(bigrams1, bigrams2) if a == b)
    if matches == 0:
        return 0
    # float() keeps true division on interpreters where int / int truncates.
    return float(matches) / len(bigrams1)
def StringsSimilar(str1, str2):
    """Report whether two strings count as similar.

    The pair is treated as similar when StringsSimilarity gives them a
    score of at least 50%.
    """
    score = StringsSimilarity(str1, str2)
    return score >= 0.50
that can guess at what the user meant to type. I wrote up a quick
function that splits a string into bigrams and then checks how many of
those bigrams are identical to the ones in a given word, which I think is
how Google does it. Support for trigrams etc. could be added, so I'm curious
whether anyone out there has done something more. Here's the script:
def StringsSimilarity(str1, str2):
    """Score how alike two strings are by comparing their bigrams.

    Both strings are stripped of surrounding whitespace and lower-cased,
    the shorter one is right-padded with spaces to the length of the longer
    one, and each is cut into consecutive, non-overlapping two-character
    chunks (a trailing odd character forms a one-character chunk).  The
    chunks are then compared position by position.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The fraction of chunk positions holding identical chunks, as a
        float.  Returns 1.0 when the normalised strings are equal, and 0
        when either normalised string is empty (including when both are)
        or when no chunk matches.
    """
    str1 = str1.strip().lower()
    str2 = str2.strip().lower()

    # An empty string is never considered similar to anything.
    if not (str1 and str2):
        return 0
    # Identical strings are a 100% match; no need to build bigrams.
    if str1 == str2:
        return 1.0

    # Pad to a common length so both strings yield the same number of
    # chunks and can be compared position by position.
    width = max(len(str1), len(str2))
    str1 = str1.ljust(width)
    str2 = str2.ljust(width)

    # Step by 2: chunks do not overlap.  Slicing past the end simply
    # yields the final single character when the length is odd.
    bigrams1 = [str1[i:i + 2] for i in range(0, width, 2)]
    bigrams2 = [str2[i:i + 2] for i in range(0, width, 2)]

    matches = sum(1 for a, b in zip(bigrams1, bigrams2) if a == b)
    if matches == 0:
        return 0
    # float() keeps true division on interpreters where int / int truncates.
    return float(matches) / len(bigrams1)
def StringsSimilar(str1, str2):
    """Report whether two strings count as similar.

    The pair is treated as similar when StringsSimilarity gives them a
    score of at least 50%.
    """
    score = StringsSimilarity(str1, str2)
    return score >= 0.50