# -*- coding: utf-8 -*-
# Spellchecker
# Copyright 2008-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
import os
import string
import codecs
import inexactsearch
import urllib
from indexer import DictionaryIndex
from langdetect import _detect_lang
__all__ = ['Spellchecker', 'getInstance']
class Spellchecker(object):
    """
    Spellchecker Class. Contains spell checking and suggestion methods.

    Word lists are read on demand from ``dicts/<lang>.dic`` located next to
    this module. Only the section of the dictionary whose words start with
    the same character as the word being processed is loaded at a time.
    """

    def __init__(self):
        # Word list most recently loaded by suggest()/check().
        self.NWORDS = None
        # Language code of the dictionary currently in use.
        self.lang = None
        # Maps language code -> index object returned by DictionaryIndex.
        self.dictionaries = {}
        # (language, first character) pair that self.NWORDS was loaded for.
        self._nwords_key = None

    def words(self, text):
        """Return the set of whitespace-separated tokens found in *text*."""
        return set(text.split())

    def train(self, features=None):
        """
        Load the dictionary index for ``self.lang`` if it is not loaded yet.

        :param features: unused; kept for interface compatibility.
        """
        if self.lang not in self.dictionaries:
            index = DictionaryIndex()
            self.dictionaries[self.lang] = index.load_index(self.lang + ".dic")

    def get_wordlist(self, word=""):
        """
        Return the dictionary words that start with the first character
        of *word*, for the language in ``self.lang``.

        :param word: word whose first character selects the section to read.
        :returns: list of words (empty if *word* is empty or its first
            character is not in the index), or None if no index could be
            loaded for the language.
        """
        index = self.dictionaries.get(self.lang, None)
        if index is None:
            self.train()
            index = self.dictionaries.get(self.lang, None)
        words = []
        if word == "":
            return words
        if index is None:
            # Dictionary not found; check() treats None as "no dictionary".
            return None
        first_letter = word[0]
        byte_offset = index.get(first_letter, None)
        if byte_offset is None:
            return words
        path = os.path.join(os.path.dirname(__file__), "dicts/" +
                            self.lang + ".dic")
        # The context manager guarantees the file is closed on every path.
        with codecs.open(path, "r", encoding="utf-8", errors="ignore") as fp:
            fp.seek(int(byte_offset))
            while True:
                raw_line = fp.readline()
                if not raw_line:
                    # End of file: readline() keeps returning "" from here
                    # on, so stop instead of looping forever.
                    break
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no word.
                    continue
                if line[0] != first_letter:
                    # Reached the section of the next starting character.
                    break
                words.append(line)
        return words

    def _load_cached_wordlist(self, word):
        """
        Return the word list for (``self.lang``, first character of *word*),
        reloading ``self.NWORDS`` only when that pair changed.
        """
        key = (self.lang, word[0])
        if self.NWORDS is None or getattr(self, "_nwords_key", None) != key:
            self.NWORDS = self.get_wordlist(word)
            self._nwords_key = key
        return self.NWORDS

    def levenshtein(self, s1, s2):
        """
        Return the Levenshtein edit distance between *s1* and *s2*.

        Uses the two-row dynamic programming formulation, so only the
        previous and the current row are kept in memory.
        """
        if len(s1) < len(s2):
            # Keep the shorter string as s2 so rows are as short as possible.
            return self.levenshtein(s2, s1)
        if not s1:
            return len(s2)
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def suggest(self, word, language=None, distance=2):
        """
        Gives a list of words similar to the given word

        :param word: The word for which spelling suggestions are required.
        :type word: str.
        :param language: *optional* language code for the word; detected
            from the word when omitted.
        :type language: str.
        :param distance: suggestions are limited to words whose length is
            word length +/- distance and whose edit distance is at most
            distance.
        :type distance: int
        :returns: A list of suggested spellings. If the word is found in
            the dictionary the word itself is returned; None is returned
            for an empty word.

        >>> a.suggest(u"cate")
        [u'cat', u'cater', u'caters', u'cats']
        """
        word = word.strip()
        if word == "":
            return None
        if language is None:
            self.lang = _detect_lang(word)[word]
        else:
            self.lang = language
        wordlist = self._load_cached_wordlist(word)
        if wordlist is None:
            # No dictionary for this language: nothing to suggest.
            return []
        if word in wordlist:
            return word
        # Skip candidates whose length differs by more than the threshold
        # before paying for the edit distance computation.
        candidates = [
            candidate for candidate in wordlist
            if abs(len(candidate) - len(word)) <= distance
            and self.levenshtein(candidate, word) <= distance
        ]
        candidates = self.filter_candidates(word, candidates)
        if not candidates:
            # Try inserting a space between the letters to see whether two
            # words got merged.
            for pos in range(2, len(word) - 2):
                head, tail = word[:pos], word[pos:]
                if self.check(head, self.lang) \
                        and self.check(tail, self.lang):
                    candidates.append(head + " " + tail)
                    candidates.append(head + "-" + tail)
        return candidates

    def filter_candidates(self, word, candidates):
        """
        Keep only the candidates that the inexact search module scores
        at 0.6 or higher against *word*.
        """
        if not candidates:
            return []
        isearch = inexactsearch.getInstance()
        # TODO sort by score
        return [candidate for candidate in candidates
                if isearch.compare(word, candidate) >= 0.6]

    def check(self, word, language=None):
        """
        Checks whether given word has correct spelling.

        :param word: The word whose spelling is to be checked.
        :type word: str.
        :param language: *optional* language code for the word.
        :type language: str.
        :returns: Boolean True or False; None for an empty word.

        >>> a.check(u"അംഗദന്")
        True
        """
        word = word.strip()
        if word == "":
            return None
        # If it is a number, don't do spellcheck
        if is_number(word):
            return True
        if language is None:
            self.lang = _detect_lang(word)[word]
        else:
            self.lang = language
        wordlist = self._load_cached_wordlist(word)
        if wordlist is None:
            # Dictionary not found
            return False
        if word in wordlist:
            return True
        # If the script has letter case, retry with the first letter in
        # lower case. This happens when the word starts a sentence.
        if word.upper() != word.lower():
            newword = word[0].lower() + word[1:]
            wordlist = self._load_cached_wordlist(newword)
            if wordlist is None:
                return False
            return newword in wordlist
        return False

    def strip_punctuations(self, s):
        """Return *s* with every ``string.punctuation`` character removed."""
        exclude = set(string.punctuation)
        return ''.join(ch for ch in s if ch not in exclude)

    def check_batch(self, text, language=None):
        """
        Return a list of misspelled words given a chunk of text.

        :param text: Input text; URL percent-escapes are decoded first.
        :type text: str
        :param language: *optional* language code for the text.
        :type language: str.
        :returns: list of misspelled words.

        >>> a.check_batch(u"thire is only one anser")
        [u'thire', u'anser']
        """
        # unquote moved to urllib.parse in Python 3.
        try:
            from urllib import unquote
        except ImportError:
            from urllib.parse import unquote
        misspelled_words = []
        for word in unquote(text).split():
            tempword = self.strip_punctuations(word)
            if not tempword:
                # Token made only of punctuation: not a word to check.
                continue
            if not self.check(tempword, language):
                misspelled_words.append(word)
        return misspelled_words

    def get_module_name(self):
        """
        Returns module name.
        """
        return "Spellchecker"

    def get_info(self):
        """
        Returns module info
        """
        return "Indic Spellchecker"
def getInstance():
    """Create and return a new Spellchecker object."""
    instance = Spellchecker()
    return instance
def is_number(s):
    """
    Return True if *s* can be converted by ``float()``, False otherwise.

    Anything ``float()`` accepts counts as a number, which includes
    exponent notation such as "1e3" and the special strings "nan" and
    "inf". Values that ``float()`` rejects by type (for example None)
    yield False instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True