Source code for spellchecker.core

# -*- coding: utf-8 -*-
# Spellchecker
# Copyright 2008-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
import os
import string
import codecs
import inexactsearch
import urllib

from indexer import DictionaryIndex
from langdetect import _detect_lang

# Public API of this module: the names exported by a star-import.
__all__ = ['Spellchecker', 'getInstance']


class Spellchecker:
    """
    Spellchecker Class. Contains spell checking and suggestion methods.

    Word lists are read from ``dicts/<lang>.dic`` next to this module, using
    the per-language index returned by ``DictionaryIndex.load_index`` to find
    where the words beginning with a given character start in the file.
    Only the words sharing the first character of the word being processed
    are held in memory (``NWORDS``) at any time.
    """

    # (language, first character) that ``NWORDS`` was loaded for.  Kept as a
    # class-level default so instances created without ``__init__`` still
    # have the attribute.
    _nwords_key = None

    def __init__(self):
        # Word list loaded by the most recent lookup, or None.
        self.NWORDS = None
        self._nwords_key = None
        # Language code used by the most recent lookup.
        self.lang = None
        # Maps a language code to its loaded dictionary index.
        self.dictionaries = {}

    def words(self, text):
        """Return the set of whitespace-separated tokens in *text*."""
        return set(text.split())

    def train(self, features=None):
        """
        Load the dictionary index for the current language if needed.

        :param features: unused; kept for interface compatibility.
        """
        if self.lang not in self.dictionaries:
            index = DictionaryIndex()
            self.dictionaries[self.lang] = index.load_index(self.lang + ".dic")

    def get_wordlist(self, word=""):
        """
        Return the dictionary words that start with the first character
        of *word*, for the current language (``self.lang``).

        :param word: the word whose first character selects the section.
        :type word: str.
        :returns: list of words; empty when *word* is empty, when no index
                  is available for the language, or when the first
                  character is not present in the index.
        """
        index = self.dictionaries.get(self.lang, None)
        if index is None:
            self.train()
            index = self.dictionaries.get(self.lang, None)

        words = []
        if word == "":
            return words
        if index is None:
            # No index could be loaded for this language.
            return words

        byte_offset = index.get(word[0], None)
        if byte_offset is None:
            return words

        path = os.path.join(os.path.dirname(__file__), "dicts/" +
                            self.lang + ".dic")
        first_char = word[0]
        # The context manager guarantees the dictionary file is closed.
        with codecs.open(path, "r", encoding="utf-8",
                         errors="ignore") as fp:
            fp.seek(int(byte_offset))
            while True:
                raw_line = fp.readline()
                if not raw_line:
                    # End of file: readline() keeps returning "" from here
                    # on, so stop instead of looping forever.
                    break
                line = raw_line.strip()
                if not line:
                    # Blank lines are not words.
                    continue
                if line[0] != first_char:
                    # Reached the section for the next first character.
                    break
                words.append(line)
        return words

    def levenshtein(self, s1, s2):
        """
        Return the Levenshtein edit distance between *s1* and *s2*.

        Only two rows of the dynamic-programming table are kept at a time.
        """
        if len(s1) < len(s2):
            # Keep s2 as the shorter string so the rows are as short
            # as possible.
            return self.levenshtein(s2, s1)
        if not s1:
            return len(s2)

        # ``range`` (not ``xrange``) works on both Python 2 and Python 3.
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def _select_language(self, word, language):
        # Record the language to work in, detecting it from the word
        # when the caller did not supply one.
        if language is None:
            self.lang = _detect_lang(word)[word]
        else:
            self.lang = language

    def _cached_wordlist(self, word):
        # Return the word list matching the current language AND the first
        # character of ``word``.  The list only covers one first character,
        # so it must be reloaded whenever either of the two changes.
        key = (self.lang, word[0])
        if self.NWORDS is None or self._nwords_key != key:
            self.NWORDS = self.get_wordlist(word)
            self._nwords_key = key
        return self.NWORDS

    def suggest(self, word, language=None, distance=2):
        """
        Gives a list of words similar to the given word

        :param word: The word for which spelling suggestions are required.
        :type word: str.
        :param language: *optional* language code for the word; detected
                         from the word when omitted.
        :type language: str.
        :param distance: maximum difference in length, and maximum edit
                         distance, between the word and a suggestion.
        :type distance: int
        :returns: A list of suggested spellings. If the word is already in
                  the dictionary the word itself is returned; ``None`` is
                  returned for an empty word.

         >>> a.suggest(u"cate")
         [u'cat', u'cater', u'caters', u'cats']

        """
        word = word.strip()
        if word == "":
            return None
        self._select_language(word, language)
        wordlist = self._cached_wordlist(word)
        if word in wordlist:
            return word

        word_length = len(word)
        candidates = []
        for candidate in wordlist:
            # Cheap length test first: skip words whose length differs by
            # more than the threshold before computing the edit distance.
            if abs(len(candidate) - word_length) > distance:
                continue
            if self.levenshtein(candidate, word) <= distance:
                candidates.append(candidate)
        candidates = self.filter_candidates(word, candidates)
        if len(candidates) == 0:
            # Try splitting the word in two to see whether two valid words
            # got merged; offer both a spaced and a hyphenated form.
            for pos in range(2, word_length - 2):
                if self.check(word[:pos], self.lang) \
                        and self.check(word[pos:], self.lang):
                    candidates.append(word[:pos] + " " + word[pos:])
                    candidates.append(word[:pos] + "-" + word[pos:])
        return candidates

    def filter_candidates(self, word, candidates):
        """
        Keep only the candidates that the inexact-search module scores
        at 0.6 or above when compared with *word*.
        """
        isearch = inexactsearch.getInstance()
        #TODO sort by score
        return [candidate for candidate in candidates
                if isearch.compare(word, candidate) >= 0.6]

    def check(self, word, language=None):
        """
        Checks whether given word has correct spelling.

        :param word: The word whose spelling is to be checked.
        :type word: str.
        :param language: *optional* language code for the word; detected
                         from the word when omitted.
        :type language: str.
        :returns: Boolean True or False; ``None`` when the word is empty
                  or only whitespace.

         >>> a.check(u"അംഗദന്‍")
         True
        """
        word = word.strip()
        if word == "":
            return None
        # If it is a number, don't do spellcheck
        if is_number(word):
            return True
        self._select_language(word, language)

        wordlist = self._cached_wordlist(word)
        if wordlist is None:
            # Dictionary not found
            return False
        if word in wordlist:
            return True
        # If the word has letter case (e.g. English), retry with the first
        # letter in lower case: the word may be capitalised only because it
        # starts a sentence.
        if word.upper() != word.lower():
            newword = word[0].lower() + word[1:]
            lowered_list = self._cached_wordlist(newword)
            return lowered_list is not None and newword in lowered_list
        return False

    def strip_punctuations(self, s):
        """Return *s* with every ``string.punctuation`` character removed."""
        exclude = set(string.punctuation)
        return ''.join(ch for ch in s if ch not in exclude)

    def check_batch(self, text, language=None):
        """
        Return a list of misspelled words given a chunk of text.

        The text is URL-unquoted and split on whitespace. Punctuation is
        removed from each token before it is checked, but the token is
        reported in its original form.

        :param text: Input text.
        :type text: str
        :param language: *optional* language code for the text.
        :type language: str.
        :returns: list of misspelled words.

         >>> a.check_batch(u"thire is only one anser")
         [u'thire', u'anser']
        """
        # ``unquote`` lives in different places on Python 2 and Python 3.
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote

        misspelled_words = []
        for token in unquote(text).split():
            bare_word = self.strip_punctuations(token)
            if not bare_word:
                # Token was punctuation only: nothing to spell check.
                continue
            if not self.check(bare_word, language):
                misspelled_words.append(token)
        return misspelled_words

    def get_module_name(self):
        """
        Returns module name.
        """
        return "Spellchecker"

    def get_info(self):
        """
        Returns module info
        """
        return "Indic Spellchecker"


def getInstance():
    """Create and return a new ``Spellchecker`` object."""
    checker = Spellchecker()
    return checker


def is_number(s):
    """
    Tell whether *s* can be converted by ``float``.

    :param s: the value to test.
    :returns: True when ``float(s)`` succeeds, False when it raises
              ``ValueError``.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True
Source code for spellchecker.core

Related Topics

This Page

Navigation

Source code for spellchecker.core

Related Topics

This Page

Quick search