# -*- coding: UTF-8 -*-
# Copyright (C) 2004 Thierry Fromon <from.t@free.fr>
# Copyright (C) 2004, 2006-2007 Juan David Ibáñez Palomar <jdavid@itaapy.com>
# Copyright (C) 2008 Henry Obein <henry@itaapy.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

###########################################################################
# To add a new language, edit the dictionaries below:
#
#   - positive_chars
#
#     Defines special characters (like accented characters) that belong
#     to the language.
#
#   - negative_chars
#
#     Defines special characters (like accented characters) that do not
#     belong to the language.
#
#   - positive_words
#
#     Defines common words that belong to the language.
#
#   - negative_words
#
#     Defines some words that do not belong to the language.
###########################################################################

import unicodedata

# Maps a single character to the list of language codes it votes for.
# guess_language() lowercases every character of the text before looking it
# up here, and adds 1 to the character score of each listed language for
# every occurrence.
positive_chars = {
    u'¡': ['es'],
    u'¿': ['es'],
    u'ä': ['de'],
    u'ß': ['de'],
    u'ç': ['fr'],
    u'ê': ['fr'],
    u'í': ['es'],
    u'ñ': ['es'],
    u'ö': ['de'],
    u'ó': ['es'],
    u'ü': ['de'],
    u'ú': ['es'],
    # Asian languages
    # Japanese : based on particles (hiragana)
    u'の': ['ja'],
    u'は': ['ja'],
    u'で': ['ja'],
    u'に': ['ja'],
    u'が': ['ja'],
    u'へ': ['ja'],
    u'を': ['ja'],
    u'や': ['ja'],
    u'と': ['ja'],
    # Japanese : punctuation
    u'、': ['ja'],
    u'。': ['ja'],
}

# Maps a single character to the list of language codes it votes against.
# guess_language() subtracts 2 from the character score of each listed
# language for every occurrence. No such characters are defined yet.
negative_chars = {}


# Maps a lowercase word to the list of language codes it votes for.
# guess_language() builds words from consecutive alphabetic characters
# (lowercased) and adds 1 to the word score of each listed language every
# time the word is found. A word may vote for several languages at once.
positive_words = {
    u'à': ['fr'],
    u'al': ['es'],
    u'an': ['en'],
    u'and': ['en'],
    u'are': ['en'],
    u'as': ['en'],
    u'aux': ['fr'],
    u'but': ['en'],
    u'como': ['es'],
    u'con': ['es'],
    u'de': ['es', 'fr'],
    u'del': ['es'],
    u'des': ['fr'],
    u'donc': ['fr'],
    u'du': ['fr'],
    u'el': ['es'],
    u'elle': ['fr'],
    u'elles': ['fr'],
    u'es': ['es'],
    u'est': ['fr'],
    u'está': ['es'],
    u'et': ['fr'],
    u'from': ['en'],
    u'hay': ['es'],
    u'he': ['en', 'es'],
    u'i': ['en'],
    u'il': ['fr'],
    u'ils': ['fr'],
    u'in': ['en'],
    u'is': ['en'],
    u'it': ['en'],
    u'je': ['fr'],
    u'las': ['es'],
    u'le': ['es', 'fr'],
    u'lo': ['es'],
    u'les': ['es', 'fr'],
    u'los': ['es'],
    u'mais': ['fr'],
    u'no': ['en', 'es'],
    u'nous': ['fr'],
    u'nueva': ['es'],
    u'o': ['es'],
    u'of': ['en'],
    u'on': ['en'],
    u'or': ['en'],
    u'où': ['fr'],
    u'para': ['es'],
    u'pero': ['es'],
    u'por': ['es'],
    u'que': ['es', 'fr'],
    u'qué': ['es'],
    u'she': ['en'],
    u'su': ['es'],
    u'sur': ['fr'],
    u'that': ['en'],
    u'the': ['en'],
    u'their': ['en'],
    u'this': ['en'],
    u'to': ['en'],
    u'tu': ['es', 'fr'],
    u'un': ['es', 'fr'],
    u'una': ['es'],
    u'une': ['fr'],
    u'vous': ['fr'],
    u'when': ['en'],
    u'where': ['en'],
    u'y': ['es'],
    u'you': ['en'],
    u'your': ['en'],
}


# Maps a lowercase word to the list of language codes it votes against.
# guess_language() subtracts 2 from the word score of each listed language
# every time the word is found.
negative_words = {
    u'du': ['es'],
}


# Upper bound on the number of words guess_language() examines before it
# stops scanning the text. One thousand words should be enough.
MAX_WORDS = 1000


# Inclusive (first, last) code point bounds of the Unicode blocks that
# is_asian_character() accepts, sorted by code point.
# See http://en.wikipedia.org/wiki/Unicode_block for the block charts.
#
# NOTE: the Hangul blocks (Hangul Jamo 1100-11FF, Hangul Compatibility Jamo
# 3130-318F, Hangul Syllables AC00-D7AF) are intentionally not listed; the
# previous implementation had them commented out.
_ASIAN_RANGES = (
    (0x2E80, 0x2EFF),  # CJK Radicals Supplement
    (0x3040, 0x309F),  # Hiragana
    (0x30A0, 0x30FF),  # Katakana
    (0x31C0, 0x31EF),  # CJK Strokes
    (0x31F0, 0x31FF),  # Katakana Phonetic Extensions
    (0x3300, 0x33FF),  # CJK Compatibility
    (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
    # The whole block is covered (it ends at 9FFF, not 9FBF), so ideographs
    # assigned in 9FC0-9FFF are recognized too.
    (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
    (0xFE30, 0xFE4F),  # CJK Compatibility Forms
    (0xFF00, 0xFFEF),  # Halfwidth and Fullwidth Forms
)


def is_asian_character(c):
    """Return True if the character c belongs to one of the CJK / Japanese
    Unicode blocks listed in _ASIAN_RANGES, False otherwise.

    c must be a single character: it is passed to ord(), which raises
    TypeError for anything else (for instance an empty or longer string).
    """
    code = ord(c)
    return any(first <= code <= last for first, last in _ASIAN_RANGES)


def is_punctuation(c):
    """Tell whether c is to be treated as a punctuation symbol.

    Anything that is not alphanumeric (according to isalnum) counts as
    punctuation. An alphanumeric character still counts as punctuation when
    its code point lies in one of these Unicode blocks:

    General Punctuation
    Range: 2000-206F
    http://en.wikipedia.org/wiki/Template:Unicode_chart_General_Punctuation

    CJK Symbols and Punctuation
    Range: 3000-303F
    http://en.wikipedia.org/wiki/Template:Unicode_chart_CJK_Symbols_and_Punctuation
    """
    # The cheap test first: non-alphanumeric input needs no range lookup
    if not c.isalnum():
        return True

    codepoint = ord(c)
    in_general_punctuation = 0x2000 <= codepoint <= 0x206F
    in_cjk_punctuation = 0x3000 <= codepoint <= 0x303F
    return in_general_punctuation or in_cjk_punctuation


###########################################################################
# The Code
###########################################################################
def _add_votes(scores, languages, delta):
    """Add delta to the score of every language code in languages."""
    for language in languages:
        scores[language] = scores.get(language, 0) + delta


def _score_word(word, scores):
    """Apply the positive_words and negative_words votes for word."""
    _add_votes(scores, positive_words.get(word, ()), 1)
    _add_votes(scores, negative_words.get(word, ()), -2)


def guess_language(text):
    """Guess the language the given text is written in.

    The text is scanned character by character (lowercased). Every
    character found in positive_chars / negative_chars, and every word
    (run of alphabetic characters) found in positive_words /
    negative_words, adds 1 to / subtracts 2 from the score of the languages
    listed for it. At most MAX_WORDS words are examined.

    Returns the code of the best scoring language (one of the codes used in
    the tables above, such as 'en' or 'fr'), or None when nothing was
    recognized, when the best score is not positive, or when the best
    language is not far enough ahead of the second one.
    """
    chars = {}
    words = {}

    # Number of chars and words analyzed
    n_chars = 0
    n_words = 0

    # Letters of the word being read (joined once the word is complete,
    # which avoids rebuilding the string for every character)
    letters = []
    for c in text:
        n_chars += 1
        c = c.lower()
        # Characters
        _add_votes(chars, positive_chars.get(c, ()), 1)
        _add_votes(chars, negative_chars.get(c, ()), -2)
        # Words
        if c.isalpha():
            letters.append(c)
        elif letters:
            _score_word(u''.join(letters), words)
            letters = []
            # Check limit
            n_words += 1
            if n_words >= MAX_WORDS:
                break

    # The text may end in the middle of a word (no trailing separator):
    # score that last word too. Nothing is pending here when the loop
    # stopped on the MAX_WORDS limit.
    if letters:
        _score_word(u''.join(letters), words)

    # If we found nothing...
    if not chars and not words:
        return None

    # Depending on the length of the text, the weight given to chars and
    # words changes. The minimum distance between two languages too.
    if n_chars < 75:
        w_weight, c_weight, distance = 1.0, 1.0, 1.0
    elif n_chars < 500:
        w_weight, c_weight, distance = 1.2, 2.0, 2.0
    else:
        w_weight, c_weight, distance = 1.6, 4.0, 4.0

    # Calculate the chances the text is written in any language, sorted
    # from the least to the most probable. There is at least one candidate
    # at this point.
    candidates = sorted(
        (w_weight * words.get(lang, 0) + c_weight * chars.get(lang, 0), lang)
        for lang in set(chars) | set(words))
    best_score, best_lang = candidates[-1]

    # A language backed only by negative (or cancelled out) evidence is not
    # a guess worth returning.
    if best_score <= 0:
        return None

    # Pick the most probable language, unless the distance to the second is
    # too small.
    if len(candidates) == 1 or best_score - candidates[-2][0] >= distance:
        return best_lang

    return None