Treat fullwidth space characters defined in Unicode as valid whitespace.

Patch by Manabu TERADA.
parent db3e078e
......@@ -94,6 +94,11 @@ _tokenizer_regex = re.compile(r"""
)
""", re.VERBOSE)
# Unicode-aware twin of _tokenizer_regex: same pattern and same flags, plus
# re.UNICODE, so that whitespace defined by Unicode -- e.g. the fullwidth
# (ideographic) space U+3000 -- is treated as valid whitespace when the
# pattern is applied to unicode strings.
_tokenizer_unicode_regex = re.compile(
_tokenizer_regex.pattern, _tokenizer_regex.flags|re.UNICODE)
class QueryParser:
implements(IQueryParser)
......@@ -109,7 +114,13 @@ class QueryParser:
def parseQuery(self, query):
# Lexical analysis.
tokens = _tokenizer_regex.findall(query)
try:
# Try to use unicode and treat fullwidth whitespace as valid one.
if not isinstance(query, unicode):
query = query.decode('utf-8')
tokens = _tokenizer_unicode_regex.findall(query)
except UnicodeDecodeError:
tokens = _tokenizer_regex.findall(query)
self._tokens = tokens
# classify tokens
self._tokentypes = [_keywords.get(token.upper(), _ATOM)
......
......@@ -210,6 +210,18 @@ class TestQueryParser(TestQueryParserBase):
self.expect("foo* bar", AndNode([GlobNode("foo*"),
AtomNode("bar")]))
def test024(self):
    # A byte-string query whose two words are separated by the UTF-8
    # encoding of the fullwidth space (U+3000, bytes E3 80 80) must be
    # split into two atoms combined under an AndNode.
    from Products.ZCTextIndex.ParseTree import AndNode, AtomNode
    expected_tree = AndNode([AtomNode("foo"), AtomNode("bar")])
    self.expect("foo\xe3\x80\x80bar", expected_tree)
def test025(self):
    # A unicode query whose two words are separated by the fullwidth
    # space character U+3000 must be split into two atoms combined
    # under an AndNode.
    from Products.ZCTextIndex.ParseTree import AndNode, AtomNode
    expected_tree = AndNode([AtomNode(u"foo"), AtomNode(u"bar")])
    self.expect(u"foo\u3000bar", expected_tree)
def test101(self):
self.failure("")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment