Commit 13bbc206 authored by Stefan Behnel

ignore UTF-8 BOMs at the beginning of source files

parent fe78837b
...@@ -215,6 +215,17 @@ def detect_opened_file_encoding(f): ...@@ -215,6 +215,17 @@ def detect_opened_file_encoding(f):
return encoding.group(1) return encoding.group(1)
return "UTF-8" return "UTF-8"
def skip_bom(f):
    """
    Read past a BOM at the beginning of a source file.

    Reads a single character from ``f``.  If that character is not
    U+FEFF, the stream is rewound to offset 0 so that nothing is lost;
    otherwise the stream is left positioned right after the BOM.

    This could be added to the scanner, but it's *substantially* easier
    to keep it at this level.
    """
    if f.read(1) != u'\uFEFF':
        # Not a BOM: rewind so the caller sees the file from the start.
        f.seek(0)
normalise_newlines = re.compile(u'\r\n?|\n').sub normalise_newlines = re.compile(u'\r\n?|\n').sub
...@@ -264,6 +275,7 @@ if sys.version_info >= (2,6): ...@@ -264,6 +275,7 @@ if sys.version_info >= (2,6):
except ImportError: except ImportError:
pass pass
def open_source_file(source_filename, mode="r", def open_source_file(source_filename, mode="r",
encoding=None, error_handling=None, encoding=None, error_handling=None,
require_normalised_newlines=True): require_normalised_newlines=True):
...@@ -272,8 +284,11 @@ def open_source_file(source_filename, mode="r", ...@@ -272,8 +284,11 @@ def open_source_file(source_filename, mode="r",
# it's UTF-8. # it's UTF-8.
f = open_source_file(source_filename, encoding="UTF-8", mode=mode, error_handling='ignore') f = open_source_file(source_filename, encoding="UTF-8", mode=mode, error_handling='ignore')
encoding = detect_opened_file_encoding(f) encoding = detect_opened_file_encoding(f)
if encoding == "UTF-8" and error_handling=='ignore' and require_normalised_newlines: if (encoding == "UTF-8"
and error_handling == 'ignore'
and require_normalised_newlines):
f.seek(0) f.seek(0)
skip_bom(f)
return f return f
else: else:
f.close() f.close()
...@@ -290,7 +305,7 @@ def open_source_file(source_filename, mode="r", ...@@ -290,7 +305,7 @@ def open_source_file(source_filename, mode="r",
pass pass
# #
if io is not None: if io is not None:
return io.open(source_filename, mode=mode, stream = io.open(source_filename, mode=mode,
encoding=encoding, errors=error_handling) encoding=encoding, errors=error_handling)
else: else:
# codecs module doesn't have universal newline support # codecs module doesn't have universal newline support
...@@ -298,8 +313,10 @@ def open_source_file(source_filename, mode="r", ...@@ -298,8 +313,10 @@ def open_source_file(source_filename, mode="r",
encoding=encoding, errors=error_handling) encoding=encoding, errors=error_handling)
if require_normalised_newlines: if require_normalised_newlines:
stream = NormalisedNewlineStream(stream) stream = NormalisedNewlineStream(stream)
skip_bom(stream)
return stream return stream
def open_source_from_loader(loader, def open_source_from_loader(loader,
source_filename, source_filename,
encoding=None, error_handling=None, encoding=None, error_handling=None,
......
...@@ -277,6 +277,9 @@ TEST_SUPPORT_DIR = 'testsupport' ...@@ -277,6 +277,9 @@ TEST_SUPPORT_DIR = 'testsupport'
BACKENDS = ['c', 'cpp'] BACKENDS = ['c', 'cpp']
# The three bytes of a UTF-8 encoded BOM (EF BB BF) as a three-character
# text string.  parse_tags() opens files as ISO-8859-1, where each byte
# decodes to the character with the same code point, so this is how a
# UTF-8 BOM shows up in the lines it reads.
# NOTE(review): the raw-string encode/decode round trip is presumably
# there to get a text string on both Python 2 and 3 -- confirm.
UTF8_BOM_BYTES = r'\xef\xbb\xbf'.encode('ISO-8859-1').decode('unicode_escape')
def memoize(f): def memoize(f):
uncomputed = object() uncomputed = object()
f._cache = {} f._cache = {}
...@@ -287,13 +290,15 @@ def memoize(f): ...@@ -287,13 +290,15 @@ def memoize(f):
return res return res
return func return func
@memoize @memoize
def parse_tags(filepath): def parse_tags(filepath):
tags = defaultdict(list) tags = defaultdict(list)
f = io_open(filepath, encoding='ISO-8859-1', errors='replace') f = io_open(filepath, encoding='ISO-8859-1', errors='ignore')
try: try:
for line in f: for line in f:
line = line.strip() # ignore BOM-like bytes and whitespace
line = line.lstrip(UTF8_BOM_BYTES).strip()
if not line: if not line:
continue continue
if line[0] != '#': if line[0] != '#':
......
# coding: utf-8
# mode: compile
# this file starts with a UTF-8 encoded BOM
# the only thing we test is that it properly compiles
def test():
    # Intentionally empty: per the header comments, the only thing
    # checked is that this BOM-prefixed file compiles.
    pass
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment