Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically detect character encoding of YAML files and ignore files #630

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ jobs:
- run: pip install .
# https://github.com/AndreMiras/coveralls-python-action/issues/18
- run: echo -e "[run]\nrelative_files = True" > .coveragerc
- run: coverage run -m unittest discover
- run: >-
python
-X warn_default_encoding
-W error::EncodingWarning
-m coverage
run
-m unittest
discover
- name: Coveralls
uses: AndreMiras/coveralls-python-action@develop
4 changes: 4 additions & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ or:

.. note:: However, this is mutually exclusive with the ``ignore`` key.

.. note:: The files named by ``ignore-from-file`` must be encoded in UTF-8,
   UTF-16 or UTF-32. Additionally, each of them must start with either an
   ASCII character or a byte order mark.

If you need to know the exact list of files that yamllint would process,
without really linting them, you can use ``--list-files``:

Expand Down
183 changes: 149 additions & 34 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,169 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import codecs
import contextlib
from io import StringIO
import os
import shutil
import sys
import tempfile
import unittest
import warnings
from codecs import CodecInfo

import yaml

from yamllint import linter
from yamllint.config import YamlLintConfig


# Encoding related stuff:
# Names of the UTF codecs exercised by the tests: every combination of code
# unit width, byte order and optional ``_sig`` suffix for UTF-32 and UTF-16
# (widest first), followed by the two UTF-8 flavours.
UTF_CODECS = tuple(
    f'utf_{width}_{byte_order}{sig_suffix}'
    for width in (32, 16)
    for byte_order in ('be', 'le')
    for sig_suffix in ('', '_sig')
) + ('utf_8', 'utf_8_sig')


def encode_utf_32_be_sig(obj, errors='strict'):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like errors='strict' is already the default: https://docs.python.org/3/library/codecs.html#codecs.encode and no yamllint code uses this argument anywhere.
But I'm OK to keep it if you think it's better to be explicit.

return (
codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors),
len(obj)
)


def encode_utf_32_le_sig(obj, errors='strict'):
    """
    Encode ``obj`` as little-endian UTF-32 preceded by a byte order mark.

    ``errors`` is forwarded to ``codecs.encode()``. Returns a
    ``(encoded_bytes, len(obj))`` tuple.
    """
    payload = codecs.encode(obj, 'utf_32_le', errors)
    return codecs.BOM_UTF32_LE + payload, len(obj)


def encode_utf_16_be_sig(obj, errors='strict'):
    """
    Encode ``obj`` as big-endian UTF-16 preceded by a byte order mark.

    ``errors`` is forwarded to ``codecs.encode()``. Returns a
    ``(encoded_bytes, len(obj))`` tuple.
    """
    payload = codecs.encode(obj, 'utf_16_be', errors)
    return codecs.BOM_UTF16_BE + payload, len(obj)


def encode_utf_16_le_sig(obj, errors='strict'):
    """
    Encode ``obj`` as little-endian UTF-16 preceded by a byte order mark.

    ``errors`` is forwarded to ``codecs.encode()``. Returns a
    ``(encoded_bytes, len(obj))`` tuple.
    """
    payload = codecs.encode(obj, 'utf_16_le', errors)
    return codecs.BOM_UTF16_LE + payload, len(obj)


# Maps the name of each test-only codec to its CodecInfo. The encoder is the
# matching ``encode_*_sig`` function; the decoder is the built-in one for the
# same code unit width.
test_codec_infos = {
    codec_name: CodecInfo(encoder, codecs.getdecoder(decoder_name))
    for codec_name, encoder, decoder_name in (
        ('utf_32_be_sig', encode_utf_32_be_sig, 'utf_32'),
        ('utf_32_le_sig', encode_utf_32_le_sig, 'utf_32'),
        ('utf_16_be_sig', encode_utf_16_be_sig, 'utf_16'),
        ('utf_16_le_sig', encode_utf_16_le_sig, 'utf_16'),
    )
}
Comment on lines +76 to +81
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand these # noqa originate from #630 (comment), but instead of overriding the linter, can you use this:

test_codec_infos = {
    'utf_32_be_sig':
    CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')),

or this:

test_codec_infos = {
    'utf_32_be_sig': CodecInfo(encode_utf_32_be_sig,
                               codecs.getdecoder('utf_32')),

?



def _find_test_codec(codec_name):
    """Codec search function: return the test CodecInfo for a name or None."""
    return test_codec_infos.get(codec_name)


def register_test_codecs():
    """Make the codecs in test_codec_infos reachable via codecs.lookup()."""
    # A single module-level function is registered (rather than
    # ``test_codec_infos.get``) so that unregister_test_codecs() can hand the
    # very same object back to the codecs module.
    codecs.register(_find_test_codec)


def unregister_test_codecs():
    """
    Undo register_test_codecs() where the running Python supports it.

    On Python versions older than 3.10 a warning is emitted instead.
    """
    if sys.version_info >= (3, 10, 0):
        # codecs.unregister() only removes the exact object that was
        # registered. ``test_codec_infos.get`` produces a new bound-method
        # object on every attribute access, so passing it here would
        # silently leave the search function registered.
        codecs.unregister(_find_test_codec)
    else:
        warnings.warn(
            "This version of Python doesn’t allow us to unregister codecs.",
            stacklevel=1
        )


def is_test_codec(codec):
    """Return True if ``codec`` is one of the names in test_codec_infos."""
    return codec in test_codec_infos


def test_codec_built_in_equivalent(test_codec):
    """
    Return the name of the built-in codec corresponding to a test codec.

    The name is obtained by deleting every ``_sig``, ``_be`` and ``_le``
    marker from ``test_codec``; ``utf_32_be_sig`` becomes ``utf_32``.
    """
    built_in = test_codec
    for marker in ('_sig', '_be', '_le'):
        built_in = built_in.replace(marker, '')
    return built_in


# The ``test_`` prefix makes pytest treat this helper as a test function in
# any module that imports it (and then fail looking for a ``test_codec``
# fixture). Opt out of collection explicitly.
test_codec_built_in_equivalent.__test__ = False


def uses_bom(codec):
    """
    Return True if the name ``codec`` ends with ``_32``, ``_16`` or ``_sig``.

    Those are the codec names for which the encoded output is expected to
    start with a byte order mark; names ending in an explicit byte order
    (``_be``/``_le``) or in ``_8`` yield False.
    """
    # str.endswith() accepts a tuple of candidate suffixes.
    return codec.endswith(('_32', '_16', '_sig'))


def encoding_detectable(string, codec):
    """
    Return True if the encoding can be detected once string is encoded.

    Encoding detection only works if you’re using a BOM or the first character
    is ASCII. See yamllint.decoder.auto_decode()’s docstring.

    An empty string is therefore only detectable with a codec for which
    uses_bom() is true.
    """
    if uses_bom(codec):
        return True
    # The emptiness check must come first: ''.isascii() is True.
    return bool(string) and string[0].isascii()


# Workspace related stuff:
class Blob:
    """
    Text together with the name of the codec it should be encoded with.

    build_temp_workspace() writes a Blob to disk as
    ``text.encode(encoding)``.
    """

    def __init__(self, text, encoding):
        self.text = text          # the str to be written
        self.encoding = encoding  # codec name handed to str.encode()


def build_temp_workspace(files):
    """
    Create a temporary directory and populate it as described by ``files``.

    ``files`` maps paths (relative to the new directory) to their content:

    - a list creates a directory at that path;
    - a str starting with ``symlink://`` creates a symbolic link pointing at
      the remainder of the string;
    - a Blob is written as its text encoded with its encoding;
    - any other str is written encoded as UTF-8;
    - anything else (e.g. bytes) is written unchanged in binary mode.

    Missing parent directories are created. Returns the path of the new
    directory, which this function does not remove.
    """
    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
    symlink_prefix = 'symlink://'

    for path, content in files.items():
        path = os.fsencode(os.path.join(tempdir, path))
        # exist_ok replaces a separate os.path.exists() check, which left a
        # window between checking and creating.
        os.makedirs(os.path.dirname(path), exist_ok=True)

        if isinstance(content, list):
            os.mkdir(path)
        elif isinstance(content, str) and content.startswith(symlink_prefix):
            os.symlink(content[len(symlink_prefix):], path)
        else:
            if isinstance(content, Blob):
                content = content.text.encode(content.encoding)
            elif isinstance(content, str):
                content = content.encode('utf_8')
            # Always write bytes so the file content never depends on the
            # locale's default encoding.
            with open(path, 'wb') as f:
                f.write(content)

    return tempdir


@contextlib.contextmanager
def temp_workspace(files):
    """
    Provide a temporary workspace that is automatically cleaned up.

    The workspace is created with build_temp_workspace(files) and becomes the
    current working directory for the duration of the ``with`` block.
    """
    original_cwd = os.getcwd()
    workspace = build_temp_workspace(files)

    try:
        os.chdir(workspace)
        yield
    finally:
        # Restore the previous working directory before deleting the
        # workspace, so the process is not left inside a removed directory.
        os.chdir(original_cwd)
        shutil.rmtree(workspace)


def temp_workspace_with_files_in_many_codecs(path_template, text):
    """
    Build a workspace description holding ``text`` once per usable codec.

    For every codec in UTF_CODECS for which encoding_detectable(text, codec)
    is true, the result maps ``path_template.format(codec)`` to a Blob of
    ``text`` in that codec. Only the dict is returned; nothing is written to
    disk here.
    """
    return {
        path_template.format(codec): Blob(text, codec)
        for codec in UTF_CODECS
        if encoding_detectable(text, codec)
    }


# Miscellaneous stuff:
class RuleTestCase(unittest.TestCase):
def build_fake_config(self, conf):
if conf is None:
Expand Down Expand Up @@ -81,37 +230,3 @@ def __exit__(self, *exc_info):
@property
def returncode(self):
    """Return the ``code`` of the exception held by ``self._raises_ctx``."""
    return self._raises_ctx.exception.code


def build_temp_workspace(files):
    """
    Create a temporary directory and populate it as described by ``files``.

    ``files`` maps paths (relative to the new directory) to their content:

    - a list creates a directory at that path;
    - a str starting with ``symlink://`` creates a symbolic link pointing at
      the remainder of the string;
    - any other str is written encoded as UTF-8;
    - bytes are written unchanged;
    - any other object is expected to provide ``text`` and ``encoding``
      attributes (as Blob does) and is written as
      ``text.encode(encoding)``.

    Missing parent directories are created. Returns the path of the new
    directory, which this function does not remove.
    """
    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
    symlink_prefix = 'symlink://'

    for path, content in files.items():
        # os.fsencode() uses the filesystem encoding, which is what the os
        # functions expect for bytes paths; a hard-coded UTF-8 encode is not
        # guaranteed to match it.
        path = os.fsencode(os.path.join(tempdir, path))
        os.makedirs(os.path.dirname(path), exist_ok=True)

        if isinstance(content, list):
            os.mkdir(path)
        elif isinstance(content, str) and content.startswith(symlink_prefix):
            os.symlink(content[len(symlink_prefix):], path)
        else:
            if isinstance(content, str):
                content = content.encode('utf_8')
            elif not isinstance(content, bytes):
                content = content.text.encode(content.encoding)
            # Writing bytes keeps the file content independent of the
            # locale's default text encoding and newline translation.
            with open(path, 'wb') as f:
                f.write(content)

    return tempdir


@contextlib.contextmanager
def temp_workspace(files):
    """
    Provide a temporary workspace that is automatically cleaned up.

    The workspace is created with build_temp_workspace(files) and becomes the
    current working directory for the duration of the ``with`` block.
    """
    original_cwd = os.getcwd()
    workspace = build_temp_workspace(files)

    try:
        os.chdir(workspace)
        yield
    finally:
        # Restore the previous working directory before deleting the
        # workspace, so the process is not left inside a removed directory.
        os.chdir(original_cwd)
        shutil.rmtree(workspace)
83 changes: 73 additions & 10 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,14 @@
import unittest
from io import StringIO

from tests.common import build_temp_workspace, RunContext, temp_workspace
from tests.common import (
build_temp_workspace,
register_test_codecs,
RunContext,
temp_workspace,
unregister_test_codecs,
temp_workspace_with_files_in_many_codecs,
)

from yamllint import cli, config

Expand Down Expand Up @@ -296,14 +303,14 @@ def test_run_with_implicit_extends_config(self):
(ctx.returncode, ctx.stdout, ctx.stderr), (0, expected_out, ''))

def test_run_with_config_file(self):
with open(os.path.join(self.wd, 'config'), 'w') as f:
with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: disable}')

with RunContext(self) as ctx:
cli.run(('-c', f.name, os.path.join(self.wd, 'a.yaml')))
self.assertEqual(ctx.returncode, 0)

with open(os.path.join(self.wd, 'config'), 'w') as f:
with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: enable}')

with RunContext(self) as ctx:
Expand All @@ -319,14 +326,14 @@ def test_run_with_user_global_config_file(self):
self.addCleanup(os.environ.__delitem__, 'HOME')
os.environ['HOME'] = home

with open(config, 'w') as f:
with open(config, 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: disable}')

with RunContext(self) as ctx:
cli.run((os.path.join(self.wd, 'a.yaml'), ))
self.assertEqual(ctx.returncode, 0)

with open(config, 'w') as f:
with open(config, 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: enable}')

with RunContext(self) as ctx:
Expand All @@ -339,7 +346,8 @@ def test_run_with_user_xdg_config_home_in_env(self):
with tempfile.TemporaryDirectory('w') as d:
os.environ['XDG_CONFIG_HOME'] = d
os.makedirs(os.path.join(d, 'yamllint'))
with open(os.path.join(d, 'yamllint', 'config'), 'w') as f:
path = os.path.join(d, 'yamllint', 'config')
with open(path, 'w', encoding='utf_8') as f:
f.write('extends: relaxed')
with RunContext(self) as ctx:
cli.run(('-f', 'parsable', os.path.join(self.wd, 'warn.yaml')))
Expand All @@ -349,15 +357,15 @@ def test_run_with_user_xdg_config_home_in_env(self):
def test_run_with_user_yamllint_config_file_in_env(self):
self.addCleanup(os.environ.__delitem__, 'YAMLLINT_CONFIG_FILE')

with tempfile.NamedTemporaryFile('w') as f:
with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
os.environ['YAMLLINT_CONFIG_FILE'] = f.name
f.write('rules: {trailing-spaces: disable}')
f.flush()
with RunContext(self) as ctx:
cli.run((os.path.join(self.wd, 'a.yaml'), ))
self.assertEqual(ctx.returncode, 0)

with tempfile.NamedTemporaryFile('w') as f:
with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
os.environ['YAMLLINT_CONFIG_FILE'] = f.name
f.write('rules: {trailing-spaces: enable}')
f.flush()
Expand Down Expand Up @@ -499,8 +507,13 @@ def test_run_default_format_output_in_tty(self):
path = os.path.join(self.wd, 'a.yaml')

# Create a pseudo-TTY and redirect stdout to it
old_stdout, old_stderr = sys.stdout, sys.stderr
master, slave = pty.openpty()
sys.stdout = sys.stderr = os.fdopen(slave, 'w')
sys.stdout = sys.stderr = os.fdopen(
slave,
'w',
encoding=os.device_encoding(slave)
)

with self.assertRaises(SystemExit) as ctx:
cli.run((path, ))
Expand All @@ -509,7 +522,7 @@ def test_run_default_format_output_in_tty(self):
self.assertEqual(ctx.exception.code, 1)

# Read output from TTY
output = os.fdopen(master, 'r')
output = os.fdopen(master, 'r', encoding=os.device_encoding(master))
flag = fcntl.fcntl(master, fcntl.F_GETFD)
fcntl.fcntl(master, fcntl.F_SETFL, flag | os.O_NONBLOCK)

Expand All @@ -518,6 +531,7 @@ def test_run_default_format_output_in_tty(self):
sys.stdout.close()
sys.stderr.close()
output.close()
sys.stdout, sys.stderr = old_stdout, old_stderr

self.assertEqual(out, (
f'\033[4m{path}\033[0m\n'
Expand Down Expand Up @@ -817,3 +831,52 @@ def test_multiple_parent_config_file(self):
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
(0, './4spaces.yml:2:5: [warning] wrong indentation: '
'expected 3 but found 4 (indentation)\n', ''))


class CommandLineEncodingTestCase(unittest.TestCase):
    """Run the CLI against config and YAML files stored in many codecs."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # The workspace files are encoded with the test-only codecs too.
        register_test_codecs()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        unregister_test_codecs()

    def test_valid_encodings(self):
        """
        Every config encoding must lint every YAML encoding correctly.

        Each config enables ``key-ordering``, so a run over the correctly
        sorted files must exit with 0 and a run over the incorrectly sorted
        files must exit with a non-zero code.
        """
        conf = ('---\n'
                'rules:\n'
                ' key-ordering: enable\n')
        config_files = temp_workspace_with_files_in_many_codecs(
            'config_{}.yaml',
            conf
        )
        sorted_correctly = ('---\n'
                            'A: YAML\n'
                            'Z: YAML\n')
        sorted_correctly_files = temp_workspace_with_files_in_many_codecs(
            'sorted_correctly/{}.yaml',
            sorted_correctly
        )
        sorted_incorrectly = ('---\n'
                              'Z: YAML\n'
                              'A: YAML\n')
        sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs(
            'sorted_incorrectly/{}.yaml',
            sorted_incorrectly
        )
        workspace = {
            **config_files,
            **sorted_correctly_files,
            **sorted_incorrectly_files
        }

        with temp_workspace(workspace):
            for config_path in config_files:
                # subTest makes a failure report name the config file (and
                # therefore the codec) that triggered it.
                with self.subTest(config=config_path):
                    with RunContext(self) as ctx:
                        cli.run(('-c', config_path, 'sorted_correctly/'))
                    self.assertEqual(ctx.returncode, 0)
                    with RunContext(self) as ctx:
                        cli.run(('-c', config_path, 'sorted_incorrectly/'))
                    self.assertNotEqual(ctx.returncode, 0)
Loading
Loading