adrienverge · Jayman2000 · Jan 2, 2024 · Jan 3, 2024 · Dec 30, 2023 · Dec 31, 2023
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -57,6 +57,13 @@ jobs:
       - run: pip install .
       # https://github.com/AndreMiras/coveralls-python-action/issues/18
       - run: echo -e "[run]\nrelative_files = True" > .coveragerc
-      - run: coverage run -m unittest discover
+      - run: >-
+          python
+          -X warn_default_encoding
+          -W error::EncodingWarning
+          -m coverage
+          run
+          -m unittest
+          discover
       - name: Coveralls
         uses: AndreMiras/coveralls-python-action@develop
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -228,6 +228,10 @@ or:
 
 .. note:: However, this is mutually exclusive with the ``ignore`` key.
 
+.. note:: Files listed under ``ignore-from-file`` must be encoded in UTF-8,
+   UTF-16 or UTF-32. Additionally, they must start with either an ASCII
+   character or a byte order mark.
+
 If you need to know the exact list of files that yamllint would process,
 without really linting them, you can use ``--list-files``:
 

diff --git a/tests/common.py b/tests/common.py
@@ -13,20 +13,169 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
+import codecs
 import contextlib
 from io import StringIO
 import os
 import shutil
 import sys
 import tempfile
 import unittest
+import warnings
+from codecs import CodecInfo
 
 import yaml
 
 from yamllint import linter
 from yamllint.config import YamlLintConfig
 
 
+# Encoding related stuff:
+UTF_CODECS = (  # codecs tried by temp_workspace_with_files_in_many_codecs()
+    'utf_32_be',
+    'utf_32_be_sig',  # test-only codec, see test_codec_infos
+    'utf_32_le',
+    'utf_32_le_sig',  # test-only codec, see test_codec_infos
+    'utf_16_be',
+    'utf_16_be_sig',  # test-only codec, see test_codec_infos
+    'utf_16_le',
+    'utf_16_le_sig',  # test-only codec, see test_codec_infos
+    'utf_8',
+    'utf_8_sig'
+)
+
+
+def encode_utf_32_be_sig(obj, errors='strict'):
+    """Encode obj as big-endian UTF-32 with a leading byte order mark."""
+    # Codec encoders return (encoded bytes, number of characters consumed).
+    encoded = codecs.encode(obj, 'utf_32_be', errors)
+    return codecs.BOM_UTF32_BE + encoded, len(obj)
+
+
+def encode_utf_32_le_sig(obj, errors='strict'):
+    """Encode obj as little-endian UTF-32 with a leading byte order mark."""
+    # Codec encoders return (encoded bytes, number of characters consumed).
+    encoded = codecs.encode(obj, 'utf_32_le', errors)
+    return codecs.BOM_UTF32_LE + encoded, len(obj)
+
+
+def encode_utf_16_be_sig(obj, errors='strict'):
+    """Encode obj as big-endian UTF-16 with a leading byte order mark."""
+    # Codec encoders return (encoded bytes, number of characters consumed).
+    encoded = codecs.encode(obj, 'utf_16_be', errors)
+    return codecs.BOM_UTF16_BE + encoded, len(obj)
+
+
+def encode_utf_16_le_sig(obj, errors='strict'):
+    """Encode obj as little-endian UTF-16 with a leading byte order mark."""
+    # Codec encoders return (encoded bytes, number of characters consumed).
+    encoded = codecs.encode(obj, 'utf_16_le', errors)
+    return codecs.BOM_UTF16_LE + encoded, len(obj)
+
+
+test_codec_infos = {  # name -> CodecInfo (built-in decoders, custom encoders)
+    'utf_32_be_sig': CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')),  # noqa: E501
+    'utf_32_le_sig': CodecInfo(encode_utf_32_le_sig, codecs.getdecoder('utf_32')),  # noqa: E501
+    'utf_16_be_sig': CodecInfo(encode_utf_16_be_sig, codecs.getdecoder('utf_16')),  # noqa: E501
+    'utf_16_le_sig': CodecInfo(encode_utf_16_le_sig, codecs.getdecoder('utf_16')),  # noqa: E501
+}
+
+
+def register_test_codecs():
+    codecs.register(test_codec_infos.get)  # dict.get as search function
+
+
+def unregister_test_codecs():
+    """Undo register_test_codecs(); before Python 3.10 only warn."""
+    if sys.version_info < (3, 10, 0):
+        msg = "This version of Python doesn’t allow us to unregister codecs."
+        warnings.warn(msg, stacklevel=1)
+        return
+    # codecs.unregister() was added in Python 3.10.
+    codecs.unregister(test_codec_infos.get)
+
+
+def is_test_codec(codec):
+    return codec in test_codec_infos  # membership among the dict's keys
+
+
+def test_codec_built_in_equivalent(test_codec):
+    """Map a test codec name such as utf_16_le_sig to its base: utf_16."""
+    without_sig = test_codec.replace('_sig', '')
+    # Drop the endianness marker as well.
+    return without_sig.replace('_be', '').replace('_le', '')
+
+
+def uses_bom(codec):
+    """Return True if the codec name ends in _32, _16 or _sig."""
+    # Those are the names whose encoders write a byte order mark:
+    # endianness-agnostic UTF-16/UTF-32 and every *_sig variant.
+    return codec.endswith(('_32', '_16', '_sig'))
+
+
+def encoding_detectable(string, codec):
+    """Tell whether string's encoding is detectable once encoded with codec.
+
+    Needs a BOM or an ASCII first character; see yamllint.decoder.auto_decode.
+    """
+    if uses_bom(codec):
+        return True
+    return len(string) > 0 and string[0].isascii()
+
+
+# Workspace related stuff:
+class Blob:
+    """Text plus the codec build_temp_workspace() should encode it with."""
+    def __init__(self, text, encoding):
+        self.text, self.encoding = text, encoding
+
+
+def build_temp_workspace(files):
+    """Create a temp directory populated from files and return its path."""
+    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
+    for name, content in files.items():
+        path = os.fsencode(os.path.join(tempdir, name))
+        parent = os.path.dirname(path)
+        if not os.path.exists(parent):
+            os.makedirs(parent)
+        if isinstance(content, list):  # a list stands for a directory
+            os.mkdir(path)
+            continue
+        if isinstance(content, str) and content.startswith('symlink://'):
+            os.symlink(content[10:], path)
+            continue
+        if isinstance(content, Blob):  # text in an explicit encoding
+            content = content.text.encode(content.encoding)
+        elif isinstance(content, str):  # plain str is written as UTF-8
+            content = content.encode('utf_8')
+        with open(path, 'wb') as f:  # anything else must already be bytes
+            f.write(content)
+    return tempdir
+
+
+@contextlib.contextmanager
+def temp_workspace(files):
+    """Run the with-block inside a throwaway workspace built from files."""
+    previous_cwd = os.getcwd()
+    workspace = build_temp_workspace(files)
+    try:
+        os.chdir(workspace)
+        yield
+    finally:
+        # Leave the directory before deleting it, even if the body raised.
+        os.chdir(previous_cwd)
+        shutil.rmtree(workspace)
+
+
+def temp_workspace_with_files_in_many_codecs(path_template, text):
+    """Map path_template.format(codec) to a Blob per detectable codec."""
+    return {
+        path_template.format(codec): Blob(text, codec)
+        for codec in UTF_CODECS if encoding_detectable(text, codec)
+    }
+
+
+# Miscellaneous stuff:
 class RuleTestCase(unittest.TestCase):
     def build_fake_config(self, conf):
         if conf is None:
@@ -81,37 +230,3 @@ def __exit__(self, *exc_info):
     @property
     def returncode(self):
         return self._raises_ctx.exception.code
-
-
-def build_temp_workspace(files):
-    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
-
-    for path, content in files.items():
-        path = os.path.join(tempdir, path).encode('utf-8')
-        if not os.path.exists(os.path.dirname(path)):
-            os.makedirs(os.path.dirname(path))
-
-        if isinstance(content, list):
-            os.mkdir(path)
-        elif isinstance(content, str) and content.startswith('symlink://'):
-            os.symlink(content[10:], path)
-        else:
-            mode = 'wb' if isinstance(content, bytes) else 'w'
-            with open(path, mode) as f:
-                f.write(content)
-
-    return tempdir
-
-
-@contextlib.contextmanager
-def temp_workspace(files):
-    """Provide a temporary workspace that is automatically cleaned up."""
-    backup_wd = os.getcwd()
-    wd = build_temp_workspace(files)
-
-    try:
-        os.chdir(wd)
-        yield
-    finally:
-        os.chdir(backup_wd)
-        shutil.rmtree(wd)
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -23,7 +23,14 @@
 import unittest
 from io import StringIO
 
-from tests.common import build_temp_workspace, RunContext, temp_workspace
+from tests.common import (
+    build_temp_workspace,
+    register_test_codecs,
+    RunContext,
+    temp_workspace,
+    unregister_test_codecs,
+    temp_workspace_with_files_in_many_codecs,
+)
 
 from yamllint import cli, config
 
@@ -296,14 +303,14 @@ def test_run_with_implicit_extends_config(self):
             (ctx.returncode, ctx.stdout, ctx.stderr), (0, expected_out, ''))
 
     def test_run_with_config_file(self):
-        with open(os.path.join(self.wd, 'config'), 'w') as f:
+        with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f:
             f.write('rules: {trailing-spaces: disable}')
 
         with RunContext(self) as ctx:
             cli.run(('-c', f.name, os.path.join(self.wd, 'a.yaml')))
         self.assertEqual(ctx.returncode, 0)
 
-        with open(os.path.join(self.wd, 'config'), 'w') as f:
+        with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f:
             f.write('rules: {trailing-spaces: enable}')
 
         with RunContext(self) as ctx:
@@ -319,14 +326,14 @@ def test_run_with_user_global_config_file(self):
         self.addCleanup(os.environ.__delitem__, 'HOME')
         os.environ['HOME'] = home
 
-        with open(config, 'w') as f:
+        with open(config, 'w', encoding='utf_8') as f:
             f.write('rules: {trailing-spaces: disable}')
 
         with RunContext(self) as ctx:
             cli.run((os.path.join(self.wd, 'a.yaml'), ))
         self.assertEqual(ctx.returncode, 0)
 
-        with open(config, 'w') as f:
+        with open(config, 'w', encoding='utf_8') as f:
             f.write('rules: {trailing-spaces: enable}')
 
         with RunContext(self) as ctx:
@@ -339,7 +346,8 @@ def test_run_with_user_xdg_config_home_in_env(self):
         with tempfile.TemporaryDirectory('w') as d:
             os.environ['XDG_CONFIG_HOME'] = d
             os.makedirs(os.path.join(d, 'yamllint'))
-            with open(os.path.join(d, 'yamllint', 'config'), 'w') as f:
+            path = os.path.join(d, 'yamllint', 'config')
+            with open(path, 'w', encoding='utf_8') as f:
                 f.write('extends: relaxed')
             with RunContext(self) as ctx:
                 cli.run(('-f', 'parsable', os.path.join(self.wd, 'warn.yaml')))
@@ -349,15 +357,15 @@ def test_run_with_user_xdg_config_home_in_env(self):
     def test_run_with_user_yamllint_config_file_in_env(self):
         self.addCleanup(os.environ.__delitem__, 'YAMLLINT_CONFIG_FILE')
 
-        with tempfile.NamedTemporaryFile('w') as f:
+        with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
             os.environ['YAMLLINT_CONFIG_FILE'] = f.name
             f.write('rules: {trailing-spaces: disable}')
             f.flush()
             with RunContext(self) as ctx:
                 cli.run((os.path.join(self.wd, 'a.yaml'), ))
             self.assertEqual(ctx.returncode, 0)
 
-        with tempfile.NamedTemporaryFile('w') as f:
+        with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
             os.environ['YAMLLINT_CONFIG_FILE'] = f.name
             f.write('rules: {trailing-spaces: enable}')
             f.flush()
@@ -499,8 +507,13 @@ def test_run_default_format_output_in_tty(self):
         path = os.path.join(self.wd, 'a.yaml')
 
         # Create a pseudo-TTY and redirect stdout to it
+        old_stdout, old_stderr = sys.stdout, sys.stderr
         master, slave = pty.openpty()
-        sys.stdout = sys.stderr = os.fdopen(slave, 'w')
+        sys.stdout = sys.stderr = os.fdopen(
+            slave,
+            'w',
+            encoding=os.device_encoding(slave)
+        )
 
         with self.assertRaises(SystemExit) as ctx:
             cli.run((path, ))
@@ -509,7 +522,7 @@ def test_run_default_format_output_in_tty(self):
         self.assertEqual(ctx.exception.code, 1)
 
         # Read output from TTY
-        output = os.fdopen(master, 'r')
+        output = os.fdopen(master, 'r', encoding=os.device_encoding(master))
         flag = fcntl.fcntl(master, fcntl.F_GETFD)
         fcntl.fcntl(master, fcntl.F_SETFL, flag | os.O_NONBLOCK)
 
@@ -518,6 +531,7 @@ def test_run_default_format_output_in_tty(self):
         sys.stdout.close()
         sys.stderr.close()
         output.close()
+        sys.stdout, sys.stderr = old_stdout, old_stderr
 
         self.assertEqual(out, (
             f'\033[4m{path}\033[0m\n'
@@ -817,3 +831,52 @@ def test_multiple_parent_config_file(self):
         self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                          (0, './4spaces.yml:2:5: [warning] wrong indentation: '
                          'expected 3 but found 4 (indentation)\n', ''))
+
+
+class CommandLineEncodingTestCase(unittest.TestCase):
+    """Run the CLI on configs and YAML files stored in many UTF codecs."""
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        # Make the test-only codecs available for the whole class.
+        register_test_codecs()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        unregister_test_codecs()
+
+    def test_valid_encodings(self):
+        # conf turns on key-ordering; the two documents use fullwidth
+        # (non-ASCII) keys, once in sorted and once in unsorted order.
+        conf = '---\nrules:\n  key-ordering: enable\n'
+        sorted_correctly = '---\nＡ: YAML\nＺ: YAML\n'
+        sorted_incorrectly = '---\nＺ: YAML\nＡ: YAML\n'
+
+        # One copy of each text per codec returned by the helper.
+        config_files = temp_workspace_with_files_in_many_codecs(
+            'config_{}.yaml', conf
+        )
+        sorted_correctly_files = temp_workspace_with_files_in_many_codecs(
+            'sorted_correctly/{}.yaml', sorted_correctly
+        )
+        sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs(
+            'sorted_incorrectly/{}.yaml', sorted_incorrectly
+        )
+
+        workspace = {}
+        workspace.update(config_files)
+        workspace.update(sorted_correctly_files)
+        workspace.update(sorted_incorrectly_files)
+
+        with temp_workspace(workspace):
+            for config_path in config_files:
+                # Each config encoding must accept the sorted directory...
+                with RunContext(self) as sorted_run:
+                    cli.run(('-c', config_path, 'sorted_correctly/'))
+                self.assertEqual(sorted_run.returncode, 0)
+                # ...and reject the unsorted one.
+                with RunContext(self) as unsorted_run:
+                    cli.run(('-c', config_path, 'sorted_incorrectly/'))
+                self.assertNotEqual(unsorted_run.returncode, 0)