Skip to content

Commit

Permalink
Merge pull request #43 from keboola/zajca-bom
Browse files Browse the repository at this point in the history
utf8 bom detection
  • Loading branch information
zajca authored Aug 26, 2021
2 parents eb5a835 + f99d35f commit efd0fae
Show file tree
Hide file tree
Showing 10 changed files with 102 additions and 4 deletions.
10 changes: 7 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@ php:
- 7.3
- 7.4

before_script:
env:
global:
- XDEBUG_MODE=coverage

before_script:
- composer install
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
- chmod +x ./cc-test-reporter
- export GIT_COMMIT_SHA=$TRAVIS_COMMIT
- export GIT_BRANCH=$TRAVIS_BRANCH
- export GIT_BRANCH=$TRAVIS_BRANCH
- ./cc-test-reporter before-build

script:
Expand All @@ -22,6 +26,6 @@ after_success:
- ./cc-test-reporter after-build --exit-code 0 --debug

notifications:
email: false
email: false
slack:
secure: WVnUU0fkZS75md3mm7B08SxhP3HDeHbJ8GTPR1DUVjK3MHAmKeSah/plNNxn9I/TdlXnHzQO5WBN33nUq0ODGGT4WFzFa66YTX2tb+bNSmewBOv82hEoITTI1PI9SLq0WNtcamHWCM3Rt1XtiZb3DQk/OcUfiWrrn74q4PPX+VY=
2 changes: 1 addition & 1 deletion src/CsvReader.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public function __construct(
$this->validateLineBreak();

rewind($this->filePointer);
$this->header = $this->readLine();
$this->header = UTF8BOMHelper::detectAndRemoveBOM($this->readLine());
$this->rewind();
}

Expand Down
37 changes: 37 additions & 0 deletions src/UTF8BOMHelper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?php

namespace Keboola\Csv;

class UTF8BOMHelper
{
    /**
     * Strips a leading Unicode byte order mark (BOM) from the first cell of a parsed CSV header row.
     *
     * UTF-32 (big/little endian), UTF-16 (big/little endian) and UTF-8 marks are recognised.
     * When one is found it is cut off the first cell and any double quotes surrounding the
     * remaining value are trimmed (presumably the BOM in front of the opening quote prevents
     * the CSV parser from recognising the enclosure, so the quotes survive parsing).
     *
     * Anything that is not an array, an empty array, or a row whose first cell is not a string
     * is returned unchanged.
     *
     * @param array|mixed $header parsed header row
     * @return array|mixed the header with the BOM removed from its first cell, or the input as-is
     */
    public static function detectAndRemoveBOM($header)
    {
        // Nothing to inspect unless there is a string in the first cell; this also avoids
        // an "undefined offset" notice for [] and passing null to string functions.
        if (!is_array($header) || !isset($header[0]) || !is_string($header[0])) {
            return $header;
        }

        // Order matters: the UTF-32 LE mark begins with the same two bytes as the UTF-16 LE
        // mark, so the longer UTF-32 marks have to be tested first.
        $bomStrings = [
            "\x00\x00\xFE\xFF", // UTF-32 big endian
            "\xFF\xFE\x00\x00", // UTF-32 little endian
            "\xFE\xFF",         // UTF-16 big endian
            "\xFF\xFE",         // UTF-16 little endian
            "\xEF\xBB\xBF",     // UTF-8
        ];

        foreach ($bomStrings as $bomString) {
            $bomLength = strlen($bomString);
            // Compare only the prefix instead of searching the whole cell.
            if (strncmp($header[0], $bomString, $bomLength) === 0) {
                $header[0] = trim(substr($header[0], $bomLength), '"');
                break;
            }
        }

        return $header;
    }
}
20 changes: 20 additions & 0 deletions tests/CsvReadTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,26 @@ public function testParseEscapedBy()
self::assertEquals($expected, iterator_to_array($csvFile));
}

/**
* @dataProvider bomProvider
*/
public function testUtf8BOM($bomFile)
{
    // Open the BOM-prefixed fixture named by the data provider and check that
    // the header reported by the reader is the plain column list.
    $fixturePath = __DIR__ . '/data/bom/' . $bomFile . '.csv';
    $reader = new CsvReader($fixturePath);

    $expectedHeader = ['id', 'name'];
    self::assertEquals($expectedHeader, $reader->getHeader());
}

public function bomProvider()
{
    // One data set per fixture under tests/data/bom; each set carries only the
    // fixture's base name (without the .csv extension).
    $dataSets = [];
    foreach ([
        'utf32BigEndianBom',
        'utf32LittleEndianBom',
        'utf16BigEndianBom',
        'utf16LittleEndianBom',
        'utf8Bom',
    ] as $fixtureName) {
        $dataSets[] = [$fixtureName];
    }

    return $dataSets;
}

public function testParseMacLineEndsInField()
{
$csvFile = new CsvReader(__DIR__ . '/data/test-input.lineBreaks.csv', ",", '"', '\\');
Expand Down
34 changes: 34 additions & 0 deletions tests/UTF8BOMHelperTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?php

namespace Keboola\Csv\Tests;

use Keboola\Csv\CsvReader;
use Keboola\Csv\UTF8BOMHelper;
use PHPUnit\Framework\TestCase;

class UTF8BOMHelperTest extends TestCase
{
    /**
     * Reads the first row of a BOM-prefixed fixture and verifies that the raw row differs
     * from the clean header while the helper's output is identical to it.
     *
     * @dataProvider bomProvider
     * @param string $bomFile base name of the fixture under data/bom
     */
    public function testDetectAndRemoveBOM($bomFile)
    {
        $expectedHeader = ['id', 'name'];

        $csv = new CsvReader(__DIR__ . '/data/bom/' . $bomFile . '.csv');
        $rawRow = $csv->current();

        // The untouched row still carries the BOM, so it must not match yet.
        $this->assertNotSame($expectedHeader, $rawRow);

        $cleanedRow = UTF8BOMHelper::detectAndRemoveBOM($rawRow);
        $this->assertSame($expectedHeader, $cleanedRow);
    }

    /**
     * Supplies one data set per BOM fixture file.
     *
     * @return array
     */
    public function bomProvider()
    {
        $fixtureNames = [
            'utf32BigEndianBom',
            'utf32LittleEndianBom',
            'utf16BigEndianBom',
            'utf16LittleEndianBom',
            'utf8Bom',
        ];

        return array_map(
            function ($fixtureName) {
                return [$fixtureName];
            },
            $fixtureNames
        );
    }
}
1 change: 1 addition & 0 deletions tests/data/bom/utf16BigEndianBom.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
��"id","name"
1 change: 1 addition & 0 deletions tests/data/bom/utf16LittleEndianBom.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
��"id","name"
Binary file added tests/data/bom/utf32BigEndianBom.csv
Binary file not shown.
Binary file added tests/data/bom/utf32LittleEndianBom.csv
Binary file not shown.
1 change: 1 addition & 0 deletions tests/data/bom/utf8Bom.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"id","name"

0 comments on commit efd0fae

Please sign in to comment.