Skip to content

Commit

Permalink
Fix for some CRAM 3.1 codecs failing to parse (#144)
Browse files Browse the repository at this point in the history
  • Loading branch information
cmdcolin authored Nov 8, 2024
1 parent 6f20f69 commit bebd8e8
Show file tree
Hide file tree
Showing 28 changed files with 5,438 additions and 819 deletions.
9 changes: 8 additions & 1 deletion eslint.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,14 @@ import tseslint from 'typescript-eslint'

export default tseslint.config(
{
ignores: ['esm/**/*', 'dist/**/*', '*.js', '*.mjs', 'example/*'],
ignores: [
'esm/**/*',
'dist/**/*',
'*.js',
'*.mjs',
'example/*',
'src/htscodecs',
],
},
{
languageOptions: {
Expand Down
1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
"biojs"
],
"dependencies": {
"@jkbonfield/htscodecs": "^0.5.1",
"bzip2": "^0.1.1",
"crc": "^4.3.2",
"long": "^4.0.0",
Expand Down
18 changes: 8 additions & 10 deletions src/cramFile/file.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { Buffer } from 'buffer'
import crc32 from 'crc/crc32'
import QuickLRU from 'quick-lru'
import htscodecs from '@jkbonfield/htscodecs'
import htscodecs from '../htscodecs'
import bzip2 from 'bzip2'
import { XzReadableStream } from 'xz-decompress'
import { CramMalformedError, CramUnimplementedError } from '../errors'
Expand Down Expand Up @@ -137,12 +137,8 @@ export default class CramFile {
return parseHeaderText('')
}
const content = firstBlock.content
// find the end of the trailing zeros in the header text
const headerLength = content.readInt32LE(0)
const textStart = 4
// let textEnd = content.length - 1
// while (textEnd >= textStart && !content[textEnd]) textEnd -= 1
// trim off the trailing zeros
const text = content.toString('utf8', textStart, textStart + headerLength)
this.header = text
return parseHeaderText(text)
Expand Down Expand Up @@ -189,7 +185,8 @@ export default class CramFile {
position = block._endPosition
}
} else {
// otherwise, just traverse to the next container using the container's length
// otherwise, just traverse to the next container using the container's
// length
position += currentHeader._size + currentHeader.length
}
}
Expand Down Expand Up @@ -230,9 +227,9 @@ export default class CramFile {
if (!currentHeader) {
break
}
// if this is the first container, read all the blocks in the
// container, because we cannot trust the container
// header's given length due to a bug somewhere in htslib
// if this is the first container, read all the blocks in the container,
// because we cannot trust the container header's given length due to a
// bug somewhere in htslib
if (containerCount === 0) {
position = currentHeader._endPosition
for (let j = 0; j < currentHeader.numBlocks; j++) {
Expand All @@ -243,7 +240,8 @@ export default class CramFile {
position = block._endPosition
}
} else {
// otherwise, just traverse to the next container using the container's length
// otherwise, just traverse to the next container using the container's
// length
position += currentHeader._size + currentHeader.length
}
containerCount += 1
Expand Down
1 change: 0 additions & 1 deletion src/cramFile/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,6 @@ export function tinyMemoize(_class: any, methodName: any) {
const res = method.call(this)
this[memoAttrName] = res
Promise.resolve(res).catch(() => {

delete this[memoAttrName]
})
}
Expand Down
142 changes: 142 additions & 0 deletions src/htscodecs/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Default goal: run the full codec test suite.
all: test

# CBIN:      directory searched for the optional C reference binaries
#            (rans4x8, rans4x16pr, arith_dynamic, fqzcomp_qual, tokenise_name3).
# NODE:      JavaScript interpreter used to run the main_*.js drivers.
# NODE_OPTS: extra flags passed to ${NODE} on every invocation.
CBIN=../build/tests
NODE=node
NODE_OPTS=--use-strict

# TESTS:  the per-codec test targets run by "make test" / "make check".
# CORPUS: root of the test data; override on the command line to use a
#         larger data set.
TESTS=test_r4x8 test_r4x16 test_arith test_fqzcomp test_tok3
CORPUS=../tests

# These targets are commands, not files; declare them phony so a stray file
# or directory with the same name cannot make them appear up to date.
.PHONY: all modules test check ${TESTS}

# npm packages required by the command line drivers.
modules: node_modules/bzip2 node_modules/minimist

node_modules/bzip2:
	mkdir -p node_modules
	npm install bzip2

node_modules/minimist:
	mkdir -p node_modules
	npm install minimist

# "test" and "check" are synonyms: install the modules, then run every codec
# test listed in ${TESTS}.
test check: modules ${TESTS}

# Check the rANS 4x8 driver (main_rans.js) against ${CORPUS}/dat/r4x8.
#
# For every corpus file the numeric suffix is taken as the encoding level and
# the name with that suffix removed (and "/q" rewritten to "/../q") as the
# uncompressed base file.  The expected digest is the md5 of the first
# tab-separated column of the base file with newlines removed.  Three checks
# are made against it:
#   1. decoding the predefined corpus stream;
#   2. encoding the base data at that level and decoding it again;
#   3. if ${CBIN}/rans4x8 exists, decoding the JS-encoded stream with it.
# Failures are reported with the target name ($@) but do not abort make.
# Scratch files "_" and "_nonl" are written to the current directory.
test_r4x8:
	@echo
	@echo === Checking r4x8
	@if [ ! -e ${CBIN}/rans4x8 ]; then echo "Set CBIN if you wish to validate against C version"; fi
	@for i in ${CORPUS}/dat/r4x8/*; do \
	    echo $$i;\
	    base=`echo $$i | sed 's/\.[0-9]*$$//;s#/q#/../q#'`; \
	    level=`echo $$i | sed 's/.*\.//'`;\
	    # Decode predefined data set\
	    a=`${NODE} ${NODE_OPTS} main_rans.js -d -r $$i 2>/dev/null | md5sum`; \
	    b=`cut -f 1 < $$base | tr -d '\012' | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Mismatch for $$i; \
	    # Round trip. \
	    cut -f 1 < $$base | tr -d '\012' > _nonl; \
	    ${NODE} ${NODE_OPTS} main_rans.js -o $$level -r _nonl 2>/dev/null > _; \
	    a=`${NODE} ${NODE_OPTS} main_rans.js -d -r _ 2>/dev/null | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Fail round-trip for $$base with level $$level; \
	    if [ -e ${CBIN}/rans4x8 ]; \
	    then \
	        a=`${CBIN}/rans4x8 -d -r < _ 2>/dev/null | tr '\000' '\012' | md5sum`; \
	        test "$$a" = "$$b" || echo $@: Fail JS to C round-trip for $$base; \
	    fi; \
	done

# Check the rANS 4x16 driver (main_rans4x16.js) against ${CORPUS}/dat/r4x16.
#
# Same procedure as the other rANS test: for each corpus file, derive the
# encoding level from the numeric suffix and the base file from the name
# without it, then compare md5 digests of (1) the decoded corpus stream and
# (2) a JS encode/decode round trip against the first tab-separated column of
# the base file with newlines removed.  If ${CBIN}/rans4x16pr exists, the
# JS-encoded stream is also decoded with it.
# Failures are reported with the target name ($@) but do not abort make.
# Scratch files "_" and "_nonl" are written to the current directory.
test_r4x16:
	@echo
	@echo === Checking r4x16
	@if [ ! -e ${CBIN}/rans4x16pr ]; then echo "Set CBIN if you wish to validate against C version"; fi
	@for i in ${CORPUS}/dat/r4x16/*; do \
	    echo $$i;\
	    base=`echo $$i | sed 's/\.[0-9]*$$//;s#/q#/../q#'`; \
	    level=`echo $$i | sed 's/.*\.//'`;\
	    # Decode predefined data set\
	    a=`${NODE} ${NODE_OPTS} main_rans4x16.js -d -r $$i 2>/dev/null | md5sum`; \
	    b=`cut -f 1 < $$base | tr -d '\012' | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Mismatch for $$i; \
	    # Round trip. \
	    cut -f 1 < $$base | tr -d '\012' > _nonl; \
	    ${NODE} ${NODE_OPTS} main_rans4x16.js -o $$level -r _nonl 2>/dev/null > _; \
	    a=`${NODE} ${NODE_OPTS} main_rans4x16.js -d -r _ 2>/dev/null | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Fail round-trip for $$base with level $$level; \
	    if [ -e ${CBIN}/rans4x16pr ]; \
	    then \
	        a=`${CBIN}/rans4x16pr -d -r < _ 2>/dev/null | tr '\000' '\012' | md5sum`; \
	        test "$$a" = "$$b" || echo $@: Fail JS to C round-trip for $$base; \
	    fi; \
	done

# Check the arithmetic coder driver (main_arith_gen.js) against
# ${CORPUS}/dat/arith.
#
# For each corpus file, the numeric suffix is the encoding level and the name
# without it (with "/q" rewritten to "/../q") is the base file.  The md5 of
# the first tab-separated column of the base file, newlines removed, is
# compared with (1) the decoded corpus stream and (2) a JS encode/decode round
# trip.  If ${CBIN}/arith_dynamic exists, the JS-encoded stream is also
# decoded with it.
# Failures are reported with the target name ($@) but do not abort make.
# Scratch files "_" and "_nonl" are written to the current directory.
test_arith:
	@echo
	@echo === Checking arith
	@if [ ! -e ${CBIN}/arith_dynamic ]; then echo "Set CBIN if you wish to validate against C version"; fi
	@for i in ${CORPUS}/dat/arith/*; do \
	    echo $$i;\
	    base=`echo $$i | sed 's/\.[0-9]*$$//;s#/q#/../q#'`; \
	    level=`echo $$i | sed 's/.*\.//'`;\
	    # Decode predefined data set\
	    a=`${NODE} ${NODE_OPTS} main_arith_gen.js -d -r $$i 2>/dev/null | md5sum`; \
	    b=`cut -f 1 < $$base | tr -d '\012' | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Mismatch for $$i; \
	    # Round trip. \
	    cut -f 1 < $$base | tr -d '\012' > _nonl; \
	    ${NODE} ${NODE_OPTS} main_arith_gen.js -o $$level -r _nonl 2>/dev/null > _; \
	    a=`${NODE} ${NODE_OPTS} main_arith_gen.js -d -r _ 2>/dev/null | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Fail round-trip for $$base with level $$level; \
	    if [ -e ${CBIN}/arith_dynamic ]; \
	    then \
	        a=`${CBIN}/arith_dynamic -d -r < _ 2>/dev/null | tr '\000' '\012' | md5sum`; \
	        test "$$a" = "$$b" || echo $@: Fail JS to C round-trip for $$base; \
	    fi; \
	done

# Check the fqzcomp quality codec driver (main_fqzcomp.js) against the q*
# files in ${CORPUS}/dat/fqzcomp.
#
# For each corpus file, the numeric suffix is passed to the encoder via -s and
# the name without it (with "/q" rewritten to "/../q") is the base file.  The
# expected digest is the md5 of the first whitespace-separated field of every
# line of the base file (newlines kept).  It is compared with (1) the decoded
# corpus stream and (2) a JS encode/decode round trip of the base file.  If
# ${CBIN}/fqzcomp_qual exists, the JS-encoded stream is also decoded with it.
# Failures are reported with the target name ($@) but do not abort make.
# The scratch file "_" is written to the current directory.
test_fqzcomp:
	@echo
	@echo === Checking fqzcomp
	@if [ ! -e ${CBIN}/fqzcomp_qual ]; then echo "Set CBIN if you wish to validate against C version"; fi
	@for i in ${CORPUS}/dat/fqzcomp/q*; do \
	    echo $$i;\
	    base=`echo $$i | sed 's/\.[0-9]*$$//;s#/q#/../q#'`; \
	    level=`echo $$i | sed 's/.*\.//'`;\
	    # Decode predefined data set\
	    a=`${NODE} ${NODE_OPTS} main_fqzcomp.js -d -r $$i 2>/dev/null | md5sum`; \
	    b=`awk '{print $$1}' $$base | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Mismatch for $$i; \
	    # Round trip. \
	    ${NODE} ${NODE_OPTS} main_fqzcomp.js -s $$level -r $$base 2>/dev/null > _; \
	    a=`${NODE} ${NODE_OPTS} main_fqzcomp.js -d -r _ 2>/dev/null | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Fail round-trip for $$base with level $$level; \
	    if [ -e ${CBIN}/fqzcomp_qual ]; \
	    then \
	        a=`${CBIN}/fqzcomp_qual -d -r < _ 2>/dev/null | tr '\000' '\012' | md5sum`; \
	        test "$$a" = "$$b" || echo $@: Fail JS to C round-trip for $$base; \
	    fi; \
	done

# Check the tokenise_name3 read-name codec driver (main_tok3.js).
#
# First loop: for every ${CORPUS}/names/*.names file, encode it with the JS
# driver (flags "-a -r"), print the file name followed by the compressed size
# in bytes, decode the result and compare its md5 with that of the original
# file.  If ${CBIN}/tokenise_name3 exists, the JS-encoded stream is also
# decoded with it.
#
# Second loop: for every predefined stream in ${CORPUS}/names/tok3, decode it
# and compare against the base file, whose path is the stream's path with the
# numeric suffix and the "/tok3" directory component removed.
#
# Failures are reported with the target name ($@) but do not abort make.
# The scratch file "_" is written to the current directory.
test_tok3:
	@echo
	@echo === Checking tok3
	@if [ ! -e ${CBIN}/tokenise_name3 ]; then echo "Set CBIN if you wish to validate against C version"; fi
	@for base in ${CORPUS}/names/*.names; do \
	    echo -n "$$base ";\
	    ${NODE} ${NODE_OPTS} main_tok3.js -a -r $$base 2>/dev/null > _; \
	    a=`${NODE} ${NODE_OPTS} main_tok3.js -d -r _ 2>/dev/null | md5sum`; \
	    cat _ | wc -c;\
	    b=`cat $$base | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Fail round-trip for $$base; \
	    if [ -e ${CBIN}/tokenise_name3 ]; \
	    then \
	        a=`${CBIN}/tokenise_name3 -d -r < _ | tr '\000' '\012' | md5sum`; \
	        test "$$a" = "$$b" || echo $@: Fail JS to C round-trip for $$base; \
	    fi; \
	done;
	@for i in ${CORPUS}/names/tok3/*; do \
	    echo $$i;\
	    base=`echo $$i | sed 's/\.[0-9]*$$//;s#/tok3##'`; \
	    level=`echo $$i | sed 's/.*\.//'`;\
	    # Decode predefined data set\
	    a=`${NODE} ${NODE_OPTS} main_tok3.js -d -r $$i 2>/dev/null | md5sum`; \
	    b=`cat $$base | md5sum`; \
	    test "$$a" = "$$b" || echo $@: Mismatch for $$i; \
	done;
64 changes: 64 additions & 0 deletions src/htscodecs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Reference implementation files

This directory contains JavaScript implementations of the custom codecs used in
CRAM 3.1, capable of being run under node.js.

These are not written for speed, but for clarity and as an exercise in checking
the pseudocode in the CRAM specification. They are written as close to this
pseudocode as possible.

Prerequisites: minimist package for command line parsing and bzip2 for part of
the arith_gen.js code.

npm install minimist
npm install bzip2

## iostream.js

Makes a buffer appear to be a stream with ReadByte, ReadITF8, etc functions.

## rans.js

Implements the order-0 and order-1 rans (4x8) decoder as used in CRAM3.0.

## main_rans.js

A command line tool to exercise the rans.js code, included for debug purposes.

## rans4x16.js, main_rans4x16.js

A 16-bit renormalising variant of rANS above. This also includes transforms for
RLE, bit-packing and 4-way interleaving.

## arith_sh.js

Arithmetic (range) coding with Schindler carry handling.

## byte_model.js

An adaptive model for keeping track of symbol frequencies.

## arith_gen.js, main_arith_gen.js

Wrapper around arith_sh.js to perform order-0/1 encoding with RLE and
bit-packing, plus a debug command line tool.

## fqzcomp.js, main_fqzcomp.js

Implements the fqzcomp quality compression codec. Plus debug command line tool.

## tok3.js, main_tok3.js

Implements the tokenise_name3 read identifier compression codec. Plus debug
command line tool.

# Testing

The various main js files can be used for ad hoc testing. There is also a
Makefile which performs checks against known defined data streams and does
round-trip testing in both JavaScript and, if compiled, the C variant. You can
set the CORPUS make variable to a larger data set such as htscodecs-corpus.

eg.

make check CORPUS=../tests/htscodecs-corpus/
Loading

0 comments on commit bebd8e8

Please sign in to comment.