Skip to content

Commit

Permalink
Add lzma codec with xz-decompress package (#130)
Browse files Browse the repository at this point in the history
  • Loading branch information
cmdcolin authored Dec 1, 2023
1 parent baf2c1d commit 551fa9d
Show file tree
Hide file tree
Showing 30 changed files with 1,854 additions and 1,504 deletions.
11 changes: 3 additions & 8 deletions .eslintrc.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
{
"parser": "@typescript-eslint/parser",
"plugins": [
"prettier",
"@typescript-eslint"
],
"plugins": ["prettier", "@typescript-eslint"],
"extends": [
"plugin:@typescript-eslint/recommended",
"plugin:prettier/recommended"
Expand All @@ -12,12 +9,10 @@
"no-underscore-dangle": 0,
"curly": "error",
"@typescript-eslint/no-explicit-any": 0,
"@typescript-eslint/no-unused-vars": 0,
"@typescript-eslint/explicit-module-boundary-types": 0,
"@typescript-eslint/ban-ts-comment": 0,
"semi": [
"error",
"never"
],
"semi": ["error", "never"],
"prettier/prettier": [
"error",
{
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ on: push

jobs:
test:
name: Lint, build, and test on node 14.x and ubuntu-latest
name: Lint, build, and test on node 20.x and ubuntu-latest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Use Node.js 14.x
uses: actions/setup-node@v1
- uses: actions/checkout@v3
- name: Use Node.js 20.x
uses: actions/setup-node@v3
with:
node-version: 14.x
node-version: 20.x
- name: Install deps (with cache)
uses: bahmutov/npm-install@v1
- name: Lint codebase
Expand Down
4 changes: 2 additions & 2 deletions .prettierrc → .prettierrc.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"semi": false,
"trailingComma": "all",
"singleQuote": true,
"trailingComma": "all",
"arrowParens": "avoid",
"endOfLine": "auto"
"proseWrap": "always"
}
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@

# v1.6.3

- Optimize CRAM parsing slightly (15% improvement on many short reads). This removes support for big endian machines
- Optimize CRAM parsing slightly (15% improvement on many short reads). This
removes support for big-endian machines
- Publish src directory for sourceMap

# v1.6.2
Expand Down Expand Up @@ -93,7 +94,8 @@

# v1.4.3

- Make sure mate exists for unmated pair, can exist when coordinate slices of cram file are made via samtools view
- Make sure mate exists for unmated pair; unmated pairs can occur when
coordinate slices of a CRAM file are made via samtools view

# v1.4.2

Expand Down
276 changes: 180 additions & 96 deletions README.md

Large diffs are not rendered by default.

7 changes: 2 additions & 5 deletions jest.config.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
/** @type {import("ts-jest/dist/types").InitialOptionsTsJest} */
/** @type {import('ts-jest').JestConfigWithTsJest} */
module.exports = {
preset: 'ts-jest',
transform: {
'^.+\\.(ts|js)x?$': 'ts-jest',
},
testEnvironment: 'node',
transformIgnorePatterns: ['[/\\\\]node_modules[/\\\\](?!quick-lru/).+\\.js$'],
testTimeout: 1000,
}
17 changes: 9 additions & 8 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,34 +50,35 @@
"long": "^4.0.0",
"md5": "^2.2.1",
"pako": "^1.0.4",
"quick-lru": "^4.0.1"
"quick-lru": "^4.0.1",
"xz-decompress": "^0.2.1"
},
"devDependencies": {
"@babel/plugin-transform-modules-commonjs": "^7.18.2",
"@babel/preset-typescript": "^7.17.12",
"@gmod/indexedfasta": "^2.0.2",
"@gmod/indexedfasta": "^2.1.0",
"@types/buffer-crc32": "^0.2.2",
"@types/jest": "^29.2.4",
"@types/long": "^4.0.2",
"@types/md5": "^2.3.2",
"@types/pako": "^1.0.3",
"@typescript-eslint/eslint-plugin": "^5.46.1",
"@typescript-eslint/parser": "^5.46.1",
"@typescript-eslint/eslint-plugin": "^6.9.1",
"@typescript-eslint/parser": "^6.9.1",
"buffer": "^6.0.3",
"documentation": "^14.0.1",
"eslint": "^8.30.0",
"eslint-config-prettier": "^8.5.0",
"eslint-config-prettier": "^9.0.0",
"eslint-plugin-import": "^2.25.4",
"eslint-plugin-prettier": "^4.0.0",
"eslint-plugin-prettier": "^5.0.1",
"glob": "^10.3.1",
"jest": "^29.3.1",
"mock-fs": "^5.2.0",
"prettier": "^2.8.1",
"prettier": "^3.0.3",
"rimraf": "^5.0.1",
"ts-jest": "^29.0.3",
"typescript": "^5.0.3",
"url": "^0.11.0",
"webpack": "5.88.1",
"webpack": "5.89.0",
"webpack-cli": "^5.0.1"
},
"publishConfig": {
Expand Down
5 changes: 4 additions & 1 deletion src/cramFile/container/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ import CramContainerCompressionScheme from './compressionScheme'
import CramFile from '../file'

export default class CramContainer {
constructor(public file: CramFile, public filePosition: number) {}
constructor(
public file: CramFile,
public filePosition: number,
) {}

// memoize
getHeader() {
Expand Down
42 changes: 23 additions & 19 deletions src/cramFile/file.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import { unzip } from '../unzip'
import crc32 from 'buffer-crc32'
import QuickLRU from 'quick-lru'
import htscodecs from '@jkbonfield/htscodecs'
import { Parser } from '@gmod/binary-parser'
// @ts-expect-error
import bzip2 from 'bzip2'

import { XzReadableStream } from 'xz-decompress'
import { CramMalformedError, CramUnimplementedError } from '../errors'
import ransuncompress from '../rans'
import {
Expand All @@ -12,16 +14,24 @@ import {
cramFileDefinition as cramFileDefinitionParser,
getSectionParsers,
} from './sectionParsers'
import htscodecs from '@jkbonfield/htscodecs'

import CramContainer from './container'

import { open } from '../io'
import { parseItem, tinyMemoize } from './util'
import { parseHeaderText } from '../sam'
import { Parser } from '@gmod/binary-parser'
import CramRecord from './record'
import { Filehandle } from './filehandle'

/**
 * Wraps an in-memory buffer in a ReadableStream.
 *
 * The stream enqueues `buf` as a single chunk when it starts and is then
 * closed, so a consumer reads exactly one chunk followed by end-of-stream.
 *
 * @param buf - the bytes to expose as a stream
 * @returns a ReadableStream that yields `buf` once and then ends
 */
function bufferToStream(buf: Buffer) {
  return new ReadableStream({
    start: ctrl => {
      // the whole payload is already in memory: emit it in one go and finish
      ctrl.enqueue(buf)
      ctrl.close()
    },
  })
}

//source:https://abdulapopoola.com/2019/01/20/check-endianness-with-javascript/
function getEndianness() {
const uInt32 = new Uint32Array([0x11223344])
Expand Down Expand Up @@ -99,17 +109,6 @@ export default class CramFile {
}
}

// toString() {
// if (this.file.filename) {
// return this.file.filename
// }
// if (this.file.url) {
// return this.file.url
// }
//
// return '(cram file)'
// }

// can just read this object like a filehandle
read(
buffer: Buffer,
Expand Down Expand Up @@ -247,9 +246,8 @@ export default class CramFile {
let containerCount = 0
let position = sectionParsers.cramFileDefinition.maxLength
while (position + cramContainerHeader1.maxLength + 8 < fileSize) {
const currentHeader = await this.getContainerAtPosition(
position,
).getHeader()
const currentHeader =
await this.getContainerAtPosition(position).getHeader()
if (!currentHeader) {
break
}
Expand Down Expand Up @@ -319,7 +317,7 @@ export default class CramFile {
return data
}

_uncompress(
async _uncompress(
compressionMethod: CompressionMethod,
inputBuffer: Buffer,
outputBuffer: Buffer,
Expand All @@ -340,6 +338,12 @@ export default class CramFile {
size -= chunk.length
}
} while (chunk != -1)
} else if (compressionMethod === 'lzma') {
const decompressedResponse = new Response(
new XzReadableStream(bufferToStream(inputBuffer)),
)
const ret = Buffer.from(await decompressedResponse.arrayBuffer())
ret.copy(outputBuffer)
} else if (compressionMethod === 'rans') {
ransuncompress(inputBuffer, outputBuffer)
//htscodecs r4x8 is slower, but compatible.
Expand Down Expand Up @@ -386,7 +390,7 @@ export default class CramFile {
blockContentPosition,
)

this._uncompress(
await this._uncompress(
blockHeader.compressionMethod,
compressedData,
uncompressedData,
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions src/rans/d04.js → src/rans/d04.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//@ts-nocheck
import { CramMalformedError } from '../errors'

import { TF_SHIFT } from './constants'
Expand Down
1 change: 1 addition & 0 deletions src/rans/d14.js → src/rans/d14.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//@ts-nocheck
import { TF_SHIFT } from './constants'
import Decoding from './decoding'

Expand Down
1 change: 1 addition & 0 deletions src/rans/decoding.js → src/rans/decoding.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//@ts-nocheck
import { CramMalformedError } from '../errors'

import { RANS_BYTE_L } from './constants'
Expand Down
1 change: 1 addition & 0 deletions src/rans/frequencies.js → src/rans/frequencies.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//@ts-nocheck
import { CramMalformedError } from '../errors'

import { TOTFREQ } from './constants'
Expand Down
1 change: 1 addition & 0 deletions src/rans/index.js → src/rans/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//@ts-nocheck
import { CramMalformedError } from '../errors'

import Decoding from './decoding'
Expand Down
23 changes: 23 additions & 0 deletions test/__snapshots__/compressions.test.ts.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP

exports[`bzip2 1`] = `
{
"code": "X",
"data": 1,
"pos": 51,
"ref": "A",
"refPos": 1050,
"sub": "G",
}
`;

exports[`lzma 1`] = `
{
"code": "X",
"data": 1,
"pos": 51,
"ref": "A",
"refPos": 1050,
"sub": "G",
}
`;
31 changes: 31 additions & 0 deletions test/compressions.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
//@ts-nocheck
import { testDataFile } from './lib/util'
import { dumpWholeFile } from './lib/dumpFile'
import { CramFile } from '../src/index'
import { FetchableSmallFasta } from './lib/fasta'

// Dumps the lzma-compressed hts-specs fixture (with ce.fa as the reference)
// and snapshots the first read feature found at
// fileData[2].data[1].features[0].
test('lzma', async () => {
  const reference = new FetchableSmallFasta(testDataFile('ce.fa'))
  const cram = new CramFile({
    filehandle: testDataFile('hts-specs/0903_comp_lzma.cram'),
    seqFetch: reference.fetch.bind(reference),
  })
  const dumped = await dumpWholeFile(cram)
  const record = dumped[2].data[1].features[0]
  const firstReadFeature = record.readFeatures[0]
  expect(firstReadFeature).toMatchSnapshot()
})

// Dumps the bzip2-compressed hts-specs fixture (with ce.fa as the reference)
// and snapshots the first read feature found at
// fileData[2].data[1].features[0].
test('bzip2', async () => {
  const reference = new FetchableSmallFasta(testDataFile('ce.fa'))
  const cram = new CramFile({
    filehandle: testDataFile('hts-specs/0902_comp_bz2.cram'),
    seqFetch: reference.fetch.bind(reference),
  })
  const dumped = await dumpWholeFile(cram)
  const record = dumped[2].data[1].features[0]
  const firstReadFeature = record.readFeatures[0]
  expect(firstReadFeature).toMatchSnapshot()
})
Binary file added test/data/hts-specs/0902_comp_bz2.cram
Binary file not shown.
7 changes: 7 additions & 0 deletions test/data/hts-specs/0902_comp_bz2.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd UR:/nfs/users/nfs_j/jkb/work/samtools_master/hts-specs/test/CRAM/passed/ce.fa
@RG ID:rg SM:test
@RG ID:rg2 SM:test
r1 99 CHROMOSOME_I 1000 40 100M = 1200 300 ATTTTTCGGGTTTTTTGAAATGAATATCGTAGCTACAGAAACGGTTGTGCGNGCATCTGAAAGTTTGTTTTTCTTGTTTTCTTGCACTTTGTGCAGAATT #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg
r1 147 CHROMOSOME_I 1200 40 100M = 1000 -300 TTTTTTTAGAAAAATTATTTTTAAGAATTTTTCATTTTAGGAATATTGTTCNCTCAGAAAATAGCTAAATGTGATTTCTGTAATTTTGCCTGCCAAATTC #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg
r2 99 CHROMOSOME_I 1000 40 100M = 1200 300 ATTTTTCGGGTTTTTTGAAATGAATATCGTAGCTACAGAAACGGTTGTGCGNGCATCTGAAAGTTTGTTTTTCTTGTTTTCTTGCACTTTGTGCAGAATT #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg2
r2 147 CHROMOSOME_I 1200 40 100M = 1000 -300 TTTTTTTAGAAAAATTATTTTTAAGAATTTTTCATTTTAGGAATATTGTTCNCTCAGAAAATAGCTAAATGTGATTTCTGTAATTTTGCCTGCCAAATTC #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg2
Binary file added test/data/hts-specs/0903_comp_lzma.cram
Binary file not shown.
7 changes: 7 additions & 0 deletions test/data/hts-specs/0903_comp_lzma.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd UR:/nfs/users/nfs_j/jkb/work/samtools_master/hts-specs/test/CRAM/passed/ce.fa
@RG ID:rg SM:test
@RG ID:rg2 SM:test
r1 99 CHROMOSOME_I 1000 40 100M = 1200 300 ATTTTTCGGGTTTTTTGAAATGAATATCGTAGCTACAGAAACGGTTGTGCGNGCATCTGAAAGTTTGTTTTTCTTGTTTTCTTGCACTTTGTGCAGAATT #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg
r1 147 CHROMOSOME_I 1200 40 100M = 1000 -300 TTTTTTTAGAAAAATTATTTTTAAGAATTTTTCATTTTAGGAATATTGTTCNCTCAGAAAATAGCTAAATGTGATTTCTGTAATTTTGCCTGCCAAATTC #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg
r2 99 CHROMOSOME_I 1000 40 100M = 1200 300 ATTTTTCGGGTTTTTTGAAATGAATATCGTAGCTACAGAAACGGTTGTGCGNGCATCTGAAAGTTTGTTTTTCTTGTTTTCTTGCACTTTGTGCAGAATT #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg2
r2 147 CHROMOSOME_I 1200 40 100M = 1000 -300 TTTTTTTAGAAAAATTATTTTTAAGAATTTTTCATTTTAGGAATATTGTTCNCTCAGAAAATAGCTAAATGTGATTTCTGTAATTTTGCCTGCCAAATTC #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC RG:Z:rg2
33 changes: 15 additions & 18 deletions test/dump.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,22 @@ describe('dumping cram files', () => {
}, 10000)
})
})

describe('works with hard clipping', () => {
it('hard clipped volvox data file', async () => {
const fasta = new FetchableSmallFasta(testDataFile('volvox.fa'))
const seqFetch = fasta.fetch.bind(fasta)
const file = new CramFile({
filehandle: testDataFile('hard_clipping.cram'),
seqFetch,
})
const fileData = await dumpWholeFile(file)
const feat = fileData[2].data[1].features[0]
const hardClip = feat.readFeatures[0]
const nextReadFeature = feat.readFeatures[0]
expect(hardClip.refPos).toEqual(737)
expect(nextReadFeature.refPos).toEqual(737)
expect(hardClip.refPos).toEqual(feat.alignmentStart)
expect(hardClip.pos).toEqual(1)
expect(hardClip.data).toEqual(803)
// Dumps hard_clipping.cram (using volvox.fa as the reference) and checks the
// refPos, pos and data fields of the first read feature of the record at
// fileData[2].data[1].features[0].
test('works with hard clipping', async () => {
const fasta = new FetchableSmallFasta(testDataFile('volvox.fa'))
const seqFetch = fasta.fetch.bind(fasta)
const file = new CramFile({
filehandle: testDataFile('hard_clipping.cram'),
seqFetch,
})
const fileData = await dumpWholeFile(file)
const feat = fileData[2].data[1].features[0]
const hardClip = feat.readFeatures[0]
// NOTE(review): this reads index 0 again, so nextReadFeature is the same
// element as hardClip and its refPos assertion below repeats the one above —
// confirm whether readFeatures[1] was intended.
const nextReadFeature = feat.readFeatures[0]
expect(hardClip.refPos).toEqual(737)
expect(nextReadFeature.refPos).toEqual(737)
// the feature's reference position is expected to coincide with the
// record's alignmentStart
expect(hardClip.refPos).toEqual(feat.alignmentStart)
expect(hardClip.pos).toEqual(1)
expect(hardClip.data).toEqual(803)
})

function isIterable(input) {
Expand Down
1 change: 1 addition & 0 deletions test/lib/dumpFile.js → test/lib/dumpFile.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//@ts-nocheck
async function dumpSlice(container, sliceOffset) {
const slice = container.getSlice(sliceOffset)
const header = await slice.getHeader()
Expand Down
Loading

0 comments on commit 551fa9d

Please sign in to comment.