Skip to content

Commit

Permalink
No binary parser
Browse files: browse the repository at this point in the history
  • Loading branch information
cmdcolin committed Aug 1, 2024
1 parent e90915a commit 9c43f31
Show file tree
Hide file tree
Showing 23 changed files with 1,072 additions and 9,554 deletions.
7 changes: 6 additions & 1 deletion eslint.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,12 @@ export default [
ignoreRestSiblings: true,
},
],

'no-console': [
'warn',
{
allow: ['error', 'warn'],
},
],
'no-underscore-dangle': 0,
curly: 'error',
'@typescript-eslint/no-explicit-any': 0,
Expand Down
11 changes: 5 additions & 6 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
],
"dependencies": {
"@gmod/abortable-promise-cache": "^2.0.0",
"@gmod/binary-parser": "^1.3.5",
"@jkbonfield/htscodecs": "^0.5.1",
"buffer-crc32": "^1.0.0",
"bzip2": "^0.1.1",
Expand All @@ -59,18 +58,18 @@
"@types/long": "^4.0.0",
"@types/md5": "^2.3.2",
"@types/pako": "^1.0.3",
"@typescript-eslint/eslint-plugin": "^7.0.2",
"@typescript-eslint/parser": "^7.0.2",
"@typescript-eslint/eslint-plugin": "^8.0.0",
"@typescript-eslint/parser": "^8.0.0",
"buffer": "^6.0.3",
"documentation": "^14.0.3",
"eslint": "^9.0.0",
"eslint": "^9.8.0",
"eslint-config-prettier": "^9.0.0",
"eslint-plugin-prettier": "^5.1.3",
"eslint-plugin-unicorn": "^54.0.0",
"eslint-plugin-unicorn": "^55.0.0",
"jest": "^29.3.1",
"mock-fs": "^5.2.0",
"prettier": "^3.2.5",
"rimraf": "^5.0.1",
"rimraf": "^6.0.1",
"ts-jest": "^29.1.2",
"typescript": "^5.0.3",
"webpack": "^5.90.3",
Expand Down
6 changes: 0 additions & 6 deletions src/cramFile/codecs/byteArrayLength.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import { tinyMemoize } from '../util'

import CramCodec, { Cursors } from './_base'
import { ByteArrayLengthEncoding, CramEncoding } from '../encoding'
import CramSlice from '../slice'
Expand Down Expand Up @@ -71,7 +69,3 @@ export default class ByteArrayStopCodec extends CramCodec<
return this.instantiateCodec(encodingParams, 'byte')
}
}

'_getLengthCodec _getDataCodec'
.split(' ')
.forEach(method => tinyMemoize(ByteArrayStopCodec, method))
6 changes: 5 additions & 1 deletion src/cramFile/codecs/external.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,14 @@ export default class ExternalCodec extends CramCodec<
cursors: Cursors,
) {
const { blockContentId } = this.parameters
// console.log({
// blocksByContentId: Object.keys(blocksByContentId),
// blockContentId,
// })
const contentBlock = blocksByContentId[blockContentId]
if (!contentBlock) {
throw new CramMalformedError(
`no block found with content ID ${blockContentId}`,
`no block found with content ID ${blockContentId}: available ${Object.keys(blocksByContentId)}`,
)
}
const cursor = cursors.externalBlocks.getCursor(blockContentId)
Expand Down
3 changes: 2 additions & 1 deletion src/cramFile/codecs/huffman.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ export default class HuffmanIntCodec extends CramCodec<
this.buildCodes()
this.buildCaches()

// if this is a degenerate zero-length huffman code, special-case the decoding
// if this is a degenerate zero-length huffman code, special-case the
// decoding
if (this.sortedCodes[0].bitLength === 0) {
this._decode = this._decodeZeroLengthCode
}
Expand Down
9 changes: 1 addition & 8 deletions src/cramFile/container/compressionScheme.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { instantiateCodec } from '../codecs'
import CramCodec from '../codecs/_base'
import { CramCompressionHeader, CramPreservationMap } from '../sectionParsers'
import { CramCompressionHeader } from '../sectionParsers'
import { CramEncoding } from '../encoding'
import { CramMalformedError } from '../../errors'
import {
Expand Down Expand Up @@ -95,12 +95,8 @@ export default class CramContainerCompressionScheme {
public tagCodecCache: Record<string, CramCodec> = {}
public tagEncoding: Record<string, CramEncoding> = {}
public dataSeriesEncoding: DataSeriesEncodingMap
private preservation: CramPreservationMap
private _endPosition: number
private _size: number

constructor(content: CramCompressionHeader) {
// Object.assign(this, content)
// interpret some of the preservation map tags for convenient use
this.readNamesIncluded = content.preservation.RN
this.APdelta = content.preservation.AP
Expand All @@ -109,9 +105,6 @@ export default class CramContainerCompressionScheme {
this.substitutionMatrix = parseSubstitutionMatrix(content.preservation.SM)
this.dataSeriesEncoding = content.dataSeriesEncoding
this.tagEncoding = content.tagEncoding
this.preservation = content.preservation
this._size = content._size
this._endPosition = content._endPosition
}

/**
Expand Down
31 changes: 21 additions & 10 deletions src/cramFile/container/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,29 @@ import { itf8Size, parseItem, tinyMemoize } from '../util'
import CramSlice from '../slice'
import CramContainerCompressionScheme from './compressionScheme'
import CramFile from '../file'
import { getSectionParsers } from '../sectionParsers'

export default class CramContainer {
constructor(
public file: CramFile,
public filePosition: number,
) {}

// memoize
getHeader() {
return this._readContainerHeader(this.filePosition)
}

// memoize
async getCompressionHeaderBlock() {
const containerHeader = await this.getHeader()

// if there are no records in the container, there will be no compression header
if (!containerHeader.numRecords) {
// if there are no records in the container, there will be no compression
// header
if (!containerHeader?.numRecords) {
return null
}
const sectionParsers = await this.file.getSectionParsers()
const { majorVersion } = await this.file.getDefinition()
const sectionParsers = getSectionParsers(majorVersion)

const block = await this.getFirstBlock()
if (block === undefined) {
return undefined
Expand All @@ -34,6 +36,7 @@ export default class CramContainer {
`invalid content type ${block.contentType} in what is supposed to be the compression header block`,
)
}

const content = parseItem(
block.content,
sectionParsers.cramCompressionHeader.parser,
Expand All @@ -48,16 +51,20 @@ export default class CramContainer {

async getFirstBlock() {
const containerHeader = await this.getHeader()
if (!containerHeader) {
return undefined
}
return this.file.readBlock(containerHeader._endPosition)
}

// parses the compression header data into a CramContainerCompressionScheme object
// memoize
// parses the compression header data into a CramContainerCompressionScheme
// object
async getCompressionScheme() {
const header = await this.getCompressionHeaderBlock()
if (!header) {
return undefined
}

return new CramContainerCompressionScheme(header.parsedContent)
}

Expand All @@ -68,23 +75,27 @@ export default class CramContainer {
}

async _readContainerHeader(position: number) {
const sectionParsers = await this.file.getSectionParsers()
const { majorVersion } = await this.file.getDefinition()
const sectionParsers = getSectionParsers(majorVersion)
const { cramContainerHeader1, cramContainerHeader2 } = sectionParsers
const { size: fileSize } = await this.file.stat()

if (position >= fileSize) {
console.warn(
`position:${position}>=fileSize:${fileSize} in cram container`,
)
return undefined
}

// parse the container header. do it in 2 pieces because you cannot tell
// how much to buffer until you read numLandmarks
const bytes1 = Buffer.allocUnsafe(cramContainerHeader1.maxLength)
await this.file.read(bytes1, 0, cramContainerHeader1.maxLength, position)
const header1 = parseItem(bytes1, cramContainerHeader1.parser) as any
const header1 = parseItem(bytes1, cramContainerHeader1.parser)
const numLandmarksSize = itf8Size(header1.numLandmarks)
if (position + header1.length >= fileSize) {
console.warn(
`${this.file}: container header at ${position} indicates that the container has length ${header1.length}, which extends beyond the length of the file. Skipping this container.`,
`container header at ${position} indicates that the container has length ${header1.length}, which extends beyond the length of the file. Skipping this container.`,
)
return undefined
}
Expand Down
71 changes: 28 additions & 43 deletions src/cramFile/file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import { unzip } from '../unzip'
import crc32 from 'buffer-crc32'
import QuickLRU from 'quick-lru'
import htscodecs from '@jkbonfield/htscodecs'
import { Parser } from '@gmod/binary-parser'
// @ts-expect-error
import bzip2 from 'bzip2'
import { XzReadableStream } from 'xz-decompress'
Expand All @@ -11,7 +10,7 @@ import ransuncompress from '../rans'
import {
BlockHeader,
CompressionMethod,
cramFileDefinition as cramFileDefinitionParser,
cramFileDefinition,
getSectionParsers,
} from './sectionParsers'

Expand All @@ -32,25 +31,20 @@ function bufferToStream(buf: Buffer) {
})
}

//source:https://abdulapopoola.com/2019/01/20/check-endianness-with-javascript/
// source:https://abdulapopoola.com/2019/01/20/check-endianness-with-javascript/
function getEndianness() {
const uInt32 = new Uint32Array([0x11223344])
const uInt8 = new Uint8Array(uInt32.buffer)

if (uInt8[0] === 0x44) {
return 0 //little-endian
return 0 // little-endian
} else if (uInt8[0] === 0x11) {
return 1 //big-endian
return 1 // big-endian
} else {
return 2 //mixed-endian?
return 2 // mixed-endian?
}
}

// export type CramFileSource =
// | { url: string; path?: undefined; filehandle?: undefined }
// | { path: string; url?: undefined; filehandle?: undefined }
// | { filehandle: Filehandle; url?: undefined; path?: undefined }

export interface CramFileSource {
filehandle?: Filehandle
url?: string
Expand Down Expand Up @@ -110,15 +104,7 @@ export default class CramFile {
}

// can just read this object like a filehandle
read(
buffer: Buffer,
offset: number,
length: number,
position: number,
): Promise<{
bytesRead: number
buffer: Buffer
}> {
read(buffer: Buffer, offset: number, length: number, position: number) {
return this.file.read(buffer, offset, length, position)
}

Expand All @@ -129,10 +115,10 @@ export default class CramFile {

// memoized
async getDefinition() {
const headbytes = Buffer.allocUnsafe(cramFileDefinitionParser.maxLength)
await this.file.read(headbytes, 0, cramFileDefinitionParser.maxLength, 0)
const definition = cramFileDefinitionParser.parser.parse(headbytes)
.result as any
const { maxLength, parser } = cramFileDefinition()
const headbytes = Buffer.allocUnsafe(maxLength)
await this.file.read(headbytes, 0, maxLength, 0)
const definition = parser(headbytes).value
if (definition.majorVersion !== 2 && definition.majorVersion !== 3) {
throw new CramUnimplementedError(
`CRAM version ${definition.majorVersion} not supported`,
Expand Down Expand Up @@ -169,21 +155,16 @@ export default class CramFile {
return this.header
}

// memoize
async getSectionParsers() {
const { majorVersion } = await this.getDefinition()
return getSectionParsers(majorVersion)
}

async getContainerById(containerNumber: number) {
const sectionParsers = await this.getSectionParsers()
const { majorVersion } = await this.getDefinition()
const sectionParsers = getSectionParsers(majorVersion)
let position = sectionParsers.cramFileDefinition.maxLength
const { size: fileSize } = await this.file.stat()
const { cramContainerHeader1 } = sectionParsers

// skip with a series of reads to the proper container
let currentContainer
for (let i = 0; i <= containerNumber; i += 1) {
for (let i = 0; i <= containerNumber; i++) {
// if we are about to go off the end of the file
// and have not found that container, it does not exist
if (position + cramContainerHeader1.maxLength + 8 >= fileSize) {
Expand All @@ -197,13 +178,12 @@ export default class CramFile {
`container ${containerNumber} not found in file`,
)
}
// if this is the first container, read all the blocks in the
// container to determine its length, because we cannot trust
// the container header's given length due to a bug somewhere
// in htslib
// if this is the first container, read all the blocks in the container
// to determine its length, because we cannot trust the container
// header's given length due to a bug somewhere in htslib
if (i === 0) {
position = currentHeader._endPosition
for (let j = 0; j < currentHeader.numBlocks; j += 1) {
for (let j = 0; j < currentHeader.numBlocks; j++) {
const block = await this.readBlock(position)
if (block === undefined) {
return undefined
Expand Down Expand Up @@ -239,7 +219,8 @@ export default class CramFile {
* @returns {Promise<number | undefined>} the number of containers in the file, or undefined if it could not be determined
*/
async containerCount(): Promise<number | undefined> {
const sectionParsers = await this.getSectionParsers()
const { majorVersion } = await this.getDefinition()
const sectionParsers = getSectionParsers(majorVersion)
const { size: fileSize } = await this.file.stat()
const { cramContainerHeader1 } = sectionParsers

Expand All @@ -256,7 +237,7 @@ export default class CramFile {
// header's given length due to a bug somewhere in htslib
if (containerCount === 0) {
position = currentHeader._endPosition
for (let j = 0; j < currentHeader.numBlocks; j += 1) {
for (let j = 0; j < currentHeader.numBlocks; j++) {
const block = await this.readBlock(position)
if (block === undefined) {
return undefined
Expand All @@ -278,7 +259,8 @@ export default class CramFile {
}

async readBlockHeader(position: number) {
const sectionParsers = await this.getSectionParsers()
const { majorVersion } = await this.getDefinition()
const sectionParsers = getSectionParsers(majorVersion)
const { cramBlockHeader } = sectionParsers
const { size: fileSize } = await this.file.stat()

Expand All @@ -292,7 +274,10 @@ export default class CramFile {
}

async _parseSection<T>(
section: { parser: Parser<T>; maxLength: number },
section: {
maxLength: number
parser: (buffer: Buffer, offset: number) => { offset: number; value: T }
},
position: number,
size = section.maxLength,
preReadBuffer = undefined,
Expand Down Expand Up @@ -363,9 +348,9 @@ export default class CramFile {
}
}

async readBlock(position: number): Promise<CramFileBlock | undefined> {
async readBlock(position: number) {
const { majorVersion } = await this.getDefinition()
const sectionParsers = await this.getSectionParsers()
const sectionParsers = getSectionParsers(majorVersion)
const blockHeader = await this.readBlockHeader(position)
if (blockHeader === undefined) {
return undefined
Expand Down
2 changes: 1 addition & 1 deletion src/cramFile/record.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import Constants from './constants'
import CramContainerCompressionScheme from './container/compressionScheme'
import decodeRecord from './slice/decodeRecord'
import type decodeRecord from './slice/decodeRecord'

export interface RefRegion {
start: number
Expand Down
Loading

0 comments on commit 9c43f31

Please sign in to comment.