Skip to content

Commit

Permalink
Restore utf path
Browse files Browse the repository at this point in the history
  • Loading branch information
cmdcolin committed Nov 30, 2024
1 parent 7398b55 commit 196e962
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 28 deletions.
14 changes: 10 additions & 4 deletions eslint.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,17 @@ export default tseslint.config(
},
],

'no-underscore-dangle': 0,
'no-console': [
'warn',
{
allow: ['error', 'warn'],
},
],
'no-underscore-dangle': 'off',
curly: 'error',
'@typescript-eslint/no-explicit-any': 0,
'@typescript-eslint/explicit-module-boundary-types': 0,
'@typescript-eslint/ban-ts-comment': 0,
'@typescript-eslint/no-explicit-any': 'off',
'@typescript-eslint/explicit-module-boundary-types': 'off',
'@typescript-eslint/ban-ts-comment': 'off',
semi: ['error', 'never'],
'unicorn/no-new-array': 'off',
'unicorn/no-empty-file': 'off',
Expand Down
2 changes: 0 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
export { default as TabixIndexedFile } from './tabixIndexedFile'
export { default as CSI } from './csi'
export { default as TBI } from './tbi'

console.log('wow3')
117 changes: 95 additions & 22 deletions src/tabixIndexedFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,14 @@ import Chunk from './chunk'
import TBI from './tbi'
import CSI from './csi'

function isASCII(str: string) {
return /^[\u0000-\u007F]*$/.test(str)
// Report whether any byte of `buffer` lies outside the 7-bit ASCII range
// (> 0x7f). Callers use this to choose between the fast whole-string
// decode path and the slower byte-accurate line-by-line path.
function hasNonAscii(buffer: Buffer) {
  let idx = buffer.length
  // scan backwards; direction doesn't matter for an any-byte test, and an
  // index loop avoids iterator overhead on large buffers
  while (idx > 0) {
    idx -= 1
    if (buffer[idx]! > 0x7f) {
      return true
    }
  }
  return false
}

type GetLinesCallback = (line: string, fileOffset: number) => void
Expand All @@ -31,13 +37,9 @@ interface ReadChunk {
dpositions: number[]
}

// Return a promise that resolves (with undefined) after `time`
// milliseconds — lets long-running loops yield back to the event loop.
function timeout(time: number) {
  return new Promise(done => {
    setTimeout(() => {
      done(undefined)
    }, time)
  })
}
export default class TabixIndexedFile {
private filehandle: GenericFilehandle
private index: IndexFile
private yieldTime: number
private renameRefSeq: (n: string) => string
private chunkCache: AbortablePromiseCache<Chunk, ReadChunk>

Expand All @@ -62,9 +64,6 @@ export default class TabixIndexedFile {
*
* @param {tbiUrl} [args.tbiUrl]
*
* @param {number} [args.yieldTime] yield to main thread after N milliseconds
* if reading features is taking a long time to avoid hanging main thread
*
* @param {function} [args.renameRefSeqs] optional function with sig `string
* => string` to transform reference sequence names for the purpose of
* indexing and querying. note that the data that is returned is not altered,
Expand All @@ -80,7 +79,6 @@ export default class TabixIndexedFile {
csiPath,
csiUrl,
csiFilehandle,
yieldTime = 500,
renameRefSeqs = n => n,
chunkCacheSize = 5 * 2 ** 20,
}: {
Expand All @@ -93,7 +91,6 @@ export default class TabixIndexedFile {
csiPath?: string
csiUrl?: string
csiFilehandle?: GenericFilehandle
yieldTime?: number
renameRefSeqs?: (n: string) => string
chunkCacheSize?: number
}) {
Expand Down Expand Up @@ -151,7 +148,6 @@ export default class TabixIndexedFile {
}

this.renameRefSeq = renameRefSeqs
this.yieldTime = yieldTime
this.chunkCache = new AbortablePromiseCache<Chunk, ReadChunk>({
cache: new LRU({ maxSize: Math.floor(chunkCacheSize / (1 << 16)) }),
fill: (args: Chunk, signal?: AbortSignal) =>
Expand Down Expand Up @@ -207,7 +203,6 @@ export default class TabixIndexedFile {
checkAbortSignal(signal)

// now go through each chunk and parse and filter the lines out of it
let last = Date.now()
for (const c of chunks) {
let previousStartCoordinate: number | undefined
const { buffer, cpositions, dpositions } = await this.chunkCache.get(
Expand All @@ -220,8 +215,9 @@ export default class TabixIndexedFile {
let blockStart = 0
let pos = 0

const str = decoder?.decode(buffer) ?? buffer.toString()
if (isASCII(str)) {
// fast path, Buffer is just ASCII chars, process directly
if (!hasNonAscii(buffer)) {
const str = decoder?.decode(buffer) ?? buffer.toString()
while (blockStart < str.length) {
const n = str.indexOf('\n', blockStart)
if (n === -1) {
Expand Down Expand Up @@ -257,6 +253,69 @@ export default class TabixIndexedFile {
}
previousStartCoordinate = startCoordinate

if (overlaps) {
callback(
line,
// cpositions[pos] refers to actual file offset of a bgzip block boundaries
//
// we multiply by (1 << 8) in order to make sure each block has a "unique"
// address space so that data in that block could never overlap
//
// then the blockStart-dpositions is an uncompressed file offset from
// that bgzip block boundary, and since the cpositions are multiplied by
// (1 << 8) these uncompressed offsets get a unique space
cpositions[pos]! * (1 << 8) +
(blockStart - dpositions[pos]!) +
c.minv.dataPosition +
1,
)
} else if (startCoordinate !== undefined && startCoordinate >= end) {
// the lines were overlapping the region, but now have stopped, so
// we must be at the end of the relevant data and we can stop
// processing data now
return
}
blockStart = n + 1
}
} else {
// slower path: buffer contains non-ASCII (multi-byte UTF-8) characters,
// so process it line by line as bytes to keep byte offsets accurate
while (blockStart < buffer.length) {
const n = buffer.indexOf('\n', blockStart)
if (n === -1) {
break
}
const b = buffer.slice(blockStart, n)
const line = decoder?.decode(b) ?? b.toString()

// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
if (dpositions) {
while (blockStart + c.minv.dataPosition >= dpositions[pos++]!) {}
pos--
}

// filter the line for whether it is within the requested range
const { startCoordinate, overlaps } = this.checkLine(
metadata,
refName,
start,
end,
line,
)

// do a small check just to make sure that the lines are really
// sorted by start coordinate
if (
previousStartCoordinate !== undefined &&
startCoordinate !== undefined &&
previousStartCoordinate > startCoordinate
) {
throw new Error(
`Lines not sorted by start coordinate (${previousStartCoordinate} > ${startCoordinate}), this file is not usable with Tabix.`,
)
}
previousStartCoordinate = startCoordinate

if (overlaps) {
callback(
line,
Expand Down Expand Up @@ -401,15 +460,18 @@ export default class TabixIndexedFile {
let currentColumnStart = 0
let refSeq = ''
let startCoordinate = -Infinity
for (let i = 0; i < line.length + 1; i += 1) {
if (line[i] === '\t' || i === line.length) {
const l = line.length
for (let i = 0; i < l; i++) {
if (line[i] === '\t' || i === l) {
if (currentColumnNumber === ref) {
if (
regionRefName !== undefined &&
this.renameRefSeq(line.slice(currentColumnStart, i)) !==
regionRefName
) {
return { overlaps: false }
return {
overlaps: false,
}
}
} else if (currentColumnNumber === start) {
startCoordinate = parseInt(line.slice(currentColumnStart, i), 10)
Expand All @@ -418,12 +480,18 @@ export default class TabixIndexedFile {
startCoordinate -= 1
}
if (startCoordinate >= regionEnd) {
return { startCoordinate, overlaps: false }
return {
startCoordinate,
overlaps: false,
}
}
if (end === 0 || end === start) {
// if we have no end, we assume the feature is 1 bp long
if (startCoordinate + 1 <= regionStart) {
return { startCoordinate, overlaps: false }
return {
startCoordinate,
overlaps: false,
}
}
}
} else if (format === 'VCF' && currentColumnNumber === 4) {
Expand All @@ -439,7 +507,9 @@ export default class TabixIndexedFile {
)
: Number.parseInt(line.slice(currentColumnStart, i), 10)
if (endCoordinate <= regionStart) {
return { overlaps: false }
return {
overlaps: false,
}
}
}
currentColumnStart = i + 1
Expand All @@ -449,7 +519,10 @@ export default class TabixIndexedFile {
}
}
}
return { startCoordinate, overlaps: true }
return {
startCoordinate,
overlaps: true,
}
}

_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
Expand Down

0 comments on commit 196e962

Please sign in to comment.