-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine.js
executable file
·223 lines (194 loc) · 7.67 KB
/
combine.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
const yargs = require('yargs/yargs');
const { hideBin } = require('yargs/helpers');
/**
 * Collapses a piece of text onto a single, single-spaced line.
 * Leading and trailing whitespace is stripped, and every internal run of
 * whitespace (spaces, tabs, newlines) becomes exactly one space.
 * @param {string} text - Raw text to clean up.
 * @returns {string} The trimmed, whitespace-collapsed text.
 */
const normalizeContent = (text) => {
  const trimmed = text.trim();
  // After trimming there is no edge whitespace, so splitting on whitespace
  // runs and re-joining with one space collapses only the interior gaps.
  const words = trimmed.split(/\s+/);
  return words.join(' ');
};
/**
 * Decides whether a piece of content should be dropped because it matches
 * at least one skip filter.
 *
 * String filters match when they occur anywhere in the content; RegExp
 * filters match when the pattern is found anywhere in the content. Filters
 * of any other type never match.
 *
 * @param {string} content - The content to check.
 * @param {Array<string|RegExp>} filters - Substrings or patterns to look for.
 * @returns {boolean} True if any filter matches, false otherwise (including
 *   when `filters` is empty).
 */
const shouldSkipContent = (content, filters) => {
  return filters.some((filter) => {
    if (typeof filter === 'string') {
      return content.includes(filter);
    }
    if (filter instanceof RegExp) {
      // Use search() rather than filter.test(): test() on a global or sticky
      // regex resumes from the regex's lastIndex, so the same filter object
      // would alternately match and miss identical content across calls.
      // search() always scans from index 0 and leaves lastIndex untouched.
      return content.search(filter) !== -1;
    }
    return false;
  });
};
/**
 * Extracts the timestamped messages from the text of a VTT file.
 *
 * The text is split on `HH:MM:SS.mmm --> HH:MM:SS.mmm` timing lines; whatever
 * follows a timing line (up to the next one) is normalized and treated as
 * that cue's content. Cues are then dropped when they match a skip filter,
 * when the selected deduplication strategy marks them as repeats, or when
 * their normalized content is empty. A short report of the options used and
 * the resulting counts is written to the console.
 *
 * @param {string} fileContent - The content of the VTT file.
 * @param {string} dedupe - Deduplication strategy: "false" (keep everything),
 *   "consecutive" (drop a cue equal to the previously kept one) or "unique"
 *   (drop a cue equal to any previously kept one).
 * @param {Array} skipFilters - Strings or regex patterns; matching cues are skipped.
 * @param {string} filename - Name of the file being parsed (used for logging only).
 * @returns {Array} Array of `[ "start --> end", content ]` pairs in file order.
 */
const parseVTT = (fileContent, dedupe, skipFilters, filename) => {
  const cueTiming = /(\d{2}:\d{2}:\d{2}\.\d{3})\s-->\s(\d{2}:\d{2}:\d{2}\.\d{3})/;

  // Splitting on a regex with two capture groups yields
  // [preamble, start, end, text, start, end, text, ...]; drop the preamble.
  const [, ...pieces] = fileContent.split(cueTiming);

  const dedupeEnabled = dedupe !== "false";
  const filtersEnabled = skipFilters.length > 0;

  console.log(`\nProcessing ${filename} with the following options:`);
  if (dedupeEnabled) {
    console.log(` - Deduplication strategy: ${dedupe}`);
  }
  if (filtersEnabled) {
    console.log(` - Skip filters applied: ${skipFilters}`);
  }

  const messages = [];
  const seen = new Set();
  let previous = null;
  let processed = 0;
  let filtered = 0;
  let duplicates = 0;

  for (let offset = 0; offset < pieces.length; offset += 3) {
    const [start, end, rawText] = pieces.slice(offset, offset + 3);
    const text = normalizeContent(rawText);
    processed += 1;

    // Filters are applied before deduplication, so a filtered cue is never
    // remembered as "previous" or "seen".
    if (shouldSkipContent(text, skipFilters)) {
      filtered += 1;
      continue;
    }

    const repeatsPrevious = dedupe === "consecutive" && previous !== null && text === previous;
    const repeatsAny = dedupe === "unique" && seen.has(text);
    if (repeatsPrevious || repeatsAny) {
      duplicates += 1;
      continue;
    }

    // Cues whose content normalizes to an empty string are silently dropped.
    if (!text) {
      continue;
    }

    messages.push([`${start} --> ${end}`, text]);
    previous = text;
    seen.add(text);
  }

  console.log(` - ${processed} entries processed`);
  if (dedupeEnabled) {
    console.log(` - ${duplicates} duplicates removed`);
  }
  if (filtersEnabled) {
    console.log(` - ${filtered} skipped due to filters`);
  }
  console.log(` - ${messages.length} final messages detected`);

  return messages;
};
/**
 * Merges the cues of several VTT transcripts into one ordered transcript and
 * writes it to disk, optionally split across several files.
 *
 * Each transcript file is read synchronously and parsed with `parseVTT`,
 * using the module-level `argv.dedupe` and `skipFilters` settings. Every cue
 * is tagged with the transcript's character name, all cues are sorted by
 * their timestamp string, and the result is cut into `chunks` consecutive
 * slices. Each slice is written to its own file whose name is the output path
 * with `-<n>` inserted before the extension (or appended when the path has no
 * extension). The `-1` suffix is applied even when there is a single chunk.
 * When there are fewer lines than chunks, trailing files contain only the header.
 *
 * @param {Array} transcripts - Transcript details: objects with `name`, `role`,
 *   `character`, `description` and `filename`. Relative filenames are resolved
 *   against this script's directory; absolute filenames are used as-is.
 * @param {string} outputFilePath - Path to the output file.
 * @param {boolean} timestamped - Whether to include timestamps in the output.
 * @param {number} chunks - Number of parts to split the combined transcript
 *   into; must be a positive integer.
 * @throws {RangeError} If `chunks` is not a positive integer.
 * @throws {Error} Any error raised by reading a transcript or writing an output file.
 */
const combineTranscripts = (transcripts, outputFilePath, timestamped, chunks) => {
  // A zero, negative, fractional or NaN chunk count would otherwise write
  // nothing (or mislabelled files) and report success.
  if (!Number.isInteger(chunks) || chunks < 1) {
    throw new RangeError(`chunks must be a positive integer, received ${chunks}`);
  }

  const combinedLines = [];
  let summary = 'Summary:\n';
  transcripts.forEach(({ name, role, character, description, filename }) => {
    summary += `${name} - ${role} - ${character} - ${description}\n`;
    // resolve() behaves like join() for relative names but keeps absolute
    // paths intact instead of nesting them under the script directory.
    const filePath = path.resolve(__dirname, filename);
    const fileContent = fs.readFileSync(filePath, 'utf8');
    const parsedVTT = parseVTT(fileContent, argv.dedupe, skipFilters, filename);
    parsedVTT.forEach(([timestamp, content]) => {
      combinedLines.push({ timestamp, character, content });
    });
  });

  // Sort the lines based on their "start --> end" timestamp string.
  combinedLines.sort((a, b) => a.timestamp.localeCompare(b.timestamp));

  // Calculate chunk size
  const chunkLength = Math.ceil(combinedLines.length / chunks);

  // Non-global pattern, so test()/replace() carry no state between uses.
  const extensionPattern = /(\.[\w\d_-]+)$/i;
  const hasExtension = extensionPattern.test(outputFilePath);

  const outputFiles = [];
  for (let i = 0; i < chunks; i++) {
    const chunk = combinedLines.slice(i * chunkLength, (i + 1) * chunkLength);

    // Render each line, then join them into a single string.
    const combinedText = chunk
      .map(({ timestamp, character, content }) => (
        timestamped
          ? `[${timestamp}] ${character}: ${content}\n`
          : `${character}: ${content}\n`
      ))
      .join('');

    // Without the fallback, an extension-less output path would be left
    // unchanged and every chunk would overwrite the same file.
    const outputFilePathChunk = hasExtension
      ? outputFilePath.replace(extensionPattern, `-${i + 1}$1`)
      : `${outputFilePath}-${i + 1}`;

    let fileHeader = `${summary}\n`;
    if (chunks > 1) {
      fileHeader += `FILE ${i + 1} of ${chunks}\n\n`;
    }
    fileHeader += `TRANSCRIPT:\n`;

    fs.writeFileSync(outputFilePathChunk, `${fileHeader}${combinedText}`, 'utf8');
    outputFiles.push(outputFilePathChunk);
  }

  if (chunks > 1) {
    console.log(`\nTranscripts combined, chunked into ${chunks} files and saved to:`);
    outputFiles.forEach(file => console.log(` - ${file}`));
  } else {
    console.log(`\nTranscripts combined and saved to ${outputFiles[0]}`);
  }
};
// Parse command-line arguments.
// The per-player options (player-name, role, character-name,
// character-description, transcript) are all required array options.
const argv = yargs(hideBin(process.argv))
  .option('dedupe', {
    type: 'string',
    description: 'Deduplication strategy ("false", "consecutive", "unique")',
    default: 'false',
    choices: ['false', 'consecutive', 'unique'],
  })
  .option('skip-filter', {
    type: 'array',
    description: 'Skip messages containing these strings or matching these regex patterns',
    default: [],
    // Turns `/pattern/` values into RegExp objects; everything else is kept
    // as a plain substring filter.
    coerce: (arg) => {
      return arg.map((rawFilter) => {
        // Array items are not guaranteed to be strings (yargs may hand over
        // numeric-looking values as numbers), so normalize before using
        // string methods.
        const filter = String(rawFilter);
        // Require at least one character between the slashes: "/" and "//"
        // would otherwise become an empty pattern that matches every message.
        if (filter.length > 2 && filter.startsWith('/') && filter.endsWith('/')) {
          return new RegExp(filter.slice(1, -1));
        }
        return filter;
      });
    },
  })
  .option('timestamped', {
    type: 'boolean',
    description: 'Include timestamps in the output',
    default: true,
  })
  .option('output', {
    type: 'string',
    description: 'Path to the output file',
    demandOption: true,
  })
  .option('chunks', {
    type: 'number',
    description: 'Number of parts to split the combined transcript into',
    default: 1,
  })
  .array('player-name')
  .array('role')
  .array('character-name')
  .array('character-description')
  .array('transcript')
  .demandOption(['player-name', 'role', 'character-name', 'character-description', 'transcript'])
  .argv;
// Pull the parsed options into local names.
const {
  'player-name': playerNames,
  role,
  'character-name': characterNames,
  'character-description': characterDescriptions,
  transcript,
  dedupe,
  'skip-filter': skipFilters,
  timestamped,
  chunks,
} = argv;

// The per-player options are parallel arrays: entry N of each one describes
// the same player, so a length mismatch means the input is unusable.
const perPlayerArrays = [playerNames, role, characterNames, characterDescriptions, transcript];
const hasLengthMismatch = perPlayerArrays.some((values) => values.length !== playerNames.length);
if (hasLengthMismatch) {
  console.error('Error: All input arrays (player-name, role, character-name, character-description, transcript) must have the same length.');
  process.exit(1);
}

// Zip the parallel arrays into one record per transcript.
const transcripts = playerNames.map((name, index) => {
  return {
    name,
    role: role[index],
    character: characterNames[index],
    description: characterDescriptions[index],
    filename: transcript[index],
  };
});
// Combine the transcripts and save to file.
try {
  combineTranscripts(transcripts, argv.output, timestamped, chunks);
} catch (error) {
  console.error(`Error combining transcripts: ${error.message}`);
  // Report the failure to the calling shell; without this the process would
  // exit with status 0 even though no (or incomplete) output was written.
  process.exitCode = 1;
}