Refactor archiving to go through single deterministic function

We had three places that were interfacing with the archiver package, and
they all needed to do basically the same thing: create a deterministic zip
file. If the zip file is nondeterministic, then downstream problems with
caching will arise. Since creating a deterministic zip requires some
special handling, I decided that it would be best to refactor this code
to all run through the same function that guarantees deterministic
output.

While working on this I noticed that the remoteRunner was using
archiver's directory function, which I believe can be nondeterministic,
so this should help to stabilize those zip files.
lencioni committed Feb 13, 2025
1 parent 8fdb4f4 commit 32c7ae8
Showing 8 changed files with 287 additions and 236 deletions.
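
As a rough illustration of the consolidation described in the commit message: every place that produces a zip now calls the same helper. The snippet below is a sketch only; the other call sites (such as the remoteRunner) are not part of this diff, and the packageSnapshotAssets wrapper and its input path are assumed for the example.

// Hypothetical call site: code that previously drove archiver directly now
// funnels through deterministicArchive, which sorts entries, pins file dates,
// and hashes the resulting zip buffer.
import deterministicArchive from './deterministicArchive';

export async function packageSnapshotAssets(snapshotDir) {
  const { buffer, hash } = await deterministicArchive(
    [snapshotDir],
    [
      // In-memory entries can be bundled alongside files on disk.
      { name: 'manifest.json', content: '{}' },
    ],
  );
  return { buffer, hash };
}
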
118 changes: 12 additions & 106 deletions src/createStaticPackage.js
@@ -1,11 +1,4 @@
-import { Writable } from 'stream';
-import crypto from 'crypto';
-import path from 'path';
-import fs from 'fs';
-
-import Archiver from 'archiver';
-
-import validateArchive from './validateArchive';
+import deterministicArchive from './deterministicArchive';
 
 // We're setting the creation date to the same for all files so that the zip
 // packages created for the same content ends up having the same fingerprint.
@@ -24,107 +17,20 @@ const IFRAME_CONTENT = `
 `;
 
 /**
- * Gets all files in a directory and all of its subdirectories
- *
- * The returned value is sorted for deterministic output.
+ * Creates a static package of the given files
  *
- * @param {string} dir
- * @returns {Array<string>}
+ * @param {Object} options
+ * @param {string} options.tmpdir
+ * @param {string[]} options.publicFolders
+ * @returns {Promise<{buffer: Buffer, hash: string}>}
  */
-function getFilesRecursive(dir) {
-  const files = [];
-  const dirs = [dir];
-
-  while (dirs.length > 0) {
-    const currentDir = dirs.shift();
-    // Node 18 adds `recursive` option to `fs.readdirSync`, which would
-    // make this function simpler.
-    const dirents = fs.readdirSync(currentDir, { withFileTypes: true });
-
-    for (const dirent of dirents) {
-      const res = path.resolve(currentDir, dirent.name);
-
-      if (dirent.isDirectory()) {
-        // This is a directory, so we are going to add it to the list of directories to recurse into.
-        dirs.push(res);
-        files.push(res);
-      } else {
-        files.push(res);
-      }
-    }
-  }
-
-  // Sort the files for deterministic output. This is important so that
-  // the archive hash is the same.
-  files.sort((a, b) => a.localeCompare(b));
-  return files;
-}
-
-export default function createStaticPackage({ tmpdir, publicFolders }) {
-  return new Promise((resolve, reject) => {
-    const archive = new Archiver('zip', {
-      // Concurrency in the stat queue leads to non-deterministic output.
-      // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2253139948
-      statConcurrency: 1,
-      zlib: { level: 6 },
-    });
-
-    const stream = new Writable();
-    const data = [];
-
-    stream._write = (chunk, _enc, done) => {
-      data.push(...chunk);
-      done();
-    };
-
-    const entries = [];
-    archive.on('entry', (entry) => {
-      entries.push(entry);
-    });
-
-    stream.on('finish', () => {
-      validateArchive(archive.pointer(), entries);
-      const buffer = Buffer.from(data);
-      const hash = crypto.createHash('md5').update(buffer).digest('hex');
-
-      resolve({ buffer, hash });
-    });
-    archive.pipe(stream);
-
-    // We can't use archive.directory() here because it is not deterministic.
-    // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2252938075
-    const tmpdirFiles = getFilesRecursive(tmpdir);
-    for (const file of tmpdirFiles) {
-      archive.file(file, {
-        name: file.slice(tmpdir.length + 1),
-        prefix: '',
-        date: FILE_CREATION_DATE,
-      });
-    }
-
-    publicFolders.forEach((folder) => {
-      if (folder === tmpdir) {
-        // ignore, since this is handled separately
-      } else if (folder.startsWith(process.cwd())) {
-        const folderFiles = getFilesRecursive(folder);
-        for (const file of folderFiles) {
-          archive.file(file, {
-            name: file.slice(folder.length + 1),
-            prefix: '',
-            date: FILE_CREATION_DATE,
-          });
-        }
-      }
-    });
-
-    archive.append(IFRAME_CONTENT, {
+export default async function createStaticPackage({ tmpdir, publicFolders }) {
+  return deterministicArchive(Array.from(new Set([tmpdir, ...publicFolders])), [
+    {
       name: 'iframe.html',
-      date: FILE_CREATION_DATE,
-    });
-
-    archive.on('error', reject);
-    archive.finalize();
-  });
+      content: IFRAME_CONTENT,
+    },
+  ]);
 }
 
 export { FILE_CREATION_DATE };
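
For context, a minimal usage sketch of the refactored createStaticPackage; the directory paths below are hypothetical and only illustrate the unchanged { buffer, hash } contract.

import createStaticPackage from './createStaticPackage';

export async function buildStaticPackage() {
  // Hypothetical inputs: tmpdir holds generated pages, publicFolders holds
  // extra static assets to bundle with them.
  const { buffer, hash } = await createStaticPackage({
    tmpdir: '/tmp/generated-pages',
    publicFolders: ['/my-project/public'],
  });

  // Identical content yields an identical buffer, so the md5 hash can serve
  // as a stable cache key for uploads.
  return { buffer, hash };
}
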
143 changes: 143 additions & 0 deletions src/deterministicArchive.js
@@ -0,0 +1,143 @@
+import { Writable } from 'stream';
+import crypto from 'crypto';
+import fs from 'fs';
+import path from 'path';
+
+import Archiver from 'archiver';
+
+import validateArchive from './validateArchive';
+
+// We're setting the creation date to the same for all files so that the zip
+// packages created for the same content ends up having the same fingerprint.
+const FILE_CREATION_DATE = new Date('Fri Feb 08 2019 13:31:55 GMT+0100 (CET)');
+
+/**
+ * Resolves all files in a directory and all of its subdirectories
+ *
+ * @param {string} dirOrFile
+ * @returns {Promise<Array<{name: string, path: string}>>}
+ */
+async function resolveFilesRecursiveForDir(dirOrFile) {
+  const isDir = (await fs.promises.lstat(dirOrFile)).isDirectory();
+
+  if (isDir) {
+    const files = await fs.promises.readdir(dirOrFile, {
+      withFileTypes: true,
+      recursive: true,
+    });
+
+    return files
+      .filter((dirent) => dirent.isFile())
+      .map((dirent) => {
+        const fullPath = path.resolve(dirent.path, dirent.name);
+        return {
+          name: fullPath.slice(dirOrFile.length + 1),
+          path: fullPath,
+        };
+      });
+  }
+
+  return [
+    {
+      name: path.basename(dirOrFile),
+      path: path.resolve(dirOrFile),
+    },
+  ];
+}
+
+/**
+ * Resolves all files in all directories recursively
+ *
+ * @param {...string} dirsAndFiles
+ * @returns {Promise<Array<{name: string, path: string}>>}
+ */
+async function resolveFilesRecursive(...dirsAndFiles) {
+  const files = await Promise.all(
+    dirsAndFiles.filter(Boolean).map(resolveFilesRecursiveForDir),
+  );
+
+  return files.flat();
+}
+
+/**
+ * Creates a deterministic archive of the given files
+ *
+ * @param {string[]} dirsAndFiles
+ * @param {{name: string, content: string}[]} contentToArchive
+ * @returns {Promise<{buffer: Buffer, hash: string}>}
+ */
+export default async function deterministicArchive(
+  dirsAndFiles,
+  contentToArchive = [],
+) {
+  const uniqueDirsAndFiles = Array.from(new Set(dirsAndFiles));
+
+  // Sort by path to make the output deterministic
+  const filesToArchiveSorted = (
+    await resolveFilesRecursive(...uniqueDirsAndFiles)
+  ).sort((a, b) => a.path.localeCompare(b.path));
+
+  const contentToArchiveSorted = contentToArchive.sort((a, b) =>
+    a.name.localeCompare(b.name),
+  );
+
+  return new Promise((resolve, reject) => {
+    const archive = new Archiver('zip', {
+      // Concurrency in the stat queue leads to non-deterministic output.
+      // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2253139948
+      statConcurrency: 1,
+      zlib: { level: 6 },
+    });
+
+    const stream = new Writable();
+    const data = [];
+
+    stream._write = (chunk, _enc, done) => {
+      data.push(...chunk);
+      done();
+    };
+
+    const entries = [];
+    archive.on('entry', (entry) => {
+      entries.push(entry);
+    });
+
+    stream.on('finish', () => {
+      validateArchive(archive.pointer(), entries);
+      const buffer = Buffer.from(data);
+      const hash = crypto.createHash('md5').update(buffer).digest('hex');
+
+      resolve({ buffer, hash });
+    });
+    archive.pipe(stream);
+
+    const seenFiles = new Set();
+
+    // We can't use archive.directory() here because it is not deterministic.
+    // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2252938075
+    for (const file of filesToArchiveSorted) {
+      if (!seenFiles.has(file.name)) {
+        archive.file(file.path, {
+          name: file.name,
+          prefix: '',
+          date: FILE_CREATION_DATE,
+        });
+        seenFiles.add(file.name);
+      }
+    }
+
+    for (const file of contentToArchiveSorted) {
+      if (!seenFiles.has(file.name)) {
+        archive.append(file.content, {
+          name: file.name,
+          prefix: '',
+          date: FILE_CREATION_DATE,
+        });
+        seenFiles.add(file.name);
+      }
+    }
+
+    archive.on('error', reject);
+    archive.finalize();
+  });
+}
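
A quick sketch of how the determinism guarantee could be checked; the inputs below are made up for illustration.

import assert from 'assert';

import deterministicArchive from './deterministicArchive';

async function verifyStableHash() {
  // Hypothetical inputs: one directory on disk plus one in-memory entry.
  const dirsAndFiles = ['/my-project/assets'];
  const inMemory = [{ name: 'iframe.html', content: '<html></html>' }];

  const first = await deterministicArchive(dirsAndFiles, inMemory);
  const second = await deterministicArchive(dirsAndFiles, inMemory);

  // Sorted entries, the fixed FILE_CREATION_DATE, statConcurrency: 1, and
  // de-duplication by entry name make the zip byte-for-byte reproducible,
  // so the md5 hashes match across runs.
  assert.strictEqual(first.hash, second.hash);
}

verifyStableHash();
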
1 change: 1 addition & 0 deletions src/domRunner.js
@@ -110,6 +110,7 @@ async function uploadStaticPackage({
     tmpdir,
     publicFolders,
   });
+
   const assetsPath = await uploadAssets(buffer, {
     apiKey,
     apiSecret,