From 32c7ae8bf026b50e683f94cf24fd18891dee8f9f Mon Sep 17 00:00:00 2001
From: Joe Lencioni
Date: Thu, 13 Feb 2025 10:41:54 -0600
Subject: [PATCH] Refactor archiving to go through single deterministic function

We had three places that were interfacing with the archiver package, and
they all needed to do basically the same thing: create a deterministic
zip file. If the zip file is nondeterministic, downstream problems with
caching will arise.

Since creating a deterministic zip requires some special handling, I
decided it would be best to refactor this code so that it all runs
through the same function, which guarantees deterministic output.

While working on this, I noticed that the remoteRunner was using
archiver's directory function, which I believe can be nondeterministic,
so this change should also help stabilize those zip files.
---
 src/createStaticPackage.js        | 118 +++---------
 src/deterministicArchive.js       | 143 ++++++++++++++++++++++++++++++
 src/domRunner.js                  |   1 +
 src/prepareAssetsPackage.js       | 100 ++++++-------
 src/remoteRunner.js               |  61 ++-------
 test/deterministicArchive-test.js |  79 +++++++++++++++++
 test/integrations/react-test.js   |   2 -
 test/prepareAssetsPackage-test.js |  19 ++--
 8 files changed, 287 insertions(+), 236 deletions(-)
 create mode 100644 src/deterministicArchive.js
 create mode 100644 test/deterministicArchive-test.js

diff --git a/src/createStaticPackage.js b/src/createStaticPackage.js
index 1b2bc19..47fcc25 100644
--- a/src/createStaticPackage.js
+++ b/src/createStaticPackage.js
@@ -1,11 +1,4 @@
-import { Writable } from 'stream';
-import crypto from 'crypto';
-import path from 'path';
-import fs from 'fs';
-
-import Archiver from 'archiver';
-
-import validateArchive from './validateArchive';
+import deterministicArchive from './deterministicArchive';
 
 // We're setting the creation date to the same for all files so that the zip
 // packages created for the same content ends up having the same fingerprint.
@@ -24,107 +17,20 @@ const IFRAME_CONTENT = `
 `;
 
 /**
- * Gets all files in a directory and all of its subdirectories
- *
- * The returned value is sorted for deterministic output.
+ * Creates a static package of the given files
  *
- * @param {string} dir
- * @returns {Array}
+ * @param {Object} options
+ * @param {string} options.tmpdir
+ * @param {string[]} options.publicFolders
+ * @returns {Promise<{buffer: Buffer, hash: string}>}
  */
-function getFilesRecursive(dir) {
-  const files = [];
-  const dirs = [dir];
-
-  while (dirs.length > 0) {
-    const currentDir = dirs.shift();
-    // Node 18 adds `recursive` option to `fs.readdirSync`, which would
-    // make this function simpler.
-    const dirents = fs.readdirSync(currentDir, { withFileTypes: true });
-
-    for (const dirent of dirents) {
-      const res = path.resolve(currentDir, dirent.name);
-
-      if (dirent.isDirectory()) {
-        // This is a directory, so we are going to add it to the list of directories to recurse into.
-        dirs.push(res);
-        files.push(res);
-      } else {
-        files.push(res);
-      }
-    }
-  }
-
-  // Sort the files for deterministic output. This is important so that
-  // the archive hash is the same.
-  files.sort((a, b) => a.localeCompare(b));
-  return files;
-}
-
-export default function createStaticPackage({ tmpdir, publicFolders }) {
-  return new Promise((resolve, reject) => {
-    const archive = new Archiver('zip', {
-      // Concurrency in the stat queue leads to non-deterministic output.
-      // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2253139948
-      statConcurrency: 1,
-      zlib: { level: 6 },
-    });
-
-    const stream = new Writable();
-    const data = [];
-
-    stream._write = (chunk, _enc, done) => {
-      data.push(...chunk);
-      done();
-    };
-
-    const entries = [];
-    archive.on('entry', (entry) => {
-      entries.push(entry);
-    });
-
-    stream.on('finish', () => {
-      validateArchive(archive.pointer(), entries);
-      const buffer = Buffer.from(data);
-      const hash = crypto.createHash('md5').update(buffer).digest('hex');
-
-      resolve({ buffer, hash });
-    });
-    archive.pipe(stream);
-
-    // We can't use archive.directory() here because it is not deterministic.
-    // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2252938075
-    const tmpdirFiles = getFilesRecursive(tmpdir);
-    for (const file of tmpdirFiles) {
-      archive.file(file, {
-        name: file.slice(tmpdir.length + 1),
-        prefix: '',
-        date: FILE_CREATION_DATE,
-      });
-    }
-
-    publicFolders.forEach((folder) => {
-      if (folder === tmpdir) {
-        // ignore, since this is handled separately
-      } else if (folder.startsWith(process.cwd())) {
-        const folderFiles = getFilesRecursive(folder);
-        for (const file of folderFiles) {
-          archive.file(file, {
-            name: file.slice(folder.length + 1),
-            prefix: '',
-            date: FILE_CREATION_DATE,
-          });
-        }
-      }
-    });
-
-    archive.append(IFRAME_CONTENT, {
+export default async function createStaticPackage({ tmpdir, publicFolders }) {
+  return deterministicArchive(Array.from(new Set([tmpdir, ...publicFolders])), [
+    {
       name: 'iframe.html',
-      date: FILE_CREATION_DATE,
-    });
-
-    archive.on('error', reject);
-    archive.finalize();
-  });
+      content: IFRAME_CONTENT,
+    },
+  ]);
 }
 
 export { FILE_CREATION_DATE };
diff --git a/src/deterministicArchive.js b/src/deterministicArchive.js
new file mode 100644
index 0000000..d57d7ee
--- /dev/null
+++ b/src/deterministicArchive.js
@@ -0,0 +1,143 @@
+import { Writable } from 'stream';
+import crypto from 'crypto';
+import fs from 'fs';
+import path from 'path';
+
+import Archiver from 'archiver';
+
+import validateArchive from './validateArchive';
+
+// We're setting the creation date to the same for all files so that the zip
+// packages created for the same content end up having the same fingerprint.
+const FILE_CREATION_DATE = new Date('Fri Feb 08 2019 13:31:55 GMT+0100 (CET)');
+
+/**
+ * Resolves all files in a directory and all of its subdirectories
+ *
+ * @param {string} dirOrFile
+ * @returns {Promise<Array<{name: string, path: string}>>}
+ */
+async function resolveFilesRecursiveForDir(dirOrFile) {
+  const isDir = (await fs.promises.lstat(dirOrFile)).isDirectory();
+
+  if (isDir) {
+    const files = await fs.promises.readdir(dirOrFile, {
+      withFileTypes: true,
+      recursive: true,
+    });
+
+    return files
+      .filter((dirent) => dirent.isFile())
+      .map((dirent) => {
+        const fullPath = path.resolve(dirent.path, dirent.name);
+        return {
+          name: fullPath.slice(dirOrFile.length + 1),
+          path: fullPath,
+        };
+      });
+  }
+
+  return [
+    {
+      name: path.basename(dirOrFile),
+      path: path.resolve(dirOrFile),
+    },
+  ];
+}
+
+/**
+ * Resolves all files in all directories recursively
+ *
+ * @param {...string} dirsAndFiles
+ * @returns {Promise<Array<{name: string, path: string}>>}
+ */
+async function resolveFilesRecursive(...dirsAndFiles) {
+  const files = await Promise.all(
+    dirsAndFiles.filter(Boolean).map(resolveFilesRecursiveForDir),
+  );
+
+  return files.flat();
+}
+
+/**
+ * Creates a deterministic archive of the given files
+ *
+ * @param {string[]} dirsAndFiles
+ * @param {{name: string, content: string}[]} contentToArchive
+ * @returns {Promise<{buffer: Buffer, hash: string}>}
+ */
+export default async function deterministicArchive(
+  dirsAndFiles,
+  contentToArchive = [],
+) {
+  const uniqueDirsAndFiles = Array.from(new Set(dirsAndFiles));
+
+  // Sort by path to make the output deterministic
+  const filesToArchiveSorted = (
+    await resolveFilesRecursive(...uniqueDirsAndFiles)
+  ).sort((a, b) => a.path.localeCompare(b.path));
+
+  const contentToArchiveSorted = contentToArchive.sort((a, b) =>
+    a.name.localeCompare(b.name),
+  );
+
+  return new Promise((resolve, reject) => {
+    const archive = new Archiver('zip', {
+      // Concurrency in the stat queue leads to non-deterministic output.
+      // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2253139948
+      statConcurrency: 1,
+      zlib: { level: 6 },
+    });
+
+    const stream = new Writable();
+    const data = [];
+
+    stream._write = (chunk, _enc, done) => {
+      data.push(...chunk);
+      done();
+    };
+
+    const entries = [];
+    archive.on('entry', (entry) => {
+      entries.push(entry);
+    });
+
+    stream.on('finish', () => {
+      validateArchive(archive.pointer(), entries);
+      const buffer = Buffer.from(data);
+      const hash = crypto.createHash('md5').update(buffer).digest('hex');
+
+      resolve({ buffer, hash });
+    });
+    archive.pipe(stream);
+
+    const seenFiles = new Set();
+
+    // We can't use archive.directory() here because it is not deterministic.
+    // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2252938075
+    for (const file of filesToArchiveSorted) {
+      if (!seenFiles.has(file.name)) {
+        archive.file(file.path, {
+          name: file.name,
+          prefix: '',
+          date: FILE_CREATION_DATE,
+        });
+        seenFiles.add(file.name);
+      }
+    }
+
+    for (const file of contentToArchiveSorted) {
+      if (!seenFiles.has(file.name)) {
+        archive.append(file.content, {
+          name: file.name,
+          prefix: '',
+          date: FILE_CREATION_DATE,
+        });
+        seenFiles.add(file.name);
+      }
+    }
+
+    archive.on('error', reject);
+    archive.finalize();
+  });
+}
diff --git a/src/domRunner.js b/src/domRunner.js
index feb37e9..a39269c 100644
--- a/src/domRunner.js
+++ b/src/domRunner.js
@@ -110,6 +110,7 @@ async function uploadStaticPackage({
     tmpdir,
     publicFolders,
   });
+
   const assetsPath = await uploadAssets(buffer, {
     apiKey,
     apiSecret,
diff --git a/src/prepareAssetsPackage.js b/src/prepareAssetsPackage.js
index 92a99ba..ddcd30a 100644
--- a/src/prepareAssetsPackage.js
+++ b/src/prepareAssetsPackage.js
@@ -1,81 +1,51 @@
-import { Writable } from 'stream';
-import crypto from 'crypto';
 import fs from 'fs';
 import path from 'path';
 
-import Archiver from 'archiver';
-
 import findCSSAssetPaths from './findCSSAssetPaths';
-
-// We're setting the creation date to the same for all files so that the zip
-// packages created for the same content ends up having the same fingerprint.
-const FILE_CREATION_DATE = new Date('Fri Feb 08 2019 13:31:55 GMT+0100 (CET)');
+import deterministicArchive from './deterministicArchive';
 
 function makePackage({ paths, publicFolders }) {
-  return new Promise((resolve, reject) => {
-    const archive = new Archiver('zip', {
-      // Concurrency in the stat queue leads to non-deterministic output.
-      // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2253139948
-      statConcurrency: 1,
-      zlib: { level: 6 },
-    });
+  const publicFoldersResolved = publicFolders.map((folder) =>
+    folder.startsWith('/') ? folder : path.resolve(process.cwd(), folder),
+  );
 
-    const stream = new Writable();
-    const data = [];
-
-    stream._write = (chunk, enc, done) => {
-      data.push(...chunk);
-      done();
-    };
-    stream.on('finish', () => {
-      const buffer = Buffer.from(data);
-      const hash = crypto.createHash('md5').update(buffer).digest('hex');
-      resolve({ buffer, hash });
-    });
-    archive.pipe(stream);
+  const files = Object.entries(paths).map(([assetPath, resolvePath]) => {
+    if (!resolvePath) {
+      throw new Error(`Unable to resolve asset path: ${assetPath}`);
+    }
 
-    Object.keys(paths).forEach((assetPath) => {
-      const resolvePath = paths[assetPath];
+    for (const publicFolder of publicFoldersResolved) {
+      const fullPath = path.join(publicFolder, assetPath);
+      if (fs.existsSync(fullPath)) {
+        return {
+          content: fs.createReadStream(fullPath),
+          name: resolvePath,
+        };
+      }
 
-      if (!resolvePath) {
-        throw new Error(`Unable to resolve asset path: ${assetPath}`);
+      // findCSSAssetPaths will sometimes return absolute paths
+      if (assetPath.startsWith(publicFolder) && fs.existsSync(assetPath)) {
+        return {
+          content: fs.createReadStream(assetPath),
+          name: resolvePath,
+        };
       }
 
-      for (const publicFolder of publicFolders) {
-        const folder = publicFolder.startsWith('/')
-          ? publicFolder
-          : path.resolve(process.cwd(), publicFolder);
-        const fullPath = path.join(folder, assetPath);
-        if (fs.existsSync(fullPath)) {
-          archive.append(fs.createReadStream(fullPath), {
-            name: resolvePath,
-            date: FILE_CREATION_DATE,
-          });
-          return;
-        }
-        // findCSSAssetPaths will sometimes return absolute paths
-        if (assetPath.startsWith(folder) && fs.existsSync(assetPath)) {
-          archive.append(fs.createReadStream(assetPath), {
-            name: resolvePath,
-            date: FILE_CREATION_DATE,
-          });
-          return;
-        }
-        // as a last fallback, check if the resolve path exists in the public folder
-        const fullResolvePath = path.join(folder, resolvePath);
-        if (fs.existsSync(fullResolvePath)) {
-          archive.append(fs.createReadStream(fullResolvePath), {
-            name: resolvePath,
-            date: FILE_CREATION_DATE,
-          });
-          return;
-        }
+      // as a last fallback, check if the resolve path exists in the public
+      // folder
+      const fullResolvePath = path.join(publicFolder, resolvePath);
+      if (fs.existsSync(fullResolvePath)) {
+        return {
+          content: fs.createReadStream(fullResolvePath),
+          name: resolvePath,
+        };
       }
-    });
+    }
 
-    archive.on('error', reject);
-    archive.finalize();
+    return null;
   });
+
+  return deterministicArchive([], files.filter(Boolean));
 }
 
 export default function prepareAssetsPackage({
@@ -84,11 +54,13 @@
   publicFolders,
 }) {
   const paths = {};
+
   globalCSS.forEach(({ css, source }) => {
     findCSSAssetPaths({ css, source }).forEach(({ assetPath, resolvePath }) => {
       paths[assetPath] = resolvePath;
     });
   });
+
   snapPayloads.forEach(({ assetPaths }) => {
     assetPaths.forEach((assetPath) => {
       paths[assetPath] = assetPath;
diff --git a/src/remoteRunner.js b/src/remoteRunner.js
index fed7f7e..e76bd1b 100644
--- a/src/remoteRunner.js
+++ b/src/remoteRunner.js
@@ -1,53 +1,16 @@
-import crypto from 'crypto';
-import fs from 'fs';
-import os from 'os';
-import path from 'path';
-
-import Archiver from 'archiver';
-
-import { FILE_CREATION_DATE } from './createStaticPackage';
 import Logger, { logTag } from './Logger';
 import constructReport from './constructReport';
 import createHash from './createHash';
 import ensureTarget from './ensureTarget';
 import loadCSSFile from './loadCSSFile';
 import uploadAssets from './uploadAssets';
-import validateArchive from './validateArchive';
-
-function staticDirToZipFile(dir) {
-  return new Promise((resolve, reject) => {
-    const archive = new Archiver('zip', {
-      // Concurrency in the stat queue leads to non-deterministic output.
-      // https://github.com/archiverjs/node-archiver/issues/383#issuecomment-2253139948
-      statConcurrency: 1,
-      zlib: { level: 6 },
-    });
-
-    const rnd = crypto.randomBytes(4).toString('hex');
-    const pathToZipFile = path.join(os.tmpdir(), `happo-static-${rnd}.zip`);
-    const output = fs.createWriteStream(pathToZipFile);
-    const entries = [];
-
-    archive.on('entry', (entry) => {
-      entries.push(entry);
-    });
-
-    output.on('close', async () => {
-      validateArchive(archive.pointer(), entries);
-      resolve(pathToZipFile);
-    });
-    archive.pipe(output);
-    archive.directory(dir, false, { date: FILE_CREATION_DATE });
-    archive.on('error', reject);
-    archive.finalize();
-  });
-}
+import deterministicArchive from './deterministicArchive';
 
 async function resolvePackageData(staticPackage) {
   if (typeof staticPackage === 'string') {
     // legacy plugins
     const buffer = Buffer.from(staticPackage, 'base64');
-    return { value: buffer, hash: createHash(buffer) };
+    return { buffer, hash: createHash(buffer) };
   }
 
   if (!staticPackage.path) {
@@ -56,20 +19,8 @@
     );
   }
 
-  const file = await staticDirToZipFile(staticPackage.path);
-
-  const readStream = fs.createReadStream(file);
-  const hash = await new Promise((resolve) => {
-    const hashCreator = crypto.createHash('md5');
-    readStream.pipe(hashCreator);
-    hashCreator.setEncoding('hex');
-    readStream.on('end', () => {
-      hashCreator.end();
-      resolve(hashCreator.read());
-    });
-  });
-  readStream.destroy();
-  return { value: fs.createReadStream(file), hash };
+  const archive = await deterministicArchive([staticPackage.path]);
+  return archive;
 }
 
 export default async function remoteRunner(
@@ -83,8 +34,8 @@
     logger.info(`${logTag(project)}Generating static package...`);
     const staticPackage = await generateStaticPackage();
 
-    const { value, hash } = await resolvePackageData(staticPackage);
-    const staticPackagePath = await uploadAssets(value, {
+    const { buffer, hash } = await resolvePackageData(staticPackage);
+    const staticPackagePath = await uploadAssets(buffer, {
       hash,
       endpoint,
       apiSecret,
diff --git a/test/deterministicArchive-test.js b/test/deterministicArchive-test.js
new file mode 100644
index 0000000..9028b00
--- /dev/null
+++ b/test/deterministicArchive-test.js
@@ -0,0 +1,79 @@
+import path from 'path';
+
+import AdmZip from 'adm-zip';
+
+import deterministicArchive from '../src/deterministicArchive';
+
+const publicFolders = [
+  __dirname, // absolute path
+  'test/integrations/assets', // relative
+];
+
+const tmpdir = path.resolve(__dirname, 'assets');
+
+it('creates a package', async () => {
+  const result = await deterministicArchive([tmpdir, ...publicFolders]);
+
+  expect(result.buffer).not.toBe(undefined);
+  expect(result.hash).not.toBe(undefined);
+});
+
+it('creates deterministic hashes when content has not changed', async () => {
+  const promises = Array.from({ length: 20 }).map(() =>
+    deterministicArchive([tmpdir, ...publicFolders]),
+  );
+  const results = await Promise.all(promises);
+  const hashes = results.map(({ hash }) => hash);
+
+  expect(hashes).toHaveLength(20);
+  expect(hashes[0]).not.toBeUndefined();
+  expect(typeof hashes[0]).toBe('string');
+  expect(hashes[0].length).toBeGreaterThan(0);
+  expect(hashes.every((hash) => hash === hashes[0])).toBe(true);
+});
+
+it('picks out the right files', async () => {
+  const { buffer } = await deterministicArchive([tmpdir, ...publicFolders]);
+
+  const zip = new AdmZip(buffer);
+  expect(
+    zip
+      .getEntries()
+      .map(({ entryName }) => entryName)
+      .includes('solid-white.png'),
+  ).toBeTruthy();
+});
+
+it('does not include duplicate files', async () => {
+  const resultNormal = await deterministicArchive([tmpdir, ...publicFolders]);
+  const resultWithPossibleDuplicates = await deterministicArchive([
+    tmpdir,
+    tmpdir,
+    ...publicFolders,
+    ...publicFolders,
+  ]);
+  expect(resultNormal.hash).toEqual(resultWithPossibleDuplicates.hash);
+  expect(resultNormal.buffer).toEqual(resultWithPossibleDuplicates.buffer);
+
+  const zip = new AdmZip(resultWithPossibleDuplicates.buffer);
+  const entries = zip
+    .getEntries()
+    .map(({ entryName }) => entryName)
+    // This file is gitignored, so we want to filter it out to make local and CI
+    // runs consistent.
+    .filter((entryName) => !entryName.includes('.DS_Store'));
+
+  expect(entries).toHaveLength(84);
+});
+
+it('can include in-memory content', async () => {
+  const content = 'hi friends';
+  const result = await deterministicArchive(
+    [tmpdir, ...publicFolders],
+    [{ name: 'my-in-memory-file.txt', content }],
+  );
+
+  const zip = new AdmZip(result.buffer);
+  const myFile = zip.getEntry('my-in-memory-file.txt');
+  expect(myFile.getData().toString()).toBe(content);
+});
diff --git a/test/integrations/react-test.js b/test/integrations/react-test.js
index 36b7b05..370b518 100644
--- a/test/integrations/react-test.js
+++ b/test/integrations/react-test.js
@@ -336,8 +336,6 @@ it('works with prerender=false', async () => {
     'iframe.html',
     'index.html',
     'one.jpg',
-    'static/',
-    'static/media/',
     'static/media/1x1.f9992a90.png',
   ]);
   // require('fs').writeFileSync('staticPackage.zip',
diff --git a/test/prepareAssetsPackage-test.js b/test/prepareAssetsPackage-test.js
index f460f62..d18a71b 100644
--- a/test/prepareAssetsPackage-test.js
+++ b/test/prepareAssetsPackage-test.js
@@ -44,14 +44,15 @@ it('creates deterministic hashes when content has not changed', async () => {
 });
 
 it('creates consistent results', async () => {
-  const first = await subject();
-  await new Promise((resolve) => {
-    setTimeout(resolve, 2000);
-  });
-  const filename = path.resolve(__dirname, 'inlineResources/1x1.png');
-  fs.utimesSync(filename, new Date(), new Date());
-  const second = await subject();
-  expect(first).toEqual(second);
+  const promises = Array.from({ length: 20 }).map(() => subject());
+  const results = await Promise.all(promises);
+  const hashes = results.map(({ hash }) => hash);
+
+  expect(hashes).toHaveLength(20);
+  expect(hashes[0]).not.toBeUndefined();
+  expect(typeof hashes[0]).toBe('string');
+  expect(hashes[0].length).toBeGreaterThan(0);
+  expect(hashes.every((hash) => hash === hashes[0])).toBe(true);
 });
 
 it('does not fail when files are missing', async () => {
@@ -66,7 +67,7 @@ it('picks out the right files', async () => {
   const zip = new AdmZip(buffer);
   expect(zip.getEntries().map(({ entryName }) => entryName)).toEqual([
     '1x1.jpg', // this is in the root because the css is always served from the root on happo workers
-    'one.jpg',
     'inlineResources/1x1.png',
+    'one.jpg',
  ]);
 });
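
Note (editorial, not part of the patch): a minimal usage sketch of the new
deterministicArchive function follows. It relies only on the call signature
and the { buffer, hash } result shape introduced in src/deterministicArchive.js
above; the './static' directory and the output file name are hypothetical.

// Usage sketch (hypothetical paths; the API shape comes from the diff above).
import fs from 'fs';

import deterministicArchive from './deterministicArchive';

async function main() {
  // Archive everything under ./static plus one in-memory file. Entries are
  // sorted and stamped with a fixed date, so repeated runs over unchanged
  // content produce byte-identical buffers and therefore identical hashes.
  const { buffer, hash } = await deterministicArchive(
    ['./static'],
    [{ name: 'greeting.txt', content: 'hello' }],
  );

  fs.writeFileSync(`package-${hash}.zip`, buffer);
}

main();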