diff --git a/Handler.cpp b/Handler.cpp
index 1a7dbd7..fa5e1a3 100644
--- a/Handler.cpp
+++ b/Handler.cpp
@@ -13,6 +13,7 @@
 #include "RegisterArc.h"
 #include "StringConvert.h"
 #include "libvpk++.h"
+#include "robin_hood.h"
 
 #include
 
@@ -42,7 +43,7 @@ STDMETHODIMP CHandler::Open( IInStream* inStream, const UInt64* maxCheckStartPos
     try
     {
         RINOK( vpk.open( inStream, callback ) );
-        const auto& f = vpk.files().container();
+        const auto& f = vpk.files();
         int largestId = -1;
         int preloadSize = 0;
         UInt64 s = 0;
@@ -81,7 +82,10 @@ STDMETHODIMP CHandler::Open( IInStream* inStream, const UInt64* maxCheckStartPos
         if ( end > 3 && name[end - 1] == L'r' && name[end - 2] == L'i' && name[end - 3] == L'd' )
             name.DeleteFrom( end - 3 );
         else
+        {
+            name.DeleteFrom( end );
             name += '_';
+        }
 
         for ( int i = 0; i < largestId + 1; i++ )
         {
@@ -196,7 +200,7 @@ STDMETHODIMP CHandler::GetArchiveProperty( PROPID propID, PROPVARIANT* value ) M
 STDMETHODIMP CHandler::GetProperty( UInt32 index, PROPID propID, PROPVARIANT* value ) MY_NO_THROW_DECL_ONLY
 {
     COM_TRY_BEGIN
-    const auto& i = vpk.files().container().at( index );
+    const auto& i = vpk.files().at( index );
     const auto& item = i.second;
     NWindows::NCOM::CPropVariant prop;
     switch ( propID )
@@ -233,7 +237,7 @@ STDMETHODIMP CHandler::GetProperty( UInt32 index, PROPID propID, PROPVARIANT* va
 STDMETHODIMP CHandler::Extract( const UInt32* indices, UInt32 numItems, Int32 testMode, IArchiveExtractCallback* extractCallback ) MY_NO_THROW_DECL_ONLY
 {
     COM_TRY_BEGIN
-    const auto& f = vpk.files().container();
+    const auto& f = vpk.files();
     const bool allFilesMode = numItems == (UInt32)(Int32)-1;
     if ( allFilesMode )
         numItems = static_cast<UInt32>( f.size() );
@@ -312,7 +316,7 @@ STDMETHODIMP CHandler::Extract( const UInt32* indices, UInt32 numItems, Int32 te
 STDMETHODIMP CHandler::GetStream( UInt32 index, ISequentialInStream** stream )
 {
     *stream = nullptr;
-    const auto& i = vpk.files().container().at( index ).second;
+    const auto& i = vpk.files().at( index ).second;
     if ( missingFiles && !paks[i.archiveIdx] )
         return HRESULT_FROM_WIN32( ERROR_FILE_NOT_FOUND );
 
@@ -335,9 +339,8 @@ STDMETHODIMP CHandler::GetStream( UInt32 index, ISequentialInStream** stream )
     auto& preload = limitedStream->Buffer;
     preload.Alloc( i.preloadLength );
 
-    size_t size = i.preloadLength;
     RINOK( basePak->Seek( i.preloadOffset, STREAM_SEEK_SET, nullptr ) );
-    RINOK( ReadStream( basePak, preload, &size ) );
+    RINOK( ReadStream_FAIL( basePak, preload, i.preloadLength ) );
     limitedStream->SetCache( i.preloadLength, 0 );
     RINOK( limitedStream->InitAndSeek( 0, i.fileLength ) );
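Review note on the ReadStream → ReadStream_FAIL switch above: plain ReadStream reports however many bytes it managed to read, so a truncated pak could silently hand back a short preload buffer, while the _FAIL variant turns a short read into an error. A minimal sketch of the two semantics with hypothetical names (ReadSome/ReadExactly are stand-ins, not the 7-Zip StreamUtils API):

```cpp
#include <cstddef>
#include <cstring>

using HRESULT = long;
constexpr HRESULT S_OK_ = 0, E_FAIL_ = 0x80004005L; // local stand-ins for the COM codes

struct ByteSource { const unsigned char* p; size_t left; }; // toy stream

// ReadStream-style: best effort, the caller must inspect *size afterwards.
HRESULT ReadSome( ByteSource& s, void* buf, size_t* size )
{
    size_t n = *size < s.left ? *size : s.left;
    std::memcpy( buf, s.p, n );
    s.p += n; s.left -= n;
    *size = n;
    return S_OK_;
}

// ReadStream_FAIL-style: exactly `size` bytes or a hard failure.
HRESULT ReadExactly( ByteSource& s, void* buf, size_t size )
{
    size_t n = size;
    HRESULT hr = ReadSome( s, buf, &n );
    if ( hr != S_OK_ )
        return hr;
    return n == size ? S_OK_ : E_FAIL_; // a short read is no longer silent
}
```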
@@ -440,7 +443,7 @@ void CRC32_Final( CRC32_t& pulCRC )
 void CRC32_ProcessBuffer( CRC32_t& pulCRC, const void* pBuffer, int nBuffer )
 {
     CRC32_t ulCrc = pulCRC;
-    unsigned char* pb = (unsigned char*)pBuffer;
+    auto pb = reinterpret_cast<const unsigned char*>( pBuffer );
     unsigned int nFront;
     int nMain;
 
@@ -449,36 +452,36 @@ void CRC32_ProcessBuffer( CRC32_t& pulCRC, const void* pBuffer, int nBuffer )
     switch ( nBuffer )
     {
     case 7:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 6:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 5:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 4:
-        ulCrc ^= *(CRC32_t*)pb;
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc ^= *reinterpret_cast<const CRC32_t*>( pb );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         pulCRC = ulCrc;
         return;
 
     case 3:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 2:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 1:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 0:
@@ -492,33 +495,33 @@ void CRC32_ProcessBuffer( CRC32_t& pulCRC, const void* pBuffer, int nBuffer )
     // The low-order two bits of pb and nBuffer in total control the
     // upfront work.
     //
-    nFront = ( (uintptr_t)pb ) & 3;
+    nFront = reinterpret_cast<uintptr_t>( pb ) & 3;
     nBuffer -= nFront;
     switch ( nFront )
     {
     case 3:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 2:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
         [[fallthrough]];
 
     case 1:
-        ulCrc = pulCRCTable[*pb++ ^ (unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[*pb++ ^ static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
     }
 
     nMain = nBuffer >> 3;
     while ( nMain-- )
     {
-        ulCrc ^= *(CRC32_t*)pb;
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc ^= *(CRC32_t*)( pb + 4 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
-        ulCrc = pulCRCTable[(unsigned char)ulCrc] ^ ( ulCrc >> 8 );
+        ulCrc ^= *reinterpret_cast<const CRC32_t*>( pb );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc ^= *reinterpret_cast<const CRC32_t*>( pb + 4 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
+        ulCrc = pulCRCTable[static_cast<unsigned char>( ulCrc )] ^ ( ulCrc >> 8 );
 
         pb += 8;
     }
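For anyone verifying the unrolled loops above: every lookup is the same reflected CRC-32 step, `crc = table[(crc ^ byte) & 0xFF] ^ (crc >> 8)`; the switch only burns off the unaligned head/tail, and the main loop does eight steps per iteration (the 32-bit XOR trick additionally assumes a little-endian CRC32_t load). A byte-at-a-time reference with the standard 0xEDB88320 table, which the unrolled version should agree with, assuming pulCRCTable is built the usual way:

```cpp
#include <cstddef>
#include <cstdint>

static uint32_t table[256];

static void buildTable()
{
    for ( uint32_t i = 0; i < 256; ++i )
    {
        uint32_t c = i;
        for ( int k = 0; k < 8; ++k )
            c = ( c & 1 ) ? 0xEDB88320u ^ ( c >> 1 ) : c >> 1;
        table[i] = c;
    }
}

// One plain step per byte; the switch/while above is just this, unrolled.
static uint32_t crc32Update( uint32_t crc, const void* data, size_t len )
{
    auto p = static_cast<const unsigned char*>( data );
    while ( len-- )
        crc = table[( crc ^ *p++ ) & 0xFFu] ^ ( crc >> 8 );
    return crc;
}

int main()
{
    buildTable();
    // Init with ~0 and finalize with ^~0, as CRC32_Init/CRC32_Final presumably do.
    uint32_t crc = crc32Update( 0xFFFFFFFFu, "VPK", 3 ) ^ 0xFFFFFFFFu;
    return crc ? 0 : 1;
}
```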
@@ -526,9 +529,9 @@ void CRC32_ProcessBuffer( CRC32_t& pulCRC, const void* pBuffer, int nBuffer )
     goto JustAfew;
 }
 
-static constexpr const size_t kCacheBlockSize = (1 << 20);
-static constexpr const size_t kCacheSize = (kCacheBlockSize << 2);
-static constexpr const size_t kCacheMask = (kCacheSize - 1);
+static constexpr const size_t kCacheBlockSize = 1 << 20;
+static constexpr const size_t kCacheSize = kCacheBlockSize << 2;
+static constexpr const size_t kCacheMask = kCacheSize - 1;
 
 class CCacheOutStream : public IOutStream, public CMyUnknownImp
 {
@@ -545,7 +548,7 @@ class CCacheOutStream : public IOutStream, public CMyUnknownImp
     HRESULT MyWrite( size_t size );
     HRESULT MyWriteBlock()
     {
-        return MyWrite( kCacheBlockSize - ( (size_t)_cachedPos & ( kCacheBlockSize - 1 ) ) );
+        return MyWrite( kCacheBlockSize - ( static_cast<size_t>( _cachedPos ) & ( kCacheBlockSize - 1 ) ) );
     }
 public:
     CCacheOutStream() : _cache( nullptr ) {}
@@ -564,7 +567,7 @@ class CCacheOutStream : public IOutStream, public CMyUnknownImp
 bool CCacheOutStream::Allocate()
 {
     if ( !_cache )
-        _cache = ( Byte* )::MidAlloc( kCacheSize );
+        _cache = static_cast<Byte*>( ::MidAlloc( kCacheSize ) );
     return _cache != nullptr;
 }
 
@@ -598,7 +601,7 @@ HRESULT CCacheOutStream::MyWrite( size_t size )
             return E_FAIL;
         RINOK( _stream->Seek( _cachedPos, STREAM_SEEK_SET, &_phyPos ) );
     }
-    size_t pos = (size_t)_cachedPos & kCacheMask;
+    size_t pos = static_cast<size_t>( _cachedPos ) & kCacheMask;
    size_t curSize = MyMin( kCacheSize - pos, _cachedSize );
    curSize = MyMin( curSize, size );
    RINOK( WriteStream( _seqStream, _cache + pos, curSize ) );
@@ -669,10 +672,10 @@ STDMETHODIMP CCacheOutStream::Write( const void* data, UInt32 size, UInt32* proc
     for ( ;;)
     {
         UInt64 cachedEnd = _cachedPos + _cachedSize;
-        size_t endPos = (size_t)cachedEnd & kCacheMask;
+        size_t endPos = static_cast<size_t>( cachedEnd ) & kCacheMask;
         size_t curSize = kCacheSize - endPos;
         if ( curSize > _virtPos - cachedEnd )
-            curSize = (size_t)( _virtPos - cachedEnd );
+            curSize = static_cast<size_t>( _virtPos - cachedEnd );
         if ( curSize == 0 )
             break;
         while ( curSize > ( kCacheSize - _cachedSize ) )
@@ -687,11 +690,11 @@ STDMETHODIMP CCacheOutStream::Write( const void* data, UInt32 size, UInt32* proc
     if ( _cachedSize == 0 )
         _cachedPos = _virtPos;
-    size_t pos = (size_t)_virtPos & kCacheMask;
-    size = (UInt32)MyMin( (size_t)size, kCacheSize - pos );
+    size_t pos = static_cast<size_t>( _virtPos ) & kCacheMask;
+    size = static_cast<UInt32>( MyMin( static_cast<size_t>( size ), kCacheSize - pos ) );
     UInt64 cachedEnd = _cachedPos + _cachedSize;
     if ( _virtPos != cachedEnd ) // _virtPos < cachedEnd
-        size = (UInt32)MyMin( (size_t)size, (size_t)( cachedEnd - _virtPos ) );
+        size = static_cast<UInt32>( MyMin( static_cast<size_t>( size ), static_cast<size_t>( cachedEnd - _virtPos ) ) );
     else
     {
         // _virtPos == cachedEnd
@@ -699,9 +702,9 @@ STDMETHODIMP CCacheOutStream::Write( const void* data, UInt32 size, UInt32* proc
         {
             RINOK( MyWriteBlock() );
         }
-        size_t startPos = (size_t)_cachedPos & kCacheMask;
+        size_t startPos = static_cast<size_t>( _cachedPos & kCacheMask );
         if ( startPos > pos )
-            size = (UInt32)MyMin( (size_t)size, (size_t)( startPos - pos ) );
+            size = static_cast<UInt32>( MyMin( static_cast<size_t>( size ), static_cast<size_t>( startPos - pos ) ) );
         _cachedSize += size;
     }
     memcpy( _cache + pos, data, size );
@@ -746,7 +749,7 @@ STDMETHODIMP CCacheOutStream::SetSize( UInt64 newSize )
         _cachedPos = newSize;
     }
     if ( newSize < _cachedPos + _cachedSize )
-        _cachedSize = (size_t)( newSize - _cachedPos );
+        _cachedSize = static_cast<size_t>( newSize - _cachedPos );
     return S_OK;
 }
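The cache geometry above is four 1 MiB blocks addressed as one 4 MiB ring: because kCacheSize is a power of two, `pos & kCacheMask` is a cheap `pos % kCacheSize`, and MyWriteBlock flushes only up to the next block boundary. A standalone check of that arithmetic using the same constants:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

static constexpr size_t kCacheBlockSize = 1 << 20;         // one 1 MiB block
static constexpr size_t kCacheSize = kCacheBlockSize << 2; // 4 MiB ring (four blocks)
static constexpr size_t kCacheMask = kCacheSize - 1;       // valid only for power-of-two sizes

int main()
{
    uint64_t cachedPos = ( 5ull << 20 ) + 123; // absolute stream position: 5 MiB + 123

    // Where that position lands inside the ring buffer:
    size_t slot = static_cast<size_t>( cachedPos ) & kCacheMask;
    assert( slot == ( 1u << 20 ) + 123 ); // wrapped into the 4 MiB window

    // How much MyWriteBlock would flush to reach the next 1 MiB boundary:
    size_t toFlush = kCacheBlockSize - ( static_cast<size_t>( cachedPos ) & ( kCacheBlockSize - 1 ) );
    assert( toFlush == kCacheBlockSize - 123 );
    return 0;
}
```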
@@ -767,17 +770,36 @@ class VpkWriter
     HRESULT addItem( AString internalPath, UInt32 size, IInStream* stream )
     {
+        using namespace std::string_literals;
         internalPath.MakeLower_Ascii();
         const std::string_view path{ internalPath.Ptr(), internalPath.Len() };
-        for ( size_t i = 0; i < ARRAYSIZE( bannedExts ); ++i )
+        for ( size_t i = 0; i < std::size( bannedExts ); ++i )
             if ( path.ends_with( bannedExts[i] ) )
                 return E_FAIL;
-
-        const auto spl = path.rfind( '/' );
-        auto& dir = resolvePath( root, spl == std::string::npos ? std::string_view{} : path.substr( 0, spl ), {} );
-        const auto& name = path.substr( spl + 1 );
-
-        dir.files.emplace( name, Dir::File{ size, 0, stream } );
+        const auto extOffset = path.rfind( '.' );
+        const auto nameOffset = path.rfind( '/', extOffset );
+        const auto name = nameOffset != std::string_view::npos ? std::string{ path.substr( nameOffset + 1, extOffset - nameOffset - 1 ) } : std::string{ path.substr( 0, extOffset ) };
+        m_exts.try_emplace( extOffset != std::string_view::npos ? std::string{ path.substr(extOffset + 1) } : " "s ).
+            first->second.try_emplace( nameOffset != std::string_view::npos ? std::string{ path.substr(0, nameOffset) } : " "s ).
+            first->second.try_emplace( name.empty() ? " "s : std::move( name ), size, 0, stream );
+        if ( m_needFixup != 2 )
+        {
+            if ( const auto firstSep = path.find( '/' ); firstSep != std::string_view::npos )
+            {
+                const std::string_view d = path.substr( 0, firstSep );
+                size_t i = 0;
+                for ( ; i < std::size( standardDirs ); ++i )
+                {
+                    if ( standardDirs[i] == d )
+                        break;
+                }
+                if ( !m_lastDir.empty() && m_lastDir != d && i == std::size( standardDirs ) && m_needFixup == 1 )
+                    m_needFixup = 2;
+                else if ( i == std::size( standardDirs ) )
+                    m_needFixup = 1;
+                m_lastDir = d;
+            }
+        }
         return S_OK;
     }
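The rewritten addItem flattens the old recursive directory tree into a three-level hash map, extension → directory → file name, built in one pass with chained try_emplace calls: try_emplace constructs the nested map only when the key is new, and its `.first` iterator lets the chain descend a level either way. A minimal sketch of the same pattern, with std::unordered_map standing in for robin_hood::unordered_node_map and a placeholder File:

```cpp
#include <string>
#include <unordered_map>

struct File { unsigned size = 0; unsigned short pak = 0; }; // placeholder payload

using Files = std::unordered_map<std::string, File>;  // file name -> File
using Dirs  = std::unordered_map<std::string, Files>; // directory -> files
using Exts  = std::unordered_map<std::string, Dirs>;  // extension -> directories

int main()
{
    Exts exts;
    // Each try_emplace returns {iterator, inserted}; ->second walks one level
    // down whether or not the key already existed.
    exts.try_emplace( "vmt" )
        .first->second.try_emplace( "materials/brick" )
        .first->second.try_emplace( "brickwall001a", File{ 1024, 0 } );
    return exts["vmt"]["materials/brick"].count( "brickwall001a" ) == 1 ? 0 : 1;
}
```

The diff's innermost call forwards ( size, 0, stream ) as constructor arguments instead of a ready-made File, but it is the same try_emplace mechanism.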
@@ -796,60 +818,58 @@ class VpkWriter
             return E_OUTOFMEMORY;
         RINOK( stream->Init( outStream, stream_ ) );
 
-        if ( root.files.empty() && root.folders.size() == 1 )
+        if ( m_needFixup == 1 )
         {
-            auto& realRoot = root.folders.modify_container().at( 0 );
-            for ( size_t i = 0; i < ARRAYSIZE( standardDirs ); ++i )
+            for ( auto &ext : m_exts )
             {
-                if ( realRoot.first == standardDirs[i] )
-                    goto dont;
+                robin_hood::unordered_node_map<std::string, robin_hood::unordered_node_map<std::string, File>> tmp;
+                for ( auto& dir : ext.second )
+                    tmp.emplace( dir.first.substr( m_lastDir.size() + 1 ), std::move( dir.second ) );
+                ext.second = std::move( tmp );
             }
-
-            chobo::flat_map<std::string, Dir> rootFolders = std::move( realRoot.second.folders );
-            chobo::flat_map<std::string, Dir::File> rootFiles = std::move( realRoot.second.files );
-
-            root.folders.clear();
-
-            root.folders = std::move( rootFolders );
-            root.files = std::move( rootFiles );
-
-            recurseRemoveRootName( root );
         }
 
-        dont:
-        UInt32 size = 0, treeSize = 0;
-        FilesByExt sorted_files;
-        sortFiles( sorted_files, root, size, treeSize );
-
-        for ( auto& [ext, dirs] : sorted_files )
-            treeSize += static_cast<UInt32>( dirs.size() + 1 ); // add null after each all files in each directory + null after last dir
-        ++treeSize; // null after last ext
+        constexpr const auto vpkMetaSize = 3 * sizeof( Int32 ) + 3 * sizeof( Int16 );
+        UInt32 size = 0, treeSize = 1;
+        for ( auto &ext : m_exts )
+        {
+            treeSize += static_cast<UInt32>( ext.first.size() + 2 ); // +1 string null, +1 last null terminator
+            for ( auto& dir : ext.second )
+            {
+                treeSize += static_cast<UInt32>( dir.first.size() + 2 ); // +1 string null, +1 last null terminator
+                for ( auto& file : dir.second )
+                {
+                    treeSize += static_cast<UInt32>( file.first.size() + 1 + vpkMetaSize );
+                    size += file.second.size;
+                }
+            }
+        }
 
-        RINOK( writeHeader( stream, size, treeSize ) );
+        RINOK( writeHeader( stream, volSize ? 0 : size, treeSize ) );
         UInt16 curPak = volSize > 0 ? 0 : 0x7FFF;
         UInt32 currentOffset = 0;
-        for ( auto& [ext, dirs] : sorted_files )
+        for ( auto& ext : m_exts )
         {
-            RINOK( write( stream, ext ) );
-            for ( auto& [dir, files] : dirs )
+            RINOK( write( stream, ext.first ) );
+            for ( auto& dir : ext.second )
             {
-                RINOK( write( stream, dir ) );
-                for ( auto& [file, data] : files )
+                RINOK( write( stream, dir.first ) );
+                for ( auto& file : dir.second )
                 {
                     UInt64 pos;
                     RINOK( stream->Seek( 0, STREAM_SEEK_CUR, &pos ) );
                     progress->InSize = progress->OutSize = pos;
                     RINOK( progress->SetCur() );
-                    RINOK( write( stream, file ) );
-                    RINOK( write( stream, calcCrc( *data ) ) );
+                    RINOK( write( stream, file.first ) );
+                    RINOK( write( stream, calcCrc( file.second ) ) );
                     RINOK( write( stream, 0 ) );
                     RINOK( write( stream, curPak ) );
                     RINOK( write( stream, currentOffset ) );
-                    RINOK( write( stream, data->size ) );
+                    RINOK( write( stream, file.second.size ) );
                     RINOK( write( stream, 0xFFFF ) );
 
-                    currentOffset += data->size;
-                    data->pak = curPak;
+                    currentOffset += file.second.size;
+                    file.second.pak = curPak;
 
                     if ( volSize && currentOffset > volSize )
                     {
@@ -867,18 +887,18 @@ class VpkWriter
 
         if ( !volSize )
         {
-            for ( auto& [ext, dirs] : sorted_files )
+            for ( auto& ext : m_exts )
             {
-                for ( auto& [dir, files] : dirs )
+                for ( auto& dir : ext.second )
                 {
-                    for ( auto& [file, data] : files )
+                    for ( auto& file : dir.second )
                     {
                         UInt64 pos;
                         RINOK( stream->Seek( 0, STREAM_SEEK_CUR, &pos ) );
                         progress->InSize = progress->OutSize = pos;
                         RINOK( progress->SetCur() );
-                        RINOK( copyCoder->Code( data->stream, stream, nullptr, nullptr, progress ) );
-                        data->stream.Release();
+                        RINOK( copyCoder->Code( file.second.stream, stream, nullptr, nullptr, progress ) );
+                        file.second.stream.Release();
                     }
                 }
             }
@@ -887,25 +907,25 @@ class VpkWriter
         {
             CMyComPtr<ISequentialOutStream> pakStream;
             UInt16 lastPak = 0xFFFF;
-            UInt32 written = 0;
-            for ( auto& [ext, dirs] : sorted_files )
+            UInt64 written = 0;
+            for ( auto& ext : m_exts )
             {
-                for ( auto& [dir, files] : dirs )
+                for ( auto& dir : ext.second )
                 {
-                    for ( auto& [file, data] : files )
+                    for ( auto& file : dir.second )
                     {
-                        if ( lastPak != data->pak )
+                        if ( lastPak != file.second.pak )
                         {
-                            lastPak = data->pak;
+                            lastPak = file.second.pak;
                             pakStream.Release();
                             RINOK( callback->GetVolumeStream( static_cast<UInt32>( lastPak - 1 ), &pakStream ) );
                         }
                         progress->InSize = progress->OutSize = written;
-                        written += data->size;
+                        written += file.second.size;
                         RINOK( progress->SetCur() );
-                        RINOK( copyCoder->Code( data->stream, pakStream, nullptr, nullptr, progress ) );
-                        data->stream.Release();
+                        RINOK( copyCoder->Code( file.second.stream, pakStream, nullptr, nullptr, progress ) );
+                        file.second.stream.Release();
                    }
                }
            }
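A worked instance of the treeSize accounting in writeVpk above, assuming the usual VPK v1/v2 directory layout the writer emits (CRC + preload bytes + archive index + offset + length + 0xFFFF terminator, i.e. 3×4 + 3×2 = 18 bytes per file record); the strings here are made up:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    const char* ext  = "vmt";             // extension entry
    const char* dir  = "materials/brick"; // directory entry
    const char* name = "brickwall001a";   // file entry

    const uint32_t vpkMetaSize = 3 * 4 + 3 * 2; // 18-byte per-file metadata record

    uint32_t treeSize = 1;                                  // empty string closing the extension list
    treeSize += (uint32_t)strlen( ext ) + 2;                // ext + NUL, + NUL closing its directory list
    treeSize += (uint32_t)strlen( dir ) + 2;                // dir + NUL, + NUL closing its file list
    treeSize += (uint32_t)strlen( name ) + 1 + vpkMetaSize; // name + NUL + metadata

    printf( "%u\n", treeSize ); // (3+2) + (15+2) + (13+1+18) + 1 = 55
    return 0;
}
```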
@@ -939,70 +959,14 @@ class VpkWriter
         return stream->Write( &header, sizeof( header ), nullptr );
     }
 
-    struct Dir
+    struct File
     {
-        struct File
-        {
-            UInt32 size = 0;
-            UInt16 pak = 0;
-            CMyComPtr<IInStream> stream;
-        };
-
-        std::string name;
-        chobo::flat_map<std::string, Dir> folders;
-        chobo::flat_map<std::string, File> files;
+        UInt32 size = 0;
+        UInt16 pak = 0;
+        CMyComPtr<IInStream> stream;
     };
 
-    Dir root;
-
-    using SortedFiles = chobo::flat_map<std::string_view, Dir::File*>;
-    using FilesByFolder = chobo::flat_map<std::string_view, SortedFiles>;
-    using FilesByExt = chobo::flat_map<std::string_view, FilesByFolder>;
-
-    static Dir& resolvePath( Dir& root, const std::string_view& path, const std::string_view& name )
-    {
-        if ( path.empty() )
-            return root;
-
-        const auto sep = path.find( '/' );
-        auto res = root.folders.emplace( path.substr( 0, sep ), Dir{} );
-        if ( res.second )
-            res.first->second.name = name.empty() ? res.first->first : std::string( name ) + '/' + res.first->first;
-        return resolvePath( res.first->second, sep == std::string::npos ? std::string_view{} : path.substr( sep + 1 ), res.first->second.name );
-    }
-
-    static void recurseRemoveRootName( Dir& root )
-    {
-        for ( auto& f : root.folders )
-        {
-            f.second.name = f.second.name.substr( f.second.name.find( '/' ) + 1 );
-            recurseRemoveRootName( f.second );
-        }
-    }
-
-    static void sortFiles( FilesByExt& files, Dir& root, UInt32& size, UInt32& treeSize )
-    {
-        constexpr const auto vpkMetaSize = 3 * sizeof( Int32 ) + 3 * sizeof( Int16 );
-
-        using namespace std::string_view_literals;
-        for ( auto& [name, file] : root.files )
-        {
-            const auto sep = name.rfind( '.' );
-            auto r = files.emplace( sep != std::string::npos ? name.substr( sep + 1 ) : " "sv, FilesByFolder{} );
-            auto r2 = r.first->second.emplace( root.name.empty() ? " "sv : root.name, SortedFiles{} );
-            r2.first->second.emplace( name.substr( 0, sep ), &file );
-            size += file.size;
-            if ( r.second ) // if new
-                treeSize += static_cast<UInt32>( r.first->first.size() + 1 ); // extension size
-            if ( r2.second ) // if new
-                treeSize += static_cast<UInt32>( r2.first->first.size() + 1 ); // directory size
-            treeSize += static_cast<UInt32>( ( sep == std::string::npos ? name.size() : sep ) + 1 + vpkMetaSize ); // file name + header
-        }
-
-        for ( auto& folder : root.folders )
-            sortFiles( files, folder.second, size, treeSize );
-    }
-
-    static CRC32_t calcCrc( Dir::File& file )
+    static CRC32_t calcCrc( File& file )
     {
         CRC32_t crc;
         CRC32_Init( crc );
@@ -1020,11 +984,14 @@ class VpkWriter
         return crc;
     }
-};
 
+    robin_hood::unordered_node_map<std::string, robin_hood::unordered_node_map<std::string, robin_hood::unordered_node_map<std::string, File>>> m_exts;
+    signed char m_needFixup = 0;
+    std::string m_lastDir;
+};
 
-static const wchar_t kOsPathSepar = WCHAR_PATH_SEPARATOR;
-static const wchar_t kUnixPathSepar = L'/';
+static constexpr const wchar_t kOsPathSepar = WCHAR_PATH_SEPARATOR;
+static constexpr const wchar_t kUnixPathSepar = L'/';
 
 static void ReplaceSlashes_OsToUnix( UString& name )
 {
@@ -1139,6 +1106,6 @@ REGISTER_ARC_IO(
     "VPK", "vpk", 0, 1,
     k_Signature,
     0,
-    NArcInfoFlags::kMultiSignature | NArcInfoFlags::kUseGlobalOffset | NArcInfoFlags::kPureStartOpen,
+    NArcInfoFlags::kMultiSignature | NArcInfoFlags::kUseGlobalOffset | NArcInfoFlags::kPureStartOpen | NArcInfoFlags::kByExtOnlyOpen,
     IsArc_Vpk, 1 )
\ No newline at end of file
diff --git a/IArchive.h b/IArchive.h
index 1677551..fecd08a 100644
--- a/IArchive.h
+++ b/IArchive.h
@@ -34,6 +34,7 @@ namespace NArcInfoFlags
   const UInt32 kPreArc = 1 << 9; // such archive can be stored before real archive (like SFX stub)
   const UInt32 kSymLinks = 1 << 10; // the handler supports symbolic links
   const UInt32 kHardLinks = 1 << 11; // the handler supports hard links
+  const UInt32 kByExtOnlyOpen = 1 << 12; // call handler only if file extension matches
 }
 
 namespace NArchive
diff --git a/MyString.h b/MyString.h
index 40de52c..861ec77 100644
--- a/MyString.h
+++ b/MyString.h
@@ -217,21 +217,21 @@ bool StringsAreEqualNoCase_Ascii(const wchar_t *s1, const wchar_t *s2) throw();
 
 #define FORBID_STRING_OPS_2(cls, t) \
-  void Find(t) const; \
-  void Find(t, unsigned startIndex) const; \
-  void ReverseFind(t) const; \
-  void InsertAtFront(t); \
-  void RemoveChar(t); \
-  void Replace(t, t); \
+  void Find(t) const = delete; \
+  void Find(t, unsigned startIndex) const = delete; \
+  void ReverseFind(t) const = delete; \
+  void InsertAtFront(t) = delete; \
+  void RemoveChar(t) = delete; \
+  void Replace(t, t) = delete; \
 
 #define FORBID_STRING_OPS(cls, t) \
-  explicit cls(t); \
-  explicit cls(const t *); \
-  cls &operator=(t); \
-  cls &operator=(const t *); \
-  cls &operator+=(t); \
-  cls &operator+=(const t *); \
-  FORBID_STRING_OPS_2(cls, t); \
+  explicit cls(t) = delete; \
+  explicit cls(const t *) = delete; \
+  cls &operator=(t) = delete; \
+  cls &operator=(const t *) = delete; \
+  cls &operator+=(t) = delete; \
+  cls &operator+=(const t *) = delete; \
+  FORBID_STRING_OPS_2(cls, t) \
 
 /*
 cls &operator+(t); \
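The MyString.h change swaps the old trick of declaring-but-never-defining the forbidden overloads for explicit `= delete`, so a bad call now fails at compile time with a readable diagnostic instead of surfacing as a link error (or compiling silently inside members and friends). A tiny illustration, unrelated to the real UString/AString classes:

```cpp
struct OldStyle
{
    void Find( char ) const; // declared, never defined: misuse compiles, fails only at link time
    void Find( int ) const {}
};

struct NewStyle
{
    void Find( char ) const = delete; // misuse rejected at compile time
    void Find( int ) const {}
};

int main()
{
    NewStyle s;
    s.Find( 42 );     // fine: the int overload is untouched
    // s.Find( 'a' ); // error: use of deleted function 'void NewStyle::Find(char) const'
    return 0;
}
```

Dropping the `;` after FORBID_STRING_OPS_2(cls, t) in the second macro also avoids expanding a stray empty declaration into the class body.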
diff --git a/VpkHandler.vcxproj b/VpkHandler.vcxproj
index 3c339a8..25b010f 100644
--- a/VpkHandler.vcxproj
+++ b/VpkHandler.vcxproj
@@ -29,26 +29,26 @@
     DynamicLibrary
     true
-    v142
+    v143
     Unicode
 
     DynamicLibrary
     false
-    v142
+    v143
     true
     Unicode
 
     DynamicLibrary
     true
-    v142
+    v143
     Unicode
 
     DynamicLibrary
     false
-    v142
+    v143
     true
     Unicode
 
@@ -96,6 +96,7 @@
     pch.h
     MultiThreadedDebug
     stdcpplatest
+    /Zc:__cplusplus %(AdditionalOptions)
 
     Windows
@@ -137,6 +138,7 @@
     pch.h
     MultiThreadedDebug
     stdcpplatest
+    /Zc:__cplusplus %(AdditionalOptions)
 
     Windows
@@ -158,6 +160,7 @@
     MultiThreaded
     stdcpplatest
     None
+    /Zc:__cplusplus %(AdditionalOptions)
 
     Windows
@@ -189,7 +192,6 @@
-
@@ -212,6 +214,7 @@
+
diff --git a/VpkHandler.vcxproj.filters b/VpkHandler.vcxproj.filters
index 44d6ae6..cd23689 100644
--- a/VpkHandler.vcxproj.filters
+++ b/VpkHandler.vcxproj.filters
@@ -74,9 +74,6 @@
     Header Files
-
-    Header Files
-
     Header Files
@@ -140,6 +137,9 @@
     Header Files
+
+    Header Files
+
     Header Files
diff --git a/flat_map.hpp b/flat_map.hpp
deleted file mode 100644
index e5f9afe..0000000
--- a/flat_map.hpp
+++ /dev/null
@@ -1,757 +0,0 @@
-// chobo-flat-map v1.02
-//
-// std::map-like class with an underlying vector
-//
-// MIT License:
-// Copyright(c) 2016-2020 Chobolabs Inc.
-//
-// Permission is hereby granted, free of charge, to any person obtaining
-// a copy of this software and associated documentation files(the
-// "Software"), to deal in the Software without restriction, including
-// without limitation the rights to use, copy, modify, merge, publish,
-// distribute, sublicense, and / or sell copies of the Software, and to
-// permit persons to whom the Software is furnished to do so, subject to
-// the following conditions :
-//
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-//
-//
-// VERSION HISTORY
-//
-//  1.02 (2020-10-14) Added forgotten forward of arguments of emplace
-//  1.01 (2016-09-27) Fix for keys with no operator==. Clean up of assignment.
-//                    Added swap method.
-//  1.00 (2016-09-23) First public release
-//
-//
-// DOCUMENTATION
-//
-// Simply include this file wherever you need.
-// It defines the class chobo::flat_map, which is an almsot drop-in replacement
-// of std::map. Flat map has an optional underlying container which by default
-// is std::vector. Thus the items in the map are in a continuous block of
-// memory. Thus iterating over the map is cache friendly, at the cost of
-// O(n) for insert and erase.
-//
-// The elements inside (like in std::map) are kept in an order sorted by key.
-// Getting a value by key is O(log2 n) -// -// It generally performs much faster than std::map for smaller sets of elements -// -// The difference with std::map, which makes flat_map an not-exactly-drop-in -// replacement is the last template argument: -// * std::map has -// * chobo::flat_map has -// The container must be an std::vector compatible type (chobo::static_vector -// and chobo::vector_ptr are, for example, viable). The container value type -// must be std::pair. -// -// Changing the allocator. -// -// If you want to change the allocator of flat map, you'll have to provide a -// container with the appriate one. Example: -// -// chobo::flat_map< -// string, -// int, -// less, -// std::vector, MyAllocator> -// > mymap -// -// -// Configuration -// -// chobo::flat_map has two configurable settings: -// -// 1. Throw -// Whether to throw exceptions: when `at` is called with a non-existent key. -// By default, like std::map, it throws an std::out_of_range exception. If you define -// CHOBO_FLAT_MAP_NO_THROW before including this header, the exception will -// be substituted by an assertion. -// -// 2. const char* overloads -// By default chobo::flat_map provides overloads for the access methods -// (at, operator[], find, lower_bound, count) for const char* for cases when -// std::string is the key, so that no allocations happen when accessing with -// a C-string of a string literal. -// However if const char* or any other class with implicit conversion from -// const char* is the key, they won't compile. -// If you plan on using flat_map with such keys, you'll need to define -// CHOBO_FLAT_MAP_NO_CONST_CHAR_OVERLOADS before including the header -// -// -// TESTS -// -// The tests are included in the header file and use doctest (https://github.com/onqtam/doctest). -// To run them, define CHOBO_FLAT_MAP_TEST_WITH_DOCTEST before including -// the header in a file which has doctest.h already included. 
-// -// Additionally if chobo::static_vector is also available you may define -// CHOBO_FLAT_MAP_TEST_STATIC_VECTOR_WITH_DOCTEST to test flat_map with an -// unrelying static_vector -// -// Additionally if chobo::vector_ptr is also available you may define -// CHOBO_FLAT_MAP_TEST_VECTOR_PTR_WITH_DOCTEST to test flat_map with an -// unrelying vector_ptr -// -#pragma once - -#include -#include -#include - -#if !defined(CHOBO_FLAT_MAP_NO_CONST_CHAR_OVERLOADS) -#include -#endif - -#if !defined(CHOBO_FLAT_MAP_NO_THROW) -# include -# define _CHOBO_THROW_FLAT_MAP_OUT_OF_RANGE() throw std::out_of_range("chobo::flat_map out of range") -#else -# include -# define _CHOBO_THROW_FLAT_MAP_OUT_OF_RANGE() assert(false && "chobo::flat_map out of range") -#endif - -namespace chobo -{ - -template , typename Container = std::vector>> -class flat_map -{ -public: - typedef Key key_type; - typedef T mapped_type; - typedef std::pair value_type; - typedef Container container_type; - typedef Compare key_compare; - typedef value_type& reference; - typedef const value_type& const_reference; - typedef typename container_type::allocator_type allocator_type; - typedef typename std::allocator_traits::pointer pointer; - typedef typename std::allocator_traits::pointer const_pointer; - typedef typename container_type::iterator iterator; - typedef typename container_type::const_iterator const_iterator; - typedef typename container_type::reverse_iterator reverse_iterator; - typedef typename container_type::const_reverse_iterator const_reverse_iterator; - typedef typename container_type::difference_type difference_type; - typedef typename container_type::size_type size_type; - - flat_map() - {} - - explicit flat_map(const key_compare& comp, const allocator_type& alloc = allocator_type()) - : m_cmp(comp) - , m_container(alloc) - {} - - flat_map(const flat_map& x) = default; - flat_map(flat_map&& x) = default; - - flat_map(std::initializer_list ilist) : m_cmp(Compare()) - { - m_container.reserve(ilist.size()); - for (auto&& il : ilist) - emplace(il); - } - - flat_map& operator=(const flat_map& x) - { - m_cmp = x.m_cmp; - m_container = x.m_container; - return *this; - } - flat_map& operator=(flat_map&& x) noexcept - { - m_cmp = std::move(x.m_cmp); - m_container = std::move(x.m_container); - return *this; - } - - iterator begin() noexcept { return m_container.begin(); } - const_iterator begin() const noexcept { return m_container.begin(); } - iterator end() noexcept { return m_container.end(); } - const_iterator end() const noexcept { return m_container.end(); } - reverse_iterator rbegin() noexcept { return m_container.rbegin(); } - const_reverse_iterator rbegin() const noexcept { return m_container.rbegin(); } - reverse_iterator rend() noexcept { return m_container.rend(); } - const_reverse_iterator rend() const noexcept { return m_container.rend(); } - const_iterator cbegin() const noexcept { return m_container.cbegin(); } - const_iterator cend() const noexcept { return m_container.cend(); } - - bool empty() const noexcept { return m_container.empty(); } - size_type size() const noexcept { return m_container.size(); } - size_type max_size() const noexcept { return m_container.max_size(); } - - void reserve(size_type count) { return m_container.reserve(count); } - size_type capacity() const noexcept { return m_container.capacity(); } - - void clear() noexcept { m_container.clear(); } - - iterator lower_bound(const key_type& k) - { - return std::lower_bound(m_container.begin(), m_container.end(), k, m_cmp); - } - - const_iterator 
lower_bound(const key_type& k) const - { - return std::lower_bound(m_container.begin(), m_container.end(), k, m_cmp); - } - - iterator find(const key_type& k) - { - auto i = lower_bound(k); - if (i != end() && !m_cmp(k, *i)) - return i; - - return end(); - } - - const_iterator find(const key_type& k) const - { - auto i = lower_bound(k); - if (i != end() && !m_cmp(k, *i)) - return i; - - return end(); - } - - size_t count(const key_type& k) const - { - return find(k) == end() ? 0 : 1; - } - - template - std::pair insert(P&& val) - { - auto i = lower_bound(val.first); - if (i != end() && !m_cmp(val.first, *i)) - { - return { i, false }; - } - - return{ m_container.emplace(i, std::forward

(val)), true }; - } - - std::pair insert(const value_type& val) - { - auto i = lower_bound(val.first); - if (i != end() && !m_cmp(val.first, *i)) - { - return { i, false }; - } - - return{ m_container.emplace(i, val), true }; - } - - template - std::pair emplace(Args&&... args) - { - value_type val(std::forward(args)...); - return insert(std::move(val)); - } - - iterator erase(const_iterator pos) - { - return m_container.erase(pos); - } - - size_type erase(const key_type& k) - { - auto i = find(k); - if (i == end()) - { - return 0; - } - - erase(i); - return 1; - } - - mapped_type& operator[](const key_type& k) - { - auto i = lower_bound(k); - if (i != end() && !m_cmp(k, *i)) - { - return i->second; - } - - i = m_container.emplace(i, k, mapped_type()); - return i->second; - } - - mapped_type& operator[](key_type&& k) - { - auto i = lower_bound(k); - if (i != end() && !m_cmp(k, *i)) - { - return i->second; - } - - i = m_container.emplace(i, std::forward(k), mapped_type()); - return i->second; - } - - mapped_type& at(const key_type& k) - { - auto i = lower_bound(k); - if (i == end() || m_cmp(*i, k)) - { - _CHOBO_THROW_FLAT_MAP_OUT_OF_RANGE(); - } - - return i->second; - } - - const mapped_type& at(const key_type& k) const - { - auto i = lower_bound(k); - if (i == end() || m_cmp(*i, k)) - { - _CHOBO_THROW_FLAT_MAP_OUT_OF_RANGE(); - } - - return i->second; - } - - void swap(flat_map& x) - { - std::swap(m_cmp, x.m_cmp); - m_container.swap(x.m_container); - } - - const container_type& container() const noexcept - { - return m_container; - } - - // DANGER! If you're not careful with this function, you may irreversably break the map - container_type& modify_container() noexcept - { - return m_container; - } - -#if !defined(CHOBO_FLAT_MAP_NO_CONST_CHAR_OVERLOADS) - /////////////////////////////////////////////////////////////////////////////////// - // const char* overloads for maps with an std::string key to avoid allocs - iterator lower_bound(const char* k) - { - static_assert(std::is_same::value, "flat_map::lower_bound(const char*) works only for std::strings"); - static_assert(std::is_same, key_compare>::value, "flat_map::lower_bound(const char*) works only for std::string-s, compared with std::less"); - return std::lower_bound(m_container.begin(), m_container.end(), k, [](const value_type& a, const char* b) -> bool - { - return strcmp(a.first.c_str(), b) < 0; - }); - } - - const_iterator lower_bound(const char* k) const - { - static_assert(std::is_same::value, "flat_map::lower_bound(const char*) works only for std::strings"); - static_assert(std::is_same, key_compare>::value, "flat_map::lower_bound(const char*) works only for std::string-s, compared with std::less"); - return std::lower_bound(m_container.begin(), m_container.end(), k, [](const value_type& a, const char* b) -> bool - { - return strcmp(a.first.c_str(), b) < 0; - }); - } - - mapped_type& operator[](const char* k) - { - auto i = lower_bound(k); - if (i != end() && i->first == k) - { - return i->second; - } - - i = m_container.emplace(i, k, mapped_type()); - return i->second; - } - - mapped_type& at(const char* k) - { - auto i = lower_bound(k); - if (i == end() || i->first != k) - { - _CHOBO_THROW_FLAT_MAP_OUT_OF_RANGE(); - } - - return i->second; - } - - const mapped_type& at(const char* k) const - { - auto i = lower_bound(k); - if (i == end() || i->first != k) - { - _CHOBO_THROW_FLAT_MAP_OUT_OF_RANGE(); - } - - return i->second; - } - - iterator find(const char* k) - { - auto i = lower_bound(k); - if (i != end() && i->first == k) 
- return i; - - return end(); - } - - const_iterator find(const char* k) const - { - auto i = lower_bound(k); - if (i != end() && i->first == k) - return i; - - return end(); - } - - size_t count(const char* k) const - { - return find(k) == end() ? 0 : 1; - } - -#endif // !defined(CHOBO_FLAT_MAP_NO_CONST_CHAR_OVERLOADS) - -private: - struct pair_compare - { - pair_compare() = default; - pair_compare(const key_compare& kc) : kcmp(kc) {} - bool operator()(const value_type& a, const key_type& b) const - { - return kcmp(a.first, b); - } - - bool operator()(const key_type& a, const value_type& b) const - { - return kcmp(a, b.first); - } - - key_compare kcmp; - }; - pair_compare m_cmp; - container_type m_container; -}; - -template -bool operator==(const flat_map& a, const flat_map& b) -{ - return a.container() == b.container(); -} - -template -bool operator!=(const flat_map& a, const flat_map& b) -{ - return a.container() != b.container(); -} - -template -bool operator<(const flat_map& a, const flat_map& b) -{ - return a.container() < b.container(); -} - -} - -#if defined(CHOBO_FLAT_MAP_TEST_WITH_DOCTEST) - -#include - -namespace chobo_flat_map_test -{ - -// struct with no operator== -struct int_wrap -{ - int_wrap() = default; - int_wrap(int i) : val(i) {} - int val; - - struct compare - { - bool operator()(const int_wrap& a, const int_wrap& b) const - { - return a.val < b.val; - } - }; -}; - -} - -TEST_CASE("[flat_map] test") -{ - using namespace chobo; - using namespace chobo_flat_map_test; - - flat_map ifmap; - CHECK(ifmap.empty()); - CHECK(ifmap.size() == 0); - CHECK(ifmap.capacity() == 0); - CHECK(ifmap.begin() == ifmap.end()); - - ifmap[1] = 3.2f; - CHECK(ifmap.size() == 1); - - auto ifit = ifmap.begin(); - CHECK(ifit->first == 1); - CHECK(ifit->second == 3.2f); - CHECK(ifmap[1] == 3.2f); - CHECK(ifmap.at(1) == 3.2f); - CHECK(ifmap.count(1) == 1); - CHECK(ifmap.count(5) == 0); - - ++ifit; - CHECK(ifit == ifmap.end()); - - auto res = ifmap.insert(std::make_pair(6, 3.14f)); - CHECK(res.second); - CHECK(res.first == ifmap.begin() + 1); - - res = ifmap.emplace(3, 5.5f); - CHECK(res.second); - CHECK(res.first == ifmap.begin() + 1); - - res = ifmap.emplace(6, 8.f); - CHECK(!res.second); - CHECK(res.first == ifmap.begin() + 2); - - ifmap[2] = 5; - ifmap[52] = 15; - ifmap[12] = 1; - CHECK(ifmap.size() == 6); - - auto cmp = [](const flat_map::value_type& a, const flat_map::value_type& b) -> bool - { - return a.first < b.first; - }; - - CHECK(std::is_sorted(ifmap.begin(), ifmap.end(), cmp)); - - ifmap.erase(12); - CHECK(ifmap.size() == 5); - - CHECK(std::is_sorted(ifmap.begin(), ifmap.end(), cmp)); - - ifit = ifmap.find(12); - CHECK(ifit == ifmap.end()); - - ifit = ifmap.find(6); - CHECK(ifit != ifmap.end()); - ifmap.erase(ifit); - - CHECK(ifmap.size() == 4); - CHECK(std::is_sorted(ifmap.begin(), ifmap.end(), cmp)); - ifit = ifmap.find(6); - CHECK(ifit == ifmap.end()); - - // - - flat_map simap; - - CHECK(simap["123"] == 0); - - CHECK(simap.begin()->first.c_str() == "123"); - - ++simap["asd"]; - - auto siit = simap.find("asd"); - CHECK(siit != simap.end()); - CHECK(siit->second == 1); - CHECK(siit == simap.begin() + 1); - - CHECK(simap.count("bababa") == 0); - CHECK(simap.count("asd") == 1); - - std::string asd = "asd"; - CHECK(simap.at(asd) == simap.at("asd")); - - simap["0The quick brown fox jumps over the lazy dog"] = 555; - CHECK(simap.begin()->first[1] == 'T'); - const void* cstr = simap.begin()->first.c_str(); - - auto simap2 = std::move(simap); - CHECK(simap.empty()); - 
CHECK(simap2.begin()->first.c_str() == cstr); - - simap = std::move(simap2); - CHECK(simap2.empty()); - CHECK(simap.begin()->first.c_str() == cstr); - - CHECK(simap2 != simap); - simap2 = simap; - CHECK(simap2 == simap); - - // no == comparable tests - flat_map iwmap; - iwmap[5] = 1; - iwmap[20] = 15; - iwmap[10] = 5; - - auto iwi = iwmap.emplace(3, 4); - CHECK(iwi.second == true); - CHECK(iwi.first == iwmap.begin()); - - CHECK(iwmap.begin()->first.val == 3); - CHECK(iwmap.begin()->second == 4); - CHECK(iwmap.rbegin()->first.val == 20); - CHECK(iwmap.rbegin()->second == 15); - CHECK(iwmap.at(10) == 5); - - iwi = iwmap.insert(std::pair(11, 6)); - CHECK(iwi.second == true); - CHECK(iwi.first + 2 == iwmap.end()); - - CHECK(iwmap[11] == 6); - - iwi = iwmap.emplace(10, 55); - CHECK(iwi.second == false); - CHECK(iwi.first->second == 5); - - CHECK(iwmap.find(18) == iwmap.end()); - CHECK(iwmap.find(11) != iwmap.end()); - - const auto ciwmap = iwmap; - - CHECK(ciwmap.begin()->first.val == 3); - CHECK(ciwmap.begin()->second == 4); - CHECK(ciwmap.rbegin()->first.val == 20); - CHECK(ciwmap.rbegin()->second == 15); - CHECK(ciwmap.at(10) == 5); - - CHECK(ciwmap.find(18) == ciwmap.end()); - CHECK(ciwmap.find(11) != ciwmap.end()); - - // swap - flat_map m1, m2; - m1.reserve(10); - m1[1] = 2; - m1[2] = 5; - auto m1c = m1.capacity(); - - CHECK(m2.capacity() == 0); - m1.swap(m2); - - CHECK(m2.size() == 2); - CHECK(m2.capacity() == m1c); - CHECK(m1.capacity() == 0); - - // self usurp - m2 = m2; - CHECK(m2.size() == 2); - CHECK(m2.capacity() == m1c); -} - -#if defined(CHOBO_FLAT_MAP_TEST_STATIC_VECTOR_WITH_DOCTEST) - -TEST_CASE("[flat_map] static_vector test") -{ - using namespace chobo; - - flat_map, static_vector, 10>> smap; - CHECK(smap.empty()); - CHECK(smap.size() == 0); - CHECK(smap.capacity() == 10); - CHECK(smap.begin() == smap.end()); - - smap[1] = 3; - CHECK(smap.size() == 1); - - auto ifit = smap.begin(); - CHECK(ifit->first == 1); - CHECK(ifit->second == 3); - CHECK(smap[1] == 3); - CHECK(smap.at(1) == 3); - CHECK(smap.count(1) == 1); - CHECK(smap.count(5) == 0); - - ++ifit; - CHECK(ifit == smap.end()); - - auto res = smap.insert(std::make_pair(6, 3)); - CHECK(res.second); - CHECK(res.first == smap.begin() + 1); - - res = smap.emplace(3, 5); - CHECK(res.second); - CHECK(res.first == smap.begin() + 1); - - res = smap.emplace(6, 8); - CHECK(!res.second); - CHECK(res.first == smap.begin() + 2); - - smap[2] = 5; - smap[52] = 15; - smap[12] = 1; - CHECK(smap.size() == 6); - - auto cmp = [](const flat_map::value_type& a, const flat_map::value_type& b) -> bool - { - return a.first < b.first; - }; - - CHECK(std::is_sorted(smap.begin(), smap.end(), cmp)); - - smap.erase(12); - CHECK(smap.size() == 5); - - CHECK(std::is_sorted(smap.begin(), smap.end(), cmp)); - - ifit = smap.find(12); - CHECK(ifit == smap.end()); - - ifit = smap.find(6); - CHECK(ifit != smap.end()); - smap.erase(ifit); - - CHECK(smap.size() == 4); - CHECK(std::is_sorted(smap.begin(), smap.end(), cmp)); - ifit = smap.find(6); - CHECK(ifit == smap.end()); -} - -#endif - -#if defined(CHOBO_FLAT_MAP_TEST_VECTOR_PTR_WITH_DOCTEST) - -TEST_CASE("[flat_map] vector_ptr test") -{ - using namespace chobo; - flat_map, vector_ptr>> smap; - - std::vector> vec; - smap.modify_container().reset(&vec); - - smap[1] = '1'; - smap[3] = '3'; - - CHECK(smap.at(3) == '3'); - - auto smap2 = smap; - CHECK(smap2.size() == 2); - CHECK(smap2[1] == '1'); - CHECK(smap2.at(3) == '3'); - - smap2[0] = '0'; - - CHECK(smap.size() == 3); - CHECK(smap[0] == '0'); - - 
smap.clear();
-
-    CHECK(smap2.empty());
-}
-
-#endif
-
-
-#endif
-
diff --git a/libvpk++.h b/libvpk++.h
index b1ea712..ac9cef6 100644
--- a/libvpk++.h
+++ b/libvpk++.h
@@ -1,10 +1,11 @@
 #pragma once
 
+#include
 #include
 #include
-#include
+#include
 
-#include "flat_map.hpp"
+#include "tuple.hpp"
 
 #include "IStream.h"
 #include "StreamUtils.h"
@@ -79,7 +80,7 @@ namespace libvpk
         uint32_t crc;
     };
 
-    using VPKFileMap = chobo::flat_map<std::string, VPKFileDesc>;
+    using VPKFileList = std::vector>;
 
     class VPKSet : private helpers::NonCopyable
     {
@@ -107,9 +108,19 @@
             else
                 return S_FALSE;
 
+            RINOK( stream->Seek( pos, STREAM_SEEK_SET, nullptr ) );
+            CMyComPtr<CLimitedCachedInStream> bufStream = new CLimitedCachedInStream();
+            const auto size = m_header.treeSize + ( initialHeader.version == 2 ? sizeof( meta::VPKHeader2 ) : sizeof( meta::VPKHeader1 ) );
+            bufStream->Buffer.Alloc( size );
+            RINOK( ReadStream_FALSE( stream, bufStream->Buffer, size ) );
+            bufStream->SetStream( stream, 0 );
+            bufStream->SetCache( size, 0 );
+            bufStream->InitAndSeek( 0, size );
+            RINOK( bufStream->Seek( initialHeader.version == 2 ? sizeof( meta::VPKHeader2 ) : sizeof( meta::VPKHeader1 ), STREAM_SEEK_SET, nullptr ) );
+
             const UInt64 total = m_header.treeSize;
             callback->SetTotal( nullptr, &total );
-            return parseDirectory( stream, callback );
+            return parseDirectory( bufStream, callback );
 
@@ -117,7 +128,7 @@
         const meta::VPKHeader& header() const
         {
             return m_header;
         }
 
-        const VPKFileMap& files() const
+        const VPKFileList& files() const
         {
             return m_files;
         }
@@ -166,7 +177,7 @@
                     if ( extension != " "sv )
                         fullPath += '.' + extension;
 
-                    RINOK( parseFile( stream, fullPath ) );
+                    RINOK( parseFile( stream, std::move( fullPath ) ) );
                 }
             }
         }
@@ -174,7 +185,7 @@
             return S_OK;
         }
 
-        HRESULT parseFile( IInStream* stream, const std::string& vpkFilePath )
+        HRESULT parseFile( IInStream* stream, std::string vpkFilePath )
         {
             uint32_t crc = helpers::read<uint32_t>( stream );
             uint16_t preloadBytes = helpers::read<uint16_t>( stream );
@@ -189,6 +200,7 @@
             UInt64 pos;
             RINOK( stream->Seek( 0, STREAM_SEEK_CUR, &pos ) );
 
+            VPKFileDesc desc{ archiveIndex, preloadBytes, static_cast<uint32_t>( pos ), offset, length, crc };
             // Skip over the preload section
@@ -198,11 +210,11 @@
                 desc.fileLength += desc.preloadLength;
             }
 
-            m_files.emplace( vpkFilePath, desc );
+            m_files.emplace_back().assign( std::move( vpkFilePath ), desc );
             return S_OK;
         }
 
         meta::VPKHeader m_header;
-        VPKFileMap m_files;
+        VPKFileList m_files;
     };
 }
\ No newline at end of file
diff --git a/robin_hood.h b/robin_hood.h
new file mode 100644
index 0000000..93bf5fc
--- /dev/null
+++ b/robin_hood.h
@@ -0,0 +1,2531 @@
+//                 ______  _____                 ______                _________
+//  ______________ ___  /_ ___(_)_______         ___  /_ ______ ______ ______  /
+//  __  ___/_  __ \__  __ \__  / __  __ \        __  __ \_  __ \_  __ \_  __  /
+//  _  /    / /_/ /_  /_/ /_  /  _  / / /        _  / / // /_/ // /_/ // /_/ /
+//  /_/     \____/ /_.___/ /_/   /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/
+//                                      _/_____/
+//
+// Fast & memory efficient hashtable based on robin hood hashing for C++11/14/17/20
+// https://github.com/martinus/robin-hood-hashing
+//
+// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2021 Martin Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROBIN_HOOD_H_INCLUDED +#define ROBIN_HOOD_H_INCLUDED + +// see https://semver.org/ +#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes +#define ROBIN_HOOD_VERSION_MINOR 11 // for adding functionality in a backwards-compatible manner +#define ROBIN_HOOD_VERSION_PATCH 3 // for backwards-compatible bug fixes + +#include +#include +#include +#include +#include +#include // only to support hash of smart pointers +#include +#include +#include +#include +#if __cplusplus >= 201703L +# include +#endif + +// #define ROBIN_HOOD_LOG_ENABLED +#ifdef ROBIN_HOOD_LOG_ENABLED +# include +# define ROBIN_HOOD_LOG(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ << std::endl; +#else +# define ROBIN_HOOD_LOG(x) +#endif + +// #define ROBIN_HOOD_TRACE_ENABLED +#ifdef ROBIN_HOOD_TRACE_ENABLED +# include +# define ROBIN_HOOD_TRACE(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ << std::endl; +#else +# define ROBIN_HOOD_TRACE(x) +#endif + +// #define ROBIN_HOOD_COUNT_ENABLED +#ifdef ROBIN_HOOD_COUNT_ENABLED +# include +# define ROBIN_HOOD_COUNT(x) ++counts().x; +namespace robin_hood { +struct Counts { + uint64_t shiftUp{}; + uint64_t shiftDown{}; +}; +inline std::ostream& operator<<(std::ostream& os, Counts const& c) { + return os << c.shiftUp << " shiftUp" << std::endl << c.shiftDown << " shiftDown" << std::endl; +} + +static Counts& counts() { + static Counts counts{}; + return counts; +} +} // namespace robin_hood +#else +# define ROBIN_HOOD_COUNT(x) +#endif + +// all non-argument macros should use this facility. 
See +// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/ +#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x() + +// mark unused members with this macro +#define ROBIN_HOOD_UNUSED(identifier) + +// bitness +#if SIZE_MAX == UINT32_MAX +# define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32 +#elif SIZE_MAX == UINT64_MAX +# define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64 +#else +# error Unsupported bitness +#endif + +// endianess +#ifdef _MSC_VER +# define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 +# define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 +#else +# define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#endif + +// inline +#ifdef _MSC_VER +# define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline) +#else +# define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline)) +#endif + +// exceptions +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +# define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0 +#else +# define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1 +#endif + +// count leading/trailing bits +#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS) +# ifdef _MSC_VER +# if ROBIN_HOOD(BITNESS) == 32 +# define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward +# else +# define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64 +# endif +# include +# pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD)) +# define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + [](size_t mask) noexcept -> int { \ + unsigned long index; \ + return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast(index) \ + : ROBIN_HOOD(BITNESS); \ + }(x) +# else +# if ROBIN_HOOD(BITNESS) == 32 +# define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl +# define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl +# else +# define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll +# define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll +# endif +# define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS)) +# define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS)) +# endif +#endif + +// fallthrough +#ifndef __has_cpp_attribute // For backwards compatibility +# define __has_cpp_attribute(x) 0 +#endif +#if __has_cpp_attribute(clang::fallthrough) +# define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +# define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]] +#elif __has_cpp_attribute(fallthrough) +# define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[fallthrough]] +#else +# define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() +#endif + +// likely/unlikely +#ifdef _MSC_VER +# define ROBIN_HOOD_LIKELY(condition) condition +# define ROBIN_HOOD_UNLIKELY(condition) condition +#else +# define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1) +# define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) +#endif + +// detect if native wchar_t type is availiable in MSVC +#ifdef _MSC_VER +# ifdef _NATIVE_WCHAR_T_DEFINED +# define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +# else +# define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0 +# endif +#else +# define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#endif + +// detect if MSVC supports the pair(std::piecewise_construct_t,...) 
consructor being constexpr +#ifdef _MSC_VER +# if _MSC_VER <= 1900 +# define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1 +# else +# define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +# endif +#else +# define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +# define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +# define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// helpers for C++ versions, see https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +# define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]] +#else +# define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() +#endif + +namespace robin_hood { + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) +# define ROBIN_HOOD_STD std +#else + +// c++11 compatibility layer +namespace ROBIN_HOOD_STD { +template +struct alignment_of + : std::integral_constant::type)> {}; + +template +class integer_sequence { +public: + using value_type = T; + static_assert(std::is_integral::value, "not integral type"); + static constexpr std::size_t size() noexcept { + return sizeof...(Ints); + } +}; +template +using index_sequence = integer_sequence; + +namespace detail_ { +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0 && Begin < End, "unexpected argument (Begin<0 || Begin<=End)"); + + template + struct IntSeqCombiner; + + template + struct IntSeqCombiner, integer_sequence> { + using TResult = integer_sequence; + }; + + using TResult = + typename IntSeqCombiner::TResult, + typename IntSeqImpl::TResult>::TResult; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; +} // namespace detail_ + +template +using make_integer_sequence = typename detail_::IntSeqImpl::TResult; + +template +using make_index_sequence = make_integer_sequence; + +template +using index_sequence_for = make_index_sequence; + +} // namespace ROBIN_HOOD_STD + +#endif + +namespace detail { + +// make sure we static_cast to the correct type for hash_int +#if ROBIN_HOOD(BITNESS) == 64 +using SizeT = uint64_t; +#else +using SizeT = uint32_t; +#endif + +template +T rotr(T x, unsigned k) { + return (x >> k) | (x << (8U * sizeof(T) - k)); +} + +// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned char*'} to +// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target type". Use with +// care! 
+template +inline T reinterpret_cast_no_cast_align_warning(void* ptr) noexcept { + return reinterpret_cast(ptr); +} + +template +inline T reinterpret_cast_no_cast_align_warning(void const* ptr) noexcept { + return reinterpret_cast(ptr); +} + +// make sure this is not inlined as it is slow and dramatically enlarges code, thus making other +// inlinings more difficult. Throws are also generally the slow path. +template +[[noreturn]] ROBIN_HOOD(NOINLINE) +#if ROBIN_HOOD(HAS_EXCEPTIONS) + void doThrow(Args&&... args) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + throw E(std::forward(args)...); +} +#else + void doThrow(Args&&... ROBIN_HOOD_UNUSED(args) /*unused*/) { + abort(); +} +#endif + +template +T* assertNotNull(T* t, Args&&... args) { + if (ROBIN_HOOD_UNLIKELY(nullptr == t)) { + doThrow(std::forward(args)...); + } + return t; +} + +template +inline T unaligned_load(void const* ptr) noexcept { + // using memcpy so we don't get into unaligned load problems. + // compiler should optimize this very well anyways. + T t; + std::memcpy(&t, ptr, sizeof(T)); + return t; +} + +// Allocates bulks of memory for objects of type T. This deallocates the memory in the destructor, +// and keeps a linked list of the allocated memory around. Overhead per allocation is the size of a +// pointer. +template +class BulkPoolAllocator { +public: + BulkPoolAllocator() noexcept = default; + + // does not copy anything, just creates a new allocator. + BulkPoolAllocator(const BulkPoolAllocator& ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept + : mHead(nullptr) + , mListForFree(nullptr) {} + + BulkPoolAllocator(BulkPoolAllocator&& o) noexcept + : mHead(o.mHead) + , mListForFree(o.mListForFree) { + o.mListForFree = nullptr; + o.mHead = nullptr; + } + + BulkPoolAllocator& operator=(BulkPoolAllocator&& o) noexcept { + reset(); + mHead = o.mHead; + mListForFree = o.mListForFree; + o.mListForFree = nullptr; + o.mHead = nullptr; + return *this; + } + + BulkPoolAllocator& + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + operator=(const BulkPoolAllocator& ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept { + // does not do anything + return *this; + } + + ~BulkPoolAllocator() noexcept { + reset(); + } + + // Deallocates all allocated memory. + void reset() noexcept { + while (mListForFree) { + T* tmp = *mListForFree; + ROBIN_HOOD_LOG("std::free") + std::free(mListForFree); + mListForFree = reinterpret_cast_no_cast_align_warning(tmp); + } + mHead = nullptr; + } + + // allocates, but does NOT initialize. Use in-place new constructor, e.g. + // T* obj = pool.allocate(); + // ::new (static_cast(obj)) T(); + T* allocate() { + T* tmp = mHead; + if (!tmp) { + tmp = performAllocation(); + } + + mHead = *reinterpret_cast_no_cast_align_warning(tmp); + return tmp; + } + + // does not actually deallocate but puts it in store. + // make sure you have already called the destructor! e.g. with + // obj->~T(); + // pool.deallocate(obj); + void deallocate(T* obj) noexcept { + *reinterpret_cast_no_cast_align_warning(obj) = mHead; + mHead = obj; + } + + // Adds an already allocated block of memory to the allocator. This allocator is from now on + // responsible for freeing the data (with free()). If the provided data is not large enough to + // make use of, it is immediately freed. Otherwise it is reused and freed in the destructor. 
+ void addOrFree(void* ptr, const size_t numBytes) noexcept { + // calculate number of available elements in ptr + if (numBytes < ALIGNMENT + ALIGNED_SIZE) { + // not enough data for at least one element. Free and return. + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } else { + ROBIN_HOOD_LOG("add to buffer") + add(ptr, numBytes); + } + } + + void swap(BulkPoolAllocator& other) noexcept { + using std::swap; + swap(mHead, other.mHead); + swap(mListForFree, other.mListForFree); + } + +private: + // iterates the list of allocated memory to calculate how many to alloc next. + // Recalculating this each time saves us a size_t member. + // This ignores the fact that memory blocks might have been added manually with addOrFree. In + // practice, this should not matter much. + ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept { + auto tmp = mListForFree; + size_t numAllocs = MinNumAllocs; + + while (numAllocs * 2 <= MaxNumAllocs && tmp) { + auto x = reinterpret_cast(tmp); + tmp = *x; + numAllocs *= 2; + } + + return numAllocs; + } + + // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree(). + void add(void* ptr, const size_t numBytes) noexcept { + const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE; + + auto data = reinterpret_cast(ptr); + + // link free list + auto x = reinterpret_cast(data); + *x = mListForFree; + mListForFree = data; + + // create linked list for newly allocated data + auto* const headT = + reinterpret_cast_no_cast_align_warning(reinterpret_cast(ptr) + ALIGNMENT); + + auto* const head = reinterpret_cast(headT); + + // Visual Studio compiler automatically unrolls this loop, which is pretty cool + for (size_t i = 0; i < numElements; ++i) { + *reinterpret_cast_no_cast_align_warning(head + i * ALIGNED_SIZE) = + head + (i + 1) * ALIGNED_SIZE; + } + + // last one points to 0 + *reinterpret_cast_no_cast_align_warning(head + (numElements - 1) * ALIGNED_SIZE) = + mHead; + mHead = headT; + } + + // Called when no memory is available (mHead == 0). + // Don't inline this slow path. + ROBIN_HOOD(NOINLINE) T* performAllocation() { + size_t const numElementsToAlloc = calcNumElementsToAlloc(); + + // alloc new memory: [prev |T, T, ... T] + size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc; + ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " << ALIGNED_SIZE + << " * " << numElementsToAlloc) + add(assertNotNull(std::malloc(bytes)), bytes); + return mHead; + } + + // enforce byte alignment of the T's +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) + static constexpr size_t ALIGNMENT = + (std::max)(std::alignment_of::value, std::alignment_of::value); +#else + static const size_t ALIGNMENT = + (ROBIN_HOOD_STD::alignment_of::value > ROBIN_HOOD_STD::alignment_of::value) + ? ROBIN_HOOD_STD::alignment_of::value + : +ROBIN_HOOD_STD::alignment_of::value; // the + is for walkarround +#endif + + static constexpr size_t ALIGNED_SIZE = ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT; + + static_assert(MinNumAllocs >= 1, "MinNumAllocs"); + static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs"); + static_assert(ALIGNED_SIZE >= sizeof(T*), "ALIGNED_SIZE"); + static_assert(0 == (ALIGNED_SIZE % sizeof(T*)), "ALIGNED_SIZE mod"); + static_assert(ALIGNMENT >= sizeof(T*), "ALIGNMENT"); + + T* mHead{nullptr}; + T** mListForFree{nullptr}; +}; + +template +struct NodeAllocator; + +// dummy allocator that does nothing +template +struct NodeAllocator { + + // we are not using the data, so just free it. 
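+// A worked example of the block layout built by add() above, assuming
+// sizeof(T) == 24 and alignof(T) == 8 on a 64-bit platform: ALIGNMENT =
+// max(8, 8) = 8 and ALIGNED_SIZE = ((24 - 1) / 8 + 1) * 8 = 24, so
+// performAllocation() with numElementsToAlloc == 4 calls
+// std::malloc(8 + 4 * 24) == 104 bytes, laid out as
+//
+//   [ next-block ptr | slot 0 | slot 1 | slot 2 | slot 3 ]
+//
+// The header pointer chains whole blocks into mListForFree for the eventual
+// std::free(); the first bytes of each slot chain the slots into mHead.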
+ void addOrFree(void* ptr, size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept { + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } +}; + +template +struct NodeAllocator : public BulkPoolAllocator {}; + +// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it either, so I'm making +// my own here. +namespace swappable { +#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17) +using std::swap; +template +struct nothrow { + static const bool value = noexcept(swap(std::declval(), std::declval())); +}; +#else +template +struct nothrow { + static const bool value = std::is_nothrow_swappable::value; +}; +#endif +} // namespace swappable + +} // namespace detail + +struct is_transparent_tag {}; + +// A custom pair implementation is used in the map because std::pair is not is_trivially_copyable, +// which means it would not be allowed to be used in std::memcpy. This struct is copyable, which is +// also tested. +template +struct pair { + using first_type = T1; + using second_type = T2; + + template ::value && + std::is_default_constructible::value>::type> + constexpr pair() noexcept(noexcept(U1()) && noexcept(U2())) + : first() + , second() {} + + // pair constructors are explicit so we don't accidentally call this ctor when we don't have to. + explicit constexpr pair(std::pair const& o) noexcept( + noexcept(T1(std::declval())) && noexcept(T2(std::declval()))) + : first(o.first) + , second(o.second) {} + + // pair constructors are explicit so we don't accidentally call this ctor when we don't have to. + explicit constexpr pair(std::pair&& o) noexcept(noexcept( + T1(std::move(std::declval()))) && noexcept(T2(std::move(std::declval())))) + : first(std::move(o.first)) + , second(std::move(o.second)) {} + + constexpr pair(T1&& a, T2&& b) noexcept(noexcept( + T1(std::move(std::declval()))) && noexcept(T2(std::move(std::declval())))) + : first(std::move(a)) + , second(std::move(b)) {} + + template + constexpr pair(U1&& a, U2&& b) noexcept(noexcept(T1(std::forward( + std::declval()))) && noexcept(T2(std::forward(std::declval())))) + : first(std::forward(a)) + , second(std::forward(b)) {} + + template + // MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize all members" + // if this constructor is constexpr +#if !ROBIN_HOOD(BROKEN_CONSTEXPR) + constexpr +#endif + pair(std::piecewise_construct_t /*unused*/, std::tuple a, + std::tuple + b) noexcept(noexcept(pair(std::declval&>(), + std::declval&>(), + ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()))) + : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()) { + } + + // constructor called from the std::piecewise_construct_t ctor + template + pair(std::tuple& a, std::tuple& b, ROBIN_HOOD_STD::index_sequence /*unused*/, ROBIN_HOOD_STD::index_sequence /*unused*/) noexcept( + noexcept(T1(std::forward(std::get( + std::declval&>()))...)) && noexcept(T2(std:: + forward(std::get( + std::declval&>()))...))) + : first(std::forward(std::get(a))...) + , second(std::forward(std::get(b))...) { + // make visual studio compiler happy about warning about unused a & b. + // Visual studio's pair implementation disables warning 4100. 
+ (void)a; + (void)b; + } + + void swap(pair& o) noexcept((detail::swappable::nothrow::value) && + (detail::swappable::nothrow::value)) { + using std::swap; + swap(first, o.first); + swap(second, o.second); + } + + T1 first; // NOLINT(misc-non-private-member-variables-in-classes) + T2 second; // NOLINT(misc-non-private-member-variables-in-classes) +}; + +template +inline void swap(pair& a, pair& b) noexcept( + noexcept(std::declval&>().swap(std::declval&>()))) { + a.swap(b); +} + +template +inline constexpr bool operator==(pair const& x, pair const& y) { + return (x.first == y.first) && (x.second == y.second); +} +template +inline constexpr bool operator!=(pair const& x, pair const& y) { + return !(x == y); +} +template +inline constexpr bool operator<(pair const& x, pair const& y) noexcept(noexcept( + std::declval() < std::declval()) && noexcept(std::declval() < + std::declval())) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +inline constexpr bool operator>(pair const& x, pair const& y) { + return y < x; +} +template +inline constexpr bool operator<=(pair const& x, pair const& y) { + return !(x > y); +} +template +inline constexpr bool operator>=(pair const& x, pair const& y) { + return !(x < y); +} + +inline size_t hash_bytes(void const* ptr, size_t len) noexcept { + static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + static constexpr uint64_t seed = UINT64_C(0xe17a1465); + static constexpr unsigned int r = 47; + + auto const* const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + size_t const n_blocks = len / 8; + for (size_t i = 0; i < n_blocks; ++i) { + auto k = detail::unaligned_load(data64 + i); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const* const data8 = reinterpret_cast(data64 + n_blocks); + switch (len & 7U) { + case 7: + h ^= static_cast(data8[6]) << 48U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + default: + break; + } + + h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx anyways + // h *= m; + // h ^= h >> r; + return static_cast(h); +} + +inline size_t hash_int(uint64_t x) noexcept { + // tried lots of different hashes, let's stick with murmurhash3. It's simple, fast, well tested, + // and doesn't need any special 128bit operations. + x ^= x >> 33U; + x *= UINT64_C(0xff51afd7ed558ccd); + x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; + return static_cast(x); +} + +// A thin wrapper around std::hash, performing an additional simple mixing step of the result. 
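+// Why the extra mixing step below matters: some standard-library
+// implementations hash an integer to itself, so without mixing, consecutive
+// keys would provide almost no entropy in the upper bits the map uses for
+// bucket selection. A short sketch (the concrete hash values are
+// unspecified, shown only for illustration):
+//
+//   robin_hood::hash<uint64_t> h;
+//   size_t a = h(1); // murmurhash3-style finalizer output, not simply 1
+//   size_t b = h(2); // well mixed, shares no obvious pattern with h(1)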
+template +struct hash : public std::hash { + size_t operator()(T const& obj) const + noexcept(noexcept(std::declval>().operator()(std::declval()))) { + // call base hash + auto result = std::hash::operator()(obj); + // return mixed of that, to be save against identity has + return hash_int(static_cast(result)); + } +}; + +template +struct hash> { + size_t operator()(std::basic_string const& str) const noexcept { + return hash_bytes(str.data(), sizeof(CharT) * str.size()); + } +}; + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +template +struct hash> { + size_t operator()(std::basic_string_view const& sv) const noexcept { + return hash_bytes(sv.data(), sizeof(CharT) * sv.size()); + } +}; +#endif + +template +struct hash { + size_t operator()(T* ptr) const noexcept { + return hash_int(reinterpret_cast(ptr)); + } +}; + +template +struct hash> { + size_t operator()(std::unique_ptr const& ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash> { + size_t operator()(std::shared_ptr const& ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash::value>::type> { + size_t operator()(Enum e) const noexcept { + using Underlying = typename std::underlying_type::type; + return hash{}(static_cast(e)); + } +}; + +#define ROBIN_HOOD_HASH_INT(T) \ + template <> \ + struct hash { \ + size_t operator()(T const& obj) const noexcept { \ + return hash_int(static_cast(obj)); \ + } \ + } + +#if defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif +// see https://en.cppreference.com/w/cpp/utility/hash +ROBIN_HOOD_HASH_INT(bool); +ROBIN_HOOD_HASH_INT(char); +ROBIN_HOOD_HASH_INT(signed char); +ROBIN_HOOD_HASH_INT(unsigned char); +ROBIN_HOOD_HASH_INT(char16_t); +ROBIN_HOOD_HASH_INT(char32_t); +#if ROBIN_HOOD(HAS_NATIVE_WCHART) +ROBIN_HOOD_HASH_INT(wchar_t); +#endif +ROBIN_HOOD_HASH_INT(short); +ROBIN_HOOD_HASH_INT(unsigned short); +ROBIN_HOOD_HASH_INT(int); +ROBIN_HOOD_HASH_INT(unsigned int); +ROBIN_HOOD_HASH_INT(long); +ROBIN_HOOD_HASH_INT(long long); +ROBIN_HOOD_HASH_INT(unsigned long); +ROBIN_HOOD_HASH_INT(unsigned long long); +#if defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic pop +#endif +namespace detail { + +template +struct void_type { + using type = void; +}; + +template +struct has_is_transparent : public std::false_type {}; + +template +struct has_is_transparent::type> + : public std::true_type {}; + +// using wrapper classes for hash and key_equal prevents the diamond problem when the same type +// is used. see https://stackoverflow.com/a/28771920/48181 +template +struct WrapHash : public T { + WrapHash() = default; + explicit WrapHash(T const& o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +template +struct WrapKeyEqual : public T { + WrapKeyEqual() = default; + explicit WrapKeyEqual(T const& o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +// A highly optimized hashmap implementation, using the Robin Hood algorithm. +// +// In most cases, this map should be usable as a drop-in replacement for std::unordered_map, but +// be about 2x faster in most cases and require much less allocations. +// +// This implementation uses the following memory layout: +// +// [Node, Node, ... Node | info, info, ... infoSentinel ] +// +// * Node: either a DataNode that directly has the std::pair as member, +// or a DataNode with a pointer to std::pair. 
Which DataNode representation to use +// depends on how fast the swap() operation is. Heuristically, this is automatically choosen +// based on sizeof(). there are always 2^n Nodes. +// +// * info: Each Node in the map has a corresponding info byte, so there are 2^n info bytes. +// Each byte is initialized to 0, meaning the corresponding Node is empty. Set to 1 means the +// corresponding node contains data. Set to 2 means the corresponding Node is filled, but it +// actually belongs to the previous position and was pushed out because that place is already +// taken. +// +// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at end() without the +// need for a idx variable. +// +// According to STL, order of templates has effect on throughput. That's why I've moved the +// boolean to the front. +// https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/ +template +class Table + : public WrapHash, + public WrapKeyEqual, + detail::NodeAllocator< + typename std::conditional< + std::is_void::value, Key, + robin_hood::pair::type, T>>::type, + 4, 16384, IsFlat> { +public: + static constexpr bool is_flat = IsFlat; + static constexpr bool is_map = !std::is_void::value; + static constexpr bool is_set = !is_map; + static constexpr bool is_transparent = + has_is_transparent::value && has_is_transparent::value; + + using key_type = Key; + using mapped_type = T; + using value_type = typename std::conditional< + is_set, Key, + robin_hood::pair::type, T>>::type; + using size_type = size_t; + using hasher = Hash; + using key_equal = KeyEqual; + using Self = Table; + +private: + static_assert(MaxLoadFactor100 > 10 && MaxLoadFactor100 < 100, + "MaxLoadFactor100 needs to be >10 && < 100"); + + using WHash = WrapHash; + using WKeyEqual = WrapKeyEqual; + + // configuration defaults + + // make sure we have 8 elements, needed to quickly rehash mInfo + static constexpr size_t InitialNumElements = sizeof(uint64_t); + static constexpr uint32_t InitialInfoNumBits = 5; + static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits; + static constexpr size_t InfoMask = InitialInfoInc - 1U; + static constexpr uint8_t InitialInfoHashShift = 0; + using DataPool = detail::NodeAllocator; + + // type needs to be wider than uint8_t. + using InfoType = uint32_t; + + // DataNode //////////////////////////////////////////////////////// + + // Primary template for the data node. We have special implementations for small and big + // objects. For large objects it is assumed that swap() is fairly slow, so we allocate these + // on the heap so swap merely swaps a pointer. + template + class DataNode {}; + + // Small: just allocate on the stack. + template + class DataNode final { + public: + template + explicit DataNode(M& ROBIN_HOOD_UNUSED(map) /*unused*/, Args&&... args) noexcept( + noexcept(value_type(std::forward(args)...))) + : mData(std::forward(args)...) 
{} + + DataNode(M& ROBIN_HOOD_UNUSED(map) /*unused*/, DataNode&& n) noexcept( + std::is_nothrow_move_constructible::value) + : mData(std::move(n.mData)) {} + + // doesn't do anything + void destroy(M& ROBIN_HOOD_UNUSED(map) /*unused*/) noexcept {} + void destroyDoNotDeallocate() noexcept {} + + value_type const* operator->() const noexcept { + return &mData; + } + value_type* operator->() noexcept { + return &mData; + } + + const value_type& operator*() const noexcept { + return mData; + } + + value_type& operator*() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const noexcept { + return mData.second; + } + + void swap(DataNode& o) noexcept( + noexcept(std::declval().swap(std::declval()))) { + mData.swap(o.mData); + } + + private: + value_type mData; + }; + + // big object: allocate on heap. + template + class DataNode { + public: + template + explicit DataNode(M& map, Args&&... args) + : mData(map.allocate()) { + ::new (static_cast(mData)) value_type(std::forward(args)...); + } + + DataNode(M& ROBIN_HOOD_UNUSED(map) /*unused*/, DataNode&& n) noexcept + : mData(std::move(n.mData)) {} + + void destroy(M& map) noexcept { + // don't deallocate, just put it into list of datapool. + mData->~value_type(); + map.deallocate(mData); + } + + void destroyDoNotDeallocate() noexcept { + mData->~value_type(); + } + + value_type const* operator->() const noexcept { + return mData; + } + + value_type* operator->() noexcept { + return mData; + } + + const value_type& operator*() const { + return *mData; + } + + value_type& operator*() { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData->second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const noexcept { + return mData->second; + } + + void swap(DataNode& o) noexcept { + using std::swap; + swap(mData, o.mData); + } + + private: + value_type* mData; + }; + + using Node = DataNode; + + // helpers for insertKeyPrepareEmptySpot: extract first entry (only const required) + ROBIN_HOOD(NODISCARD) key_type const& getFirstConst(Node const& n) const noexcept { + return n.getFirst(); + } + + // in case we have void mapped_type, we are not using a pair, thus we just route k through. + // No need to disable this because it's just not used if not applicable. 
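+// Stepping back: the two DataNode specializations above are what distinguish
+// the flat and node variants declared at the end of this header.
+//
+//   robin_hood::unordered_flat_map<int, std::string> fm;
+//     // DataNode<Self, true>: values live inside the bucket array; fastest,
+//     // but pointers and references are invalidated on rehash.
+//   robin_hood::unordered_node_map<int, std::string> nm;
+//     // DataNode<Self, false>: values live behind a pooled pointer, so they
+//     // stay put across rehashes at the cost of one indirection.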
+ ROBIN_HOOD(NODISCARD) key_type const& getFirstConst(key_type const& k) const noexcept { + return k; + } + + // in case we have non-void mapped_type, we have a standard robin_hood::pair + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, key_type const&>::type + getFirstConst(value_type const& vt) const noexcept { + return vt.first; + } + + // Cloner ////////////////////////////////////////////////////////// + + template + struct Cloner; + + // fast path: Just copy data, without allocating anything. + template + struct Cloner { + void operator()(M const& source, M& target) const { + auto const* const src = reinterpret_cast(source.mKeyVals); + auto* tgt = reinterpret_cast(target.mKeyVals); + auto const numElementsWithBuffer = target.calcNumElementsWithBuffer(target.mMask + 1); + std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), tgt); + } + }; + + template + struct Cloner { + void operator()(M const& s, M& t) const { + auto const numElementsWithBuffer = t.calcNumElementsWithBuffer(t.mMask + 1); + std::copy(s.mInfo, s.mInfo + t.calcNumBytesInfo(numElementsWithBuffer), t.mInfo); + + for (size_t i = 0; i < numElementsWithBuffer; ++i) { + if (t.mInfo[i]) { + ::new (static_cast(t.mKeyVals + i)) Node(t, *s.mKeyVals[i]); + } + } + } + }; + + // Destroyer /////////////////////////////////////////////////////// + + template + struct Destroyer {}; + + template + struct Destroyer { + void nodes(M& m) const noexcept { + m.mNumElements = 0; + } + + void nodesDoNotDeallocate(M& m) const noexcept { + m.mNumElements = 0; + } + }; + + template + struct Destroyer { + void nodes(M& m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = m.calcNumElementsWithBuffer(m.mMask + 1); + + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node& n = m.mKeyVals[idx]; + n.destroy(m); + n.~Node(); + } + } + } + + void nodesDoNotDeallocate(M& m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = m.calcNumElementsWithBuffer(m.mMask + 1); + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node& n = m.mKeyVals[idx]; + n.destroyDoNotDeallocate(); + n.~Node(); + } + } + } + }; + + // Iter //////////////////////////////////////////////////////////// + + struct fast_forward_tag {}; + + // generic iterator for both const_iterator and iterator. + template + // NOLINTNEXTLINE(hicpp-special-member-functions,cppcoreguidelines-special-member-functions) + class Iter { + private: + using NodePtr = typename std::conditional::type; + + public: + using difference_type = std::ptrdiff_t; + using value_type = typename Self::value_type; + using reference = typename std::conditional::type; + using pointer = typename std::conditional::type; + using iterator_category = std::forward_iterator_tag; + + // default constructed iterator can be compared to itself, but WON'T return true when + // compared to end(). + Iter() = default; + + // Rule of zero: nothing specified. The conversion constructor is only enabled for + // iterator to const_iterator, so it doesn't accidentally work as a copy ctor. + + // Conversion constructor from iterator to const_iterator. 
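+// (as with the standard containers, the conversion is one-way; a sketch,
+// with Map standing in for any of the robin_hood map types:
+//
+//   Map::iterator it = m.begin();
+//   Map::const_iterator cit = it; // fine: iterator -> const_iterator
+//   // it = cit;                  // ill-formed: the enable_if only admits
+//   //                            // the non-const -> const direction
+// )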
+ template ::type> + // NOLINTNEXTLINE(hicpp-explicit-conversions) + Iter(Iter const& other) noexcept + : mKeyVals(other.mKeyVals) + , mInfo(other.mInfo) {} + + Iter(NodePtr valPtr, uint8_t const* infoPtr) noexcept + : mKeyVals(valPtr) + , mInfo(infoPtr) {} + + Iter(NodePtr valPtr, uint8_t const* infoPtr, + fast_forward_tag ROBIN_HOOD_UNUSED(tag) /*unused*/) noexcept + : mKeyVals(valPtr) + , mInfo(infoPtr) { + fastForward(); + } + + template ::type> + Iter& operator=(Iter const& other) noexcept { + mKeyVals = other.mKeyVals; + mInfo = other.mInfo; + return *this; + } + + // prefix increment. Undefined behavior if we are at end()! + Iter& operator++() noexcept { + mInfo++; + mKeyVals++; + fastForward(); + return *this; + } + + Iter operator++(int) noexcept { + Iter tmp = *this; + ++(*this); + return tmp; + } + + reference operator*() const { + return **mKeyVals; + } + + pointer operator->() const { + return &**mKeyVals; + } + + template + bool operator==(Iter const& o) const noexcept { + return mKeyVals == o.mKeyVals; + } + + template + bool operator!=(Iter const& o) const noexcept { + return mKeyVals != o.mKeyVals; + } + + private: + // fast forward to the next non-free info byte + // I've tried a few variants that don't depend on intrinsics, but unfortunately they are + // quite a bit slower than this one. So I've reverted that change again. See map_benchmark. + void fastForward() noexcept { + size_t n = 0; + while (0U == (n = detail::unaligned_load(mInfo))) { + mInfo += sizeof(size_t); + mKeyVals += sizeof(size_t); + } +#if defined(ROBIN_HOOD_DISABLE_INTRINSICS) + // we know for certain that within the next 8 bytes we'll find a non-zero one. + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 4; + mKeyVals += 4; + } + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 2; + mKeyVals += 2; + } + if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) { + mInfo += 1; + mKeyVals += 1; + } +#else +# if ROBIN_HOOD(LITTLE_ENDIAN) + auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8; +# else + auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8; +# endif + mInfo += inc; + mKeyVals += inc; +#endif + } + + friend class Table; + NodePtr mKeyVals{nullptr}; + uint8_t const* mInfo{nullptr}; + }; + + //////////////////////////////////////////////////////////////////// + + // highly performance relevant code. + // Lower bits are used for indexing into the array (2^n size) + // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. + template + void keyToIdx(HashKey&& key, size_t* idx, InfoType* info) const { + // In addition to whatever hash is used, add another mul & shift so we get better hashing. + // This serves as a bad hash prevention, if the given data is + // badly mixed. + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; + + // the lower InitialInfoNumBits are reserved for info. + *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; + } + + // forwards the index by one, wrapping around at the end + void next(InfoType* info, size_t* idx) const noexcept { + *idx = *idx + 1; + *info += mInfoInc; + } + + void nextWhileLess(InfoType* info, size_t* idx) const noexcept { + // unrolling this by hand did not bring any speedups. + while (*info < mInfo[*idx]) { + next(info, idx); + } + } + + // Shift everything up by one element. Tries to move stuff around. 
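+// To recap keyToIdx() above with concrete numbers, using the initial
+// configuration (mInfoInc == 32, InfoMask == 31, mInfoHashShift == 0) and a
+// 64-bucket table (mMask == 63), a mixed hash h is split as
+//
+//   info = 32 + (h & 31);   // in [32, 63]: five extra hash bits
+//   idx  = (h >> 5) & 63;   // the element's home bucket
+//
+// An info byte of 0 still means "empty", and each bucket of displacement from
+// home adds another mInfoInc, which is the quantity the robin-hood probing
+// below keeps ordered.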
+ void + shiftUp(size_t startIdx, + size_t const insertion_idx) noexcept(std::is_nothrow_move_assignable::value) { + auto idx = startIdx; + ::new (static_cast(mKeyVals + idx)) Node(std::move(mKeyVals[idx - 1])); + while (--idx != insertion_idx) { + mKeyVals[idx] = std::move(mKeyVals[idx - 1]); + } + + idx = startIdx; + while (idx != insertion_idx) { + ROBIN_HOOD_COUNT(shiftUp) + mInfo[idx] = static_cast(mInfo[idx - 1] + mInfoInc); + if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + --idx; + } + } + + void shiftDown(size_t idx) noexcept(std::is_nothrow_move_assignable::value) { + // until we find one that is either empty or has zero offset. + // TODO(martinus) we don't need to move everything, just the last one for the same + // bucket. + mKeyVals[idx].destroy(*this); + + // until we find one that is either empty or has zero offset. + while (mInfo[idx + 1] >= 2 * mInfoInc) { + ROBIN_HOOD_COUNT(shiftDown) + mInfo[idx] = static_cast(mInfo[idx + 1] - mInfoInc); + mKeyVals[idx] = std::move(mKeyVals[idx + 1]); + ++idx; + } + + mInfo[idx] = 0; + // don't destroy, we've moved it + // mKeyVals[idx].destroy(*this); + mKeyVals[idx].~Node(); + } + + // copy of find(), except that it returns iterator instead of const_iterator. + template + ROBIN_HOOD(NODISCARD) + size_t findIdx(Other const& key) const { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + do { + // unrolling this twice gives a bit of a speedup. More unrolling did not help. + if (info == mInfo[idx] && + ROBIN_HOOD_LIKELY(WKeyEqual::operator()(key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + if (info == mInfo[idx] && + ROBIN_HOOD_LIKELY(WKeyEqual::operator()(key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found! + return mMask == 0 ? 0 + : static_cast(std::distance( + mKeyVals, reinterpret_cast_no_cast_align_warning(mInfo))); + } + + void cloneData(const Table& o) { + Cloner()(o, *this); + } + + // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is resized. + // @return True on success, false if something went wrong + void insert_move(Node&& keyval) { + // we don't retry, fail if overflowing + // don't need to check max num elements + if (0 == mMaxNumElementsAllowed && !try_increase_info()) { + throwOverflowError(); + } + + size_t idx{}; + InfoType info{}; + keyToIdx(keyval.getFirst(), &idx, &info); + + // skip forward. Use <= because we are certain that the element is not there. + while (info <= mInfo[idx]) { + idx = idx + 1; + info += mInfoInc; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = static_cast(info); + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + auto& l = mKeyVals[insertion_idx]; + if (idx == insertion_idx) { + ::new (static_cast(&l)) Node(std::move(keyval)); + } else { + shiftUp(idx, insertion_idx); + l = std::move(keyval); + } + + // put at empty spot + mInfo[insertion_idx] = insertion_info; + + ++mNumElements; + } + +public: + using iterator = Iter; + using const_iterator = Iter; + + Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual())) + : WHash() + , WKeyEqual() { + ROBIN_HOOD_TRACE(this) + } + + // Creates an empty hash map. Nothing is allocated yet, this happens at the first insert. 
+ // This tremendously speeds up ctor & dtor of a map that never receives an element. The + // penalty is payed at the first insert, and not before. Lookup of this empty map works + // because everybody points to DummyInfoByte::b. parameter bucket_count is dictated by the + // standard, but we can ignore it. + explicit Table( + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash& h = Hash{}, + const KeyEqual& equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && noexcept(KeyEqual(equal))) + : WHash(h) + , WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + } + + template + Table(Iter first, Iter last, size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash& h = Hash{}, const KeyEqual& equal = KeyEqual{}) + : WHash(h) + , WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(first, last); + } + + Table(std::initializer_list initlist, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, const Hash& h = Hash{}, + const KeyEqual& equal = KeyEqual{}) + : WHash(h) + , WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(initlist.begin(), initlist.end()); + } + + Table(Table&& o) noexcept + : WHash(std::move(static_cast(o))) + , WKeyEqual(std::move(static_cast(o))) + , DataPool(std::move(static_cast(o))) { + ROBIN_HOOD_TRACE(this) + if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + // set other's mask to 0 so its destructor won't do anything + o.init(); + } + } + + Table& operator=(Table&& o) noexcept { + ROBIN_HOOD_TRACE(this) + if (&o != this) { + if (o.mMask) { + // only move stuff if the other map actually has some data + destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + WHash::operator=(std::move(static_cast(o))); + WKeyEqual::operator=(std::move(static_cast(o))); + DataPool::operator=(std::move(static_cast(o))); + + o.init(); + + } else { + // nothing in the other map => just clear us. + clear(); + } + } + return *this; + } + + Table(const Table& o) + : WHash(static_cast(o)) + , WKeyEqual(static_cast(o)) + , DataPool(static_cast(o)) { + ROBIN_HOOD_TRACE(this) + if (!o.empty()) { + // not empty: create an exact copy. it is also possible to just iterate through all + // elements and insert them, but copying is probably faster. + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + // no need for calloc because clonData does memcpy + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + } + } + + // Creates a copy of the given map. Copy constructor of each entry is used. 
+ // Not sure why clang-tidy thinks this doesn't handle self assignment, it does + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + Table& operator=(Table const& o) { + ROBIN_HOOD_TRACE(this) + if (&o == this) { + // prevent assigning of itself + return *this; + } + + // we keep using the old allocator and not assign the new one, because we want to keep + // the memory available. when it is the same size. + if (o.empty()) { + if (0 == mMask) { + // nothing to do, we are empty too + return *this; + } + + // not empty: destroy what we have there + // clear also resets mInfo to 0, that's sometimes not necessary. + destroy(); + init(); + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + + return *this; + } + + // clean up old stuff + Destroyer::value>{}.nodes(*this); + + if (mMask != o.mMask) { + // no luck: we don't have the same array size allocated, so we need to realloc. + if (0 != mMask) { + // only deallocate if we actually have data! + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + + // no need for calloc here because cloneData performs a memcpy. + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + // sentinel is set in cloneData + } + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + + return *this; + } + + // Swaps everything between the two maps. + void swap(Table& o) { + ROBIN_HOOD_TRACE(this) + using std::swap; + swap(o, *this); + } + + // Clears all data, without resizing. + void clear() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + // don't do anything! also important because we don't want to write to + // DummyInfoByte::b, even though we would just write 0 to it. + return; + } + + Destroyer::value>{}.nodes(*this); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + // clear everything, then set the sentinel again + uint8_t const z = 0; + std::fill(mInfo, mInfo + calcNumBytesInfo(numElementsWithBuffer), z); + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // Destroys the map and all it's contents. + ~Table() { + ROBIN_HOOD_TRACE(this) + destroy(); + } + + // Checks if both tables contain the same entries. Order is irrelevant. 
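+// In other words, equality is set-like. A sketch (assuming <cassert> is
+// included):
+//
+//   robin_hood::unordered_flat_map<int, int> a = {{1, 10}, {2, 20}};
+//   robin_hood::unordered_flat_map<int, int> b = {{2, 20}, {1, 10}};
+//   assert(a == b); // same entries, different insertion order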
+ bool operator==(const Table& other) const { + ROBIN_HOOD_TRACE(this) + if (other.size() != size()) { + return false; + } + for (auto const& otherEntry : other) { + if (!has(otherEntry)) { + return false; + } + } + + return true; + } + + bool operator!=(const Table& other) const { + ROBIN_HOOD_TRACE(this) + return !operator==(other); + } + + template + typename std::enable_if::value, Q&>::type operator[](const key_type& key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node(*this, std::piecewise_construct, + std::forward_as_tuple(key), std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + typename std::enable_if::value, Q&>::type operator[](key_type&& key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(std::move(key)), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, std::forward_as_tuple(std::move(key)), + std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + void insert(Iter first, Iter last) { + for (; first != last; ++first) { + // value_type ctor needed because this might be called with std::pair's + insert(value_type(*first)); + } + } + + void insert(std::initializer_list ilist) { + for (auto&& vt : ilist) { + insert(std::move(vt)); + } + } + + template + std::pair emplace(Args&&... args) { + ROBIN_HOOD_TRACE(this) + Node n{*this, std::forward(args)...}; + auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n)); + switch (idxAndState.second) { + case InsertionState::key_found: + n.destroy(*this); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node(*this, std::move(n)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = std::move(n); + break; + + case InsertionState::overflow_error: + n.destroy(*this); + throwOverflowError(); + break; + } + + return std::make_pair(iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair try_emplace(const key_type& key, Args&&... args) { + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(key_type&& key, Args&&... args) { + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, const key_type& key, + Args&&... args) { + (void)hint; + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, key_type&& key, Args&&... 
args) { + (void)hint; + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair insert_or_assign(const key_type& key, Mapped&& obj) { + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(key_type&& key, Mapped&& obj) { + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, const key_type& key, + Mapped&& obj) { + (void)hint; + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, key_type&& key, Mapped&& obj) { + (void)hint; + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + std::pair insert(const value_type& keyval) { + ROBIN_HOOD_TRACE(this) + return emplace(keyval); + } + + std::pair insert(value_type&& keyval) { + return emplace(std::move(keyval)); + } + + // Returns 1 if key is found, 0 otherwise. + size_t count(const key_type& key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type count(const OtherKey& key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + bool contains(const key_type& key) const { // NOLINT(modernize-use-nodiscard) + return 1U == count(key); + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type contains(const OtherKey& key) const { + return 1U == count(key); + } + + // Returns a reference to the value found for key. + // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q&>::type at(key_type const& key) { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + // Returns a reference to the value found for key. 
+ // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q const&>::type at(key_type const& key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + const_iterator find(const key_type& key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + const_iterator find(const OtherKey& key, is_transparent_tag /*unused*/) const { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type // NOLINT(modernize-use-nodiscard) + find(const OtherKey& key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator find(const key_type& key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + iterator find(const OtherKey& key, is_transparent_tag /*unused*/) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type find(const OtherKey& key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator begin() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + return end(); + } + return iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + const_iterator begin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cbegin(); + } + const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + if (empty()) { + return cend(); + } + return const_iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + + iterator end() { + ROBIN_HOOD_TRACE(this) + // no need to supply valid info pointer: end() must not be dereferenced, and only node + // pointer is compared. + return iterator{reinterpret_cast_no_cast_align_warning(mInfo), nullptr}; + } + const_iterator end() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cend(); + } + const_iterator cend() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return const_iterator{reinterpret_cast_no_cast_align_warning(mInfo), nullptr}; + } + + iterator erase(const_iterator pos) { + ROBIN_HOOD_TRACE(this) + // its safe to perform const cast here + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return erase(iterator{const_cast(pos.mKeyVals), const_cast(pos.mInfo)}); + } + + // Erases element at pos, returns iterator to the next element. + iterator erase(iterator pos) { + ROBIN_HOOD_TRACE(this) + // we assume that pos always points to a valid entry, and not end(). 
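+// The return convention implemented here supports the usual erase-while-
+// iterating loop, e.g.
+//
+//   for (auto it = m.begin(); it != m.end();) {
+//       if (it->second == 0) {
+//           it = m.erase(it); // next element, or the same slot when an
+//       } else {              // element was backward-shifted into it
+//           ++it;
+//       }
+//   }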
+ auto const idx = static_cast(pos.mKeyVals - mKeyVals); + + shiftDown(idx); + --mNumElements; + + if (*pos.mInfo) { + // we've backward shifted, return this again + return pos; + } + + // no backward shift, return next element + return ++pos; + } + + size_t erase(const key_type& key) { + ROBIN_HOOD_TRACE(this) + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + // check while info matches with the source idx + do { + if (info == mInfo[idx] && WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + shiftDown(idx); + --mNumElements; + return 1; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found to delete + return 0; + } + + // reserves space for the specified number of elements. Makes sure the old data fits. + // exactly the same as reserve(c). + void rehash(size_t c) { + // forces a reserve + reserve(c, true); + } + + // reserves space for the specified number of elements. Makes sure the old data fits. + // Exactly the same as rehash(c). Use rehash(0) to shrink to fit. + void reserve(size_t c) { + // reserve, but don't force rehash + reserve(c, false); + } + + // If possible reallocates the map to a smaller one. This frees the underlying table. + // Does not do anything if load_factor is too large for decreasing the table's size. + void compact() { + ROBIN_HOOD_TRACE(this) + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask << " + 1") + + // only actually do anything when the new size is bigger than the old one. This prevents to + // continuously allocate for each reserve() call. + if (newSize < mMask + 1) { + rehashPowerOfTwo(newSize, true); + } + } + + size_type size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return mNumElements; + } + + size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(-1); + } + + ROBIN_HOOD(NODISCARD) bool empty() const noexcept { + ROBIN_HOOD_TRACE(this) + return 0 == mNumElements; + } + + float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return MaxLoadFactor100 / 100.0F; + } + + // Average number of elements per bucket. Since we allow only 1 per bucket + float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(size()) / static_cast(mMask + 1); + } + + ROBIN_HOOD(NODISCARD) size_t mask() const noexcept { + ROBIN_HOOD_TRACE(this) + return mMask; + } + + ROBIN_HOOD(NODISCARD) size_t calcMaxNumElementsAllowed(size_t maxElements) const noexcept { + if (ROBIN_HOOD_LIKELY(maxElements <= (std::numeric_limits::max)() / 100)) { + return maxElements * MaxLoadFactor100 / 100; + } + + // we might be a bit inprecise, but since maxElements is quite large that doesn't matter + return (maxElements / 100) * MaxLoadFactor100; + } + + ROBIN_HOOD(NODISCARD) size_t calcNumBytesInfo(size_t numElements) const noexcept { + // we add a uint64_t, which houses the sentinel (first byte) and padding so we can load + // 64bit types. 
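+// (for example, calcNumBytesInfo(72) == 80: 72 live info bytes, then an
+// 8-byte tail whose first byte is the sentinel set to 1 and whose remaining
+// bytes are padding, so Iter::fastForward() can always load a full word
+// without reading past the allocation)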
+ return numElements + sizeof(uint64_t); + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumElementsWithBuffer(size_t numElements) const noexcept { + auto maxNumElementsAllowed = calcMaxNumElementsAllowed(numElements); + return numElements + (std::min)(maxNumElementsAllowed, (static_cast(0xFF))); + } + + // calculation only allowed for 2^n values + ROBIN_HOOD(NODISCARD) size_t calcNumBytesTotal(size_t numElements) const { +#if ROBIN_HOOD(BITNESS) == 64 + return numElements * sizeof(Node) + calcNumBytesInfo(numElements); +#else + // make sure we're doing 64bit operations, so we are at least safe against 32bit overflows. + auto const ne = static_cast(numElements); + auto const s = static_cast(sizeof(Node)); + auto const infos = static_cast(calcNumBytesInfo(numElements)); + + auto const total64 = ne * s + infos; + auto const total = static_cast(total64); + + if (ROBIN_HOOD_UNLIKELY(static_cast(total) != total64)) { + throwOverflowError(); + } + return total; +#endif + } + +private: + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type has(const value_type& e) const { + ROBIN_HOOD_TRACE(this) + auto it = find(e.first); + return it != end() && it->second == e.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type has(const value_type& e) const { + ROBIN_HOOD_TRACE(this) + return find(e) != end(); + } + + void reserve(size_t c, bool forceRehash) { + ROBIN_HOOD_TRACE(this) + auto const minElementsAllowed = (std::max)(c, mNumElements); + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask << " + 1") + + // only actually do anything when the new size is bigger than the old one. This prevents to + // continuously allocate for each reserve() call. + if (forceRehash || newSize > mMask + 1) { + rehashPowerOfTwo(newSize, false); + } + } + + // reserves space for at least the specified number of elements. + // only works if numBuckets if power of two + // True on success, false otherwise + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + ROBIN_HOOD_TRACE(this) + + Node* const oldKeyVals = mKeyVals; + uint8_t const* const oldInfo = mInfo; + + const size_t oldMaxElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + // resize operation: move stuff + initData(numBuckets); + if (oldMaxElementsWithBuffer > 1) { + for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { + if (oldInfo[i] != 0) { + // might throw an exception, which is really bad since we are in the middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); + // destroy the node but DON'T destroy the data. + oldKeyVals[i].~Node(); + } + } + + // this check is not necessary as it's guarded by the previous if, but it helps + // silence g++'s overeager "attempt to free a non-heap object 'map' + // [-Werror=free-nonheap-object]" warning. 
+ if (oldKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { + // don't destroy old data: put it into the pool instead + if (forceFree) { + std::free(oldKeyVals); + } else { + DataPool::addOrFree(oldKeyVals, calcNumBytesTotal(oldMaxElementsWithBuffer)); + } + } + } + } + + ROBIN_HOOD(NOINLINE) void throwOverflowError() const { +#if ROBIN_HOOD(HAS_EXCEPTIONS) + throw std::overflow_error("robin_hood::map overflow"); +#else + abort(); +#endif + } + + template + std::pair try_emplace_impl(OtherKey&& key, Args&&... args) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, std::piecewise_construct, std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair(iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair insertOrAssignImpl(OtherKey&& key, Mapped&& obj) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + mKeyVals[idxAndState.first].getSecond() = std::forward(obj); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, std::piecewise_construct, std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair(iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + void initData(size_t max_elements) { + mNumElements = 0; + mMask = max_elements - 1; + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(max_elements); + + // calloc also zeroes everything + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = reinterpret_cast( + detail::assertNotNull(std::calloc(1, numBytesTotal))); + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + + // set sentinel + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + enum class InsertionState { overflow_error, key_found, new_node, overwrite_node }; + + // Finds key, and if not already present prepares a spot where to pot the key & value. + // This potentially shifts nodes out of the way, updates mInfo and number of inserted + // elements, so the only operation left to do is create/assign a new node at that spot. 
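+// Every mutating entry point (operator[], emplace, try_emplace,
+// insert_or_assign) funnels through this helper and switches on the returned
+// InsertionState, following the same pattern:
+//
+//   auto idxAndState = insertKeyPrepareEmptySpot(key);
+//   switch (idxAndState.second) {
+//   case InsertionState::key_found:      /* reuse existing entry  */ break;
+//   case InsertionState::new_node:       /* placement-new a Node  */ break;
+//   case InsertionState::overwrite_node: /* move-assign over slot */ break;
+//   case InsertionState::overflow_error: throwOverflowError();       break;
+//   }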
+ template + std::pair insertKeyPrepareEmptySpot(OtherKey&& key) { + for (int i = 0; i < 256; ++i) { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + nextWhileLess(&info, &idx); + + // while we potentially have a match + while (info == mInfo[idx]) { + if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + // key already exists, do NOT insert. + // see http://en.cppreference.com/w/cpp/container/unordered_map/insert + return std::make_pair(idx, InsertionState::key_found); + } + next(&info, &idx); + } + + // unlikely that this evaluates to true + if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) { + if (!increase_size()) { + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + continue; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = info; + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + if (idx != insertion_idx) { + shiftUp(idx, insertion_idx); + } + // put at empty spot + mInfo[insertion_idx] = static_cast(insertion_info); + ++mNumElements; + return std::make_pair(insertion_idx, idx == insertion_idx + ? InsertionState::new_node + : InsertionState::overwrite_node); + } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + + bool try_increase_info() { + ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements + << ", maxNumElementsAllowed=" + << calcMaxNumElementsAllowed(mMask + 1)) + if (mInfoInc <= 2) { + // need to be > 2 so that shift works (otherwise undefined behavior!) + return false; + } + // we got space left, try to make info smaller + mInfoInc = static_cast(mInfoInc >> 1U); + + // remove one bit of the hash, leaving more space for the distance info. + // This is extremely fast because we can operate on 8 bytes at once. + ++mInfoHashShift; + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + for (size_t i = 0; i < numElementsWithBuffer; i += 8) { + auto val = unaligned_load(mInfo + i); + val = (val >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f); + std::memcpy(mInfo + i, &val, sizeof(val)); + } + // update sentinel, which might have been cleared out! + mInfo[numElementsWithBuffer] = 1; + + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + return true; + } + + // True if resize was possible, false otherwise + bool increase_size() { + // nothing allocated yet? just allocate InitialNumElements + if (0 == mMask) { + initData(InitialNumElements); + return true; + } + + auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + if (mNumElements < maxNumElementsAllowed && try_increase_info()) { + return true; + } + + ROBIN_HOOD_LOG("mNumElements=" << mNumElements << ", maxNumElementsAllowed=" + << maxNumElementsAllowed << ", load=" + << (static_cast(mNumElements) * 100.0 / + (static_cast(mMask) + 1))) + + if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { + // we have to resize, even though there would still be plenty of space left! + // Try to rehash instead. Delete freed memory so we don't steadyily increase mem in case + // we have to rehash a few times + nextHashMultiplier(); + rehashPowerOfTwo(mMask + 1, true); + } else { + // we've reached the capacity of the map, so the hash seems to work nice. Keep using it. 
+    // True if resize was possible, false otherwise
+    bool increase_size() {
+        // nothing allocated yet? just allocate InitialNumElements
+        if (0 == mMask) {
+            initData(InitialNumElements);
+            return true;
+        }
+
+        auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1);
+        if (mNumElements < maxNumElementsAllowed && try_increase_info()) {
+            return true;
+        }
+
+        ROBIN_HOOD_LOG("mNumElements=" << mNumElements << ", maxNumElementsAllowed="
+                                       << maxNumElementsAllowed << ", load="
+                                       << (static_cast<double>(mNumElements) * 100.0 /
+                                           (static_cast<double>(mMask) + 1)))
+
+        if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) {
+            // we have to resize, even though there would still be plenty of space left!
+            // Try to rehash instead. Delete freed memory so we don't steadily increase mem
+            // in case we have to rehash a few times
+            nextHashMultiplier();
+            rehashPowerOfTwo(mMask + 1, true);
+        } else {
+            // we've reached the capacity of the map, so the hash seems to work nicely. Keep
+            // using it.
+            rehashPowerOfTwo((mMask + 1) * 2, false);
+        }
+        return true;
+    }
+
+    void nextHashMultiplier() {
+        // adding an *even* number, so that the multiplier will always stay odd. This is necessary
+        // so that the hash stays a mixing function (and thus doesn't have any information loss).
+        mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54);
+    }
+
+    void destroy() {
+        if (0 == mMask) {
+            // don't deallocate!
+            return;
+        }
+
+        Destroyer<Self, IsFlat && std::is_trivially_destructible<Node>::value>{}
+            .nodesDoNotDeallocate(*this);
+
+        // This protection against not deleting mMask shouldn't be needed as it's sufficiently
+        // protected with the 0==mMask check, but I have this anyway because g++ 7 otherwise
+        // reports a compile error: attempt to free a non-heap object 'fm'
+        // [-Werror=free-nonheap-object]
+        if (mKeyVals != reinterpret_cast_no_cast_align_warning<Node*>(&mMask)) {
+            ROBIN_HOOD_LOG("std::free")
+            std::free(mKeyVals);
+        }
+    }
+
+    void init() noexcept {
+        mKeyVals = reinterpret_cast_no_cast_align_warning<Node*>(&mMask);
+        mInfo = reinterpret_cast<uint8_t*>(&mMask);
+        mNumElements = 0;
+        mMask = 0;
+        mMaxNumElementsAllowed = 0;
+        mInfoInc = InitialInfoInc;
+        mInfoHashShift = InitialInfoHashShift;
+    }
+
+    // members are sorted so no padding occurs
+    uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53);                // 8 byte  8
+    Node* mKeyVals = reinterpret_cast_no_cast_align_warning<Node*>(&mMask); // 8 byte 16
+    uint8_t* mInfo = reinterpret_cast<uint8_t*>(&mMask);                    // 8 byte 24
+    size_t mNumElements = 0;                                                // 8 byte 32
+    size_t mMask = 0;                                                       // 8 byte 40
+    size_t mMaxNumElementsAllowed = 0;                                      // 8 byte 48
+    InfoType mInfoInc = InitialInfoInc;                                     // 4 byte 52
+    InfoType mInfoHashShift = InitialInfoHashShift;                         // 4 byte 56
+                                                                            // 16 byte 56 if NodeAllocator
+};
+
+} // namespace detail
+
+// map
+
+template <typename Key, typename T, typename Hash = hash<Key>,
+          typename KeyEqual = std::equal_to<Key>, size_t MaxLoadFactor100 = 80>
+using unordered_flat_map = detail::Table<true, MaxLoadFactor100, Key, T, Hash, KeyEqual>;
+
+template <typename Key, typename T, typename Hash = hash<Key>,
+          typename KeyEqual = std::equal_to<Key>, size_t MaxLoadFactor100 = 80>
+using unordered_node_map = detail::Table<false, MaxLoadFactor100, Key, T, Hash, KeyEqual>;
+
+template <typename Key, typename T, typename Hash = hash<Key>,
+          typename KeyEqual = std::equal_to<Key>, size_t MaxLoadFactor100 = 80>
+using unordered_map =
+    detail::Table<sizeof(robin_hood::pair<Key, T>) <= sizeof(size_t) * 6 &&
+                      std::is_nothrow_move_constructible<robin_hood::pair<Key, T>>::value &&
+                      std::is_nothrow_move_assignable<robin_hood::pair<Key, T>>::value,
+                  MaxLoadFactor100, Key, T, Hash, KeyEqual>;
+
+// set
+
+template <typename Key, typename Hash = hash<Key>, typename KeyEqual = std::equal_to<Key>,
+          size_t MaxLoadFactor100 = 80>
+using unordered_flat_set = detail::Table<true, MaxLoadFactor100, Key, void, Hash, KeyEqual>;
+
+template <typename Key, typename Hash = hash<Key>, typename KeyEqual = std::equal_to<Key>,
+          size_t MaxLoadFactor100 = 80>
+using unordered_node_set = detail::Table<false, MaxLoadFactor100, Key, void, Hash, KeyEqual>;
+
+template <typename Key, typename Hash = hash<Key>, typename KeyEqual = std::equal_to<Key>,
+          size_t MaxLoadFactor100 = 80>
+using unordered_set = detail::Table<sizeof(Key) <= sizeof(size_t) * 6 &&
+                                        std::is_nothrow_move_constructible<Key>::value &&
+                                        std::is_nothrow_move_assignable<Key>::value,
+                                    MaxLoadFactor100, Key, void, Hash, KeyEqual>;
+
+} // namespace robin_hood
+
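+// A short usage sketch of the aliases above (illustrative only, not part of the
+// header proper): unordered_map picks flat or node-based storage automatically via
+// the sizeof/nothrow-move test, while the *_flat_* / *_node_* aliases force a choice.
+//
+//     robin_hood::unordered_map<int, int> map;          // small pair -> flat table
+//     map[1] = 2;
+//     robin_hood::unordered_node_set<std::string> set;  // stable node storage
+//     set.emplace("hello");
+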
+#endif
diff --git a/tuple.hpp b/tuple.hpp
new file mode 100644
index 0000000..02173e7
--- /dev/null
+++ b/tuple.hpp
@@ -0,0 +1,396 @@
+#pragma once
+
+#include <compare>
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+// tuplet concepts and traits
+namespace tuplet {
+template <class T>
+using identity_t = T;
+
+template <size_t I>
+using tag = std::integral_constant<size_t, I>;
+
+template <size_t I>
+constexpr tag<I> tag_v {};
+
+template <size_t N>
+using tag_range = std::make_index_sequence<N>;
+
+template <class T, class U>
+concept same_as = std::is_same_v<T, U> && std::is_same_v<U, T>;
+
+template <class T, class U>
+concept other_than = !std::is_same_v<std::decay_t<T>, U>;
+
+template <class Tup>
+using base_list_t = typename std::decay_t<Tup>::base_list;
+template <class Tup>
+using element_list_t = typename std::decay_t<Tup>::element_list;
+
+template <class T>
+concept base_list_tuple = requires() {
+    typename std::decay_t<T>::base_list;
+};
+
+template <class T>
+concept stateless = std::is_empty_v<std::decay_t<T>>;
+
+template <class T>
+concept indexable = stateless<T> || requires(T t) {
+    t[tag<0>()];
+};
+
+template <class U, class T>
+concept assignable_to = requires(U u, T t) {
+    t = u;
+};
+
+template <class T>
+concept ordered = requires(T const& t) {
+    {t <=> t};
+};
+template <class T>
+concept equality_comparable = requires(T const& t) {
+    { t == t } -> same_as<bool>;
+};
+} // namespace tuplet
+
+// tuplet::type_list implementation
+// tuplet::type_map implementation
+// tuplet::tuple_elem implementation
+// tuplet::deduce_elems
+// tuplet::tuple declaration (for use in cat2_impl)
+namespace tuplet {
+template <class... T>
+struct tuple;
+
+template <class... T>
+struct type_list {};
+
+template <class... Bases>
+struct type_map : Bases... {
+    using base_list = type_list<Bases...>;
+    using Bases::operator[]...;
+    using Bases::decl_elem...;
+    auto operator<=>(type_map const&) const = default;
+    bool operator==(type_map const&) const = default;
+};
+
+template <size_t I, class T>
+struct tuple_elem {
+    // Like declval, but with the element
+    static T decl_elem(tag<I>);
+    using type = T;
+
+    [[no_unique_address]] T value;
+
+    constexpr decltype(auto) operator[](tag<I>) & { return (value); }
+    constexpr decltype(auto) operator[](tag<I>) const& { return (value); }
+    constexpr decltype(auto) operator[](tag<I>) && {
+        return (std::move(*this).value);
+    }
+    auto operator<=>(tuple_elem const&) const = default;
+    bool operator==(tuple_elem const&) const = default;
+    // Implements comparison for tuples containing reference types
+    constexpr auto operator<=>(tuple_elem const& other) const noexcept(noexcept(
+        value <=> other.value)) requires(std::is_reference_v<T> && ordered<T>) {
+        return value <=> other.value;
+    }
+    constexpr bool operator==(tuple_elem const& other) const
+        noexcept(noexcept(value == other.value)) requires(
+            std::is_reference_v<T> && equality_comparable<T>) {
+        return value == other.value;
+    }
+};
+} // namespace tuplet
+
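+// Illustration of the tag-based access defined above (a sketch; values are
+// hypothetical): every element of a tuple is reachable through operator[] with a
+// tag<I>, which is what get<I>() later builds on.
+//
+//     tuplet::tuple<int, double> t{1, 2.0};
+//     int a    = t[tuplet::tag<0>()]; // resolves to tuple_elem<0, int>::operator[]
+//     double b = t[tuplet::tag<1>()]; // resolves to tuple_elem<1, double>::operator[]
+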
+// tuplet::detail::get_tuple_base implementation
+// tuplet::detail::apply_impl
+// tuplet::detail::cat2_impl
+// tuplet::detail::size_t_from_digits
+namespace tuplet::detail {
+template <class A, class... T>
+struct get_tuple_base;
+
+template <size_t... I, class... T>
+struct get_tuple_base<std::index_sequence<I...>, T...> {
+    using type = type_map<tuple_elem<I, T>...>;
+};
+
+template <class F, class Tup, class... Bases>
+constexpr decltype(auto) apply_impl(F&& f, Tup&& t, type_list<Bases...>) {
+    return std::forward<F>(f)(std::forward<Tup>(t).identity_t<Bases>::value...);
+}
+template <class F, class T, class Tup, class... Bases>
+constexpr decltype(auto) apply_impl2(F&& f, T& other, Tup&& t, type_list<Bases...>) {
+    return std::forward<F>(f)(other, std::forward<Tup>(t).identity_t<Bases>::value...);
+}
+template <class T, class U, class... E1, class... E2, class... B1, class... B2>
+constexpr auto cat2_impl(
+    T&& t1,
+    U&& t2,
+    type_list<E1...>,
+    type_list<E2...>,
+    type_list<B1...>,
+    type_list<B2...>) -> tuple<E1..., E2...> {
+    return {
+        std::forward<T>(t1).identity_t<B1>::value...,
+        std::forward<U>(t2).identity_t<B2>::value...};
+}
+template <char... D>
+constexpr size_t size_t_from_digits() {
+    static_assert((('0' <= D && D <= '9') && ...), "Must be integral literal");
+    size_t num = 0;
+    return ((num = num * 10 + (D - '0')), ..., num);
+}
+} // namespace tuplet::detail
+
+// tuplet::tuple implementation
+namespace tuplet {
+template <class... T>
+using tuple_base_t = typename detail::
+    get_tuple_base<tag_range<sizeof...(T)>, T...>::type;
+
+template <class... T>
+struct tuple : tuple_base_t<T...> {
+    constexpr static size_t N = sizeof...(T);
+    using super = tuple_base_t<T...>;
+    using super::operator[];
+    using base_list = typename super::base_list;
+    using element_list = type_list<T...>;
+    using super::decl_elem;
+
+    template <other_than<tuple> U> // Preserves default assignments
+    constexpr auto& operator=(U&& tup) {
+        using tuple2 = std::decay_t<U>;
+        if constexpr (base_list_tuple<tuple2>) {
+            eq_impl(
+                std::forward<U>(tup),
+                base_list(),
+                typename tuple2::base_list());
+        } else {
+            eq_impl(std::forward<U>(tup), tag_range<N>());
+        }
+        return *this;
+    }
+
+    template <assignable_to<T>... U>
+    constexpr auto& assign(U&&... values) {
+        assign_impl(base_list(), std::forward<U>(values)...);
+        return *this;
+    }
+
+    auto operator<=>(tuple const&) const = default;
+    bool operator==(tuple const&) const = default;
+
+  private:
+    template <class U, class... B1, class... B2>
+    constexpr void eq_impl(U&& u, type_list<B1...>, type_list<B2...>) {
+        (void(B1::value = std::forward<U>(u).identity_t<B2>::value), ...);
+    }
+    template <class U, size_t... I>
+    constexpr void eq_impl(U&& u, std::index_sequence<I...>) {
+        (void(tuple_elem<I, T>::value = get<I>(std::forward<U>(u))), ...);
+    }
+    template <class... U, class... B>
+    constexpr void assign_impl(type_list<B...>, U&&... u) {
+        (void(B::value = std::forward<U>(u)), ...);
+    }
+};
+template <>
+struct tuple<> : tuple_base_t<> {
+    constexpr static size_t N = 0;
+    using super = tuple_base_t<>;
+    using base_list = type_list<>;
+    using element_list = type_list<>;
+
+    template <other_than<tuple> U> // Preserves default assignments
+    requires stateless<U>          // Check that U is similarly stateless
+    constexpr auto& operator=(U&& tup) noexcept { return *this; }
+
+    constexpr auto& assign() noexcept { return *this; }
+    auto operator<=>(tuple const&) const = default;
+    bool operator==(tuple const&) const = default;
+};
+template <class... Ts>
+tuple(Ts...) -> tuple<std::unwrap_ref_decay_t<Ts>...>;
+} // namespace tuplet
+
+// tuplet::pair implementation
+namespace tuplet {
+template <class First, class Second>
+struct pair {
+    constexpr static size_t N = 2;
+    [[no_unique_address]] First first;
+    [[no_unique_address]] Second second;
+
+    constexpr decltype(auto) operator[](tag<0>) & { return (first); }
+    constexpr decltype(auto) operator[](tag<0>) const& { return (first); }
+    constexpr decltype(auto) operator[](tag<0>) && {
+        return (std::move(*this).first);
+    }
+    constexpr decltype(auto) operator[](tag<1>) & { return (second); }
+    constexpr decltype(auto) operator[](tag<1>) const& { return (second); }
+    constexpr decltype(auto) operator[](tag<1>) && {
+        return (std::move(*this).second);
+    }
+
+    template <other_than<pair> Type> // Preserves default assignments
+    constexpr auto& operator=(Type&& tup) {
+        auto&& [a, b] = std::forward<Type>(tup);
+        first = std::forward<decltype(a)>(a);
+        second = std::forward<decltype(b)>(b);
+        return *this;
+    }
+
+    template <assignable_to<First> F2, assignable_to<Second> S2>
+    constexpr auto& assign(F2&& f, S2&& s) {
+        first = std::forward<F2>(f);
+        second = std::forward<S2>(s);
+        return *this;
+    }
+    auto operator<=>(pair const&) const = default;
+    bool operator==(pair const&) const = default;
+};
+template <class A, class B>
+pair(A, B) -> pair<std::unwrap_ref_decay_t<A>, std::unwrap_ref_decay_t<B>>;
+} // namespace tuplet
+
+// tuplet::convert implementation
+namespace tuplet {
+// Converts from one tuple type to any other tuple or U
+template <class Tuple>
+struct convert {
+    using base_list = typename std::decay_t<Tuple>::base_list;
+    Tuple tuple;
+    template <class U>
+    constexpr operator U() && {
+        return convert_impl<U>(base_list {});
+    }
+
+  private:
+    template <class U, class... Bases>
+    constexpr U convert_impl(type_list<Bases...>) {
+        return U {std::forward<Tuple>(tuple).identity_t<Bases>::value...};
+    }
+};
+template <class Tuple>
+convert(Tuple&) -> convert<Tuple&>;
+template <class Tuple>
+convert(Tuple const&) -> convert<Tuple const&>;
+template <class Tuple>
+convert(Tuple&&) -> convert<Tuple>;
+} // namespace tuplet
+
+// tuplet::get implementation
+// tuplet::tie implementation
+// tuplet::apply implementation
+namespace tuplet {
+template <size_t I, indexable Tup>
+constexpr decltype(auto) get(Tup&& tup) {
+    return std::forward<Tup>(tup)[tag<I>()];
+}
+
+template <class... T>
+constexpr tuple<T&...> tie(T&... t) {
+    return {t...};
+}
+
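+// tie builds a tuple of lvalue references, so assigning through it writes back to
+// the original variables; a minimal sketch:
+//
+//     int x = 0; double y = 0;
+//     tuplet::tie(x, y) = tuplet::tuple{1, 2.5}; // now x == 1, y == 2.5
+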
+template <class F, base_list_tuple Tup>
+constexpr decltype(auto) apply(F&& func, Tup&& tup) {
+    return detail::apply_impl(
+        std::forward<F>(func),
+        std::forward<Tup>(tup),
+        typename std::decay_t<Tup>::base_list());
+}
+template <class F, class T, base_list_tuple Tup>
+constexpr decltype(auto) apply2(F&& func, T& other, Tup&& tup) {
+    return detail::apply_impl2(
+        std::forward<F>(func),
+        other,
+        std::forward<Tup>(tup),
+        typename std::decay_t<Tup>::base_list());
+}
+template <class F, class A, class B>
+constexpr decltype(auto) apply(F&& func, tuplet::pair<A, B>& pair) {
+    return std::forward<F>(func)(pair.first, pair.second);
+}
+template <class F, class A, class B>
+constexpr decltype(auto) apply(F&& func, tuplet::pair<A, B> const& pair) {
+    return std::forward<F>(func)(pair.first, pair.second);
+}
+template <class F, class A, class B>
+constexpr decltype(auto) apply(F&& func, tuplet::pair<A, B>&& pair) {
+    return std::forward<F>(func)(std::move(pair).first, std::move(pair).second);
+}
+} // namespace tuplet
+
+namespace tuplet {
+constexpr tuple<> tuple_cat() { return {}; }
+template <class T>
+constexpr auto tuple_cat(T&& t) {
+    return std::forward<T>(t);
+}
+template <class T1, class T2>
+constexpr auto tuple_cat(T1&& t1, T2&& t2) {
+    return detail::cat2_impl(
+        std::forward<T1>(t1),
+        std::forward<T2>(t2),
+        element_list_t<T1>(),
+        element_list_t<T2>(),
+        base_list_t<T1>(),
+        base_list_t<T2>());
+}
+template <class T1, class... T2>
+constexpr auto tuple_cat(T1&& t1, T2&&... t2) {
+    return tuplet::tuple_cat(
+        std::forward<T1>(t1),
+        tuplet::tuple_cat(std::forward<T2>(t2)...));
+}
+} // namespace tuplet
+
+// tuplet::make_tuple implementation
+// tuplet::forward_as_tuple implementation
+namespace tuplet {
+template <class... Ts>
+constexpr auto make_tuple(Ts&&... args) -> tuplet::tuple<std::unwrap_ref_decay_t<Ts>...> {
+    return {std::forward<Ts>(args)...};
+}
+template <class... T>
+constexpr auto forward_as_tuple(T&&... a) noexcept -> tuple<T&&...> {
+    return {std::forward<T>(a)...};
+}
+} // namespace tuplet
+
+// tuplet literals
+namespace tuplet::literals {
+template <char... D>
+constexpr auto operator""_tag() noexcept
+    -> tag<detail::size_t_from_digits<D...>()> {
+    return {};
+}
+} // namespace tuplet::literals
+
+// std::tuple_size specialization
+// std::tuple_element specialization
+namespace std {
+template <class... T>
+struct tuple_size<tuplet::tuple<T...>>
+    : std::integral_constant<size_t, sizeof...(T)> {};
+
+template <size_t I, class... T>
+struct tuple_element<I, tuplet::tuple<T...>> {
+    using type = decltype(tuplet::tuple<T...>::decl_elem(tuplet::tag<I>()));
+};
+template <class A, class B>
+struct tuple_size<tuplet::pair<A, B>> : std::integral_constant<size_t, 2> {};
+
+template <size_t I, class A, class B>
+struct tuple_element<I, tuplet::pair<A, B>> {
+    static_assert(I < 2, "tuplet::pair only has 2 elements");
+    using type = std::conditional_t<I == 0, A, B>;
+};
+} // namespace std
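+
+// With the std::tuple_size / std::tuple_element specializations above and the
+// ADL-found tuplet::get, these types also support structured bindings and apply;
+// a minimal sketch (illustrative, not part of the interface proper):
+//
+//     tuplet::pair<int, char> p{1, 'x'};
+//     auto [n, c] = p;                    // uses the pair specializations
+//     auto t = tuplet::make_tuple(1, 2);  // tuplet::tuple<int, int>
+//     auto s = tuplet::apply([](int a, int b) { return a + b; }, t); // s == 3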