Skip to content

Commit

Permalink
HPCC-33022 Fix pathological dali load time
Browse files Browse the repository at this point in the history
JPTree (used by Dali SDS) could perform horrendously slowly when
dealing with a large list of child nodes sharing a common
prefix. This was due to how the hashing function was used
(no seed number provided). With many entries having the same
prefix, it caused a high number of hash collisions, resulting
in excessive scanning when adding new members.
This particularly affected the /GeneratedDlls section of Dali.
After fixing the issue, the time to load this section dropped
from over 4700 seconds to 17 seconds.

Also add a 'loadxml' utility to daliadmin, that loads and
times loading an xml file, and traces the amount of memory it
consumes.

Signed-off-by: Jake Smith <jake.smith@lexisnexisrisk.com>
  • Loading branch information
jakesmith committed Dec 11, 2024
1 parent 1caf985 commit b9c6195
Show file tree
Hide file tree
Showing 10 changed files with 118 additions and 28 deletions.
75 changes: 75 additions & 0 deletions dali/daliadmin/daadmin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2472,6 +2472,81 @@ void xmlSize(const char *filename, double pc)
}
}

void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML)
{
OwnedIFile iFile = createIFile(filename);
OwnedIFileIO iFileIO = iFile->open(IFOread);
if (!iFileIO)
{
WARNLOG("File '%s' not found", filename);
return;
}

class CDummyPTreeMaker : public CSimpleInterfaceOf<IPTreeMaker>
{
StringBuffer xpath;
unsigned level = 0;
public:
virtual IPropertyTree *queryRoot() override { return nullptr; }
virtual IPropertyTree *queryCurrentNode() override { return nullptr; }
virtual void reset() override { }
virtual IPropertyTree *create(const char *tag) override { return nullptr; }
// IPTreeNotifyEvent impl.
virtual void beginNode(const char *tag, bool sequence, offset_t startOffset) override { }
virtual void newAttribute(const char *name, const char *value) override { }
virtual void beginNodeContent(const char *tag) override { }
virtual void endNode(const char *tag, unsigned length, const void *value, bool binary, offset_t endOffset) override { }
};

byte flags=ipt_none;
PTreeReaderOptions readFlags=ptr_ignoreWhiteSpace;
Owned<IPTreeMaker> iMaker;
if (!parseOnly)
{
PROGLOG("Creating property tree from file: %s", filename);
byte flags = ipt_none;
if (useLowMemPTree)
{
PROGLOG("Using low memory property trees");
flags = ipt_lowmem;
}
iMaker.setown(createPTreeMaker(flags));
}
else
{
PROGLOG("Reading property tree from file (without creating it): %s", filename);
iMaker.setown(new CDummyPTreeMaker());
}

offset_t fSize = iFileIO->size();
OwnedIFileIOStream stream = createIOStream(iFileIO);
OwnedIFileIOStream progressedIFileIOStream = createProgressIFileIOStream(stream, fSize, "Load progress", 1);
Owned<IPTreeReader> reader = createXMLStreamReader(*progressedIFileIOStream, *iMaker, readFlags);

ProcessInfo memInfo(ReadMemoryInfo);
__uint64 rss = memInfo.getActiveResidentMemory();
CCycleTimer timer;
reader->load();
memInfo.update(ReadMemoryInfo);
__uint64 rssUsed = memInfo.getActiveResidentMemory() - rss;
reader.clear();
progressedIFileIOStream.clear();
PROGLOG("Load took: %.2f - RSS consumed: %.2f MB", (float)timer.elapsedMs()/1000, (float)rssUsed/0x100000);

if (!parseOnly && saveFormattedXML)
{
assertex(iMaker->queryRoot());
StringBuffer outFilename(filename);
outFilename.append(".out.xml");
PROGLOG("Saving to %s", outFilename.str());
timer.reset();
saveXML(outFilename, iMaker->queryRoot(), 2);
PROGLOG("Save took: %.2f", (float)timer.elapsedMs()/1000);
}

::LINK(iMaker->queryRoot()); // intentionally leak (avoid time clearing up)
}

void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType)
{
CDfsLogicalFileName lfn;
Expand Down
1 change: 1 addition & 0 deletions dali/daliadmin/daadmin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ namespace daadmin

extern DALIADMIN_API void setDaliConnectTimeoutMs(unsigned timeoutMs);
extern DALIADMIN_API void xmlSize(const char *filename, double pc);
extern DALIADMIN_API void loadXMLTest(const char *filename, bool parseOnly, bool useLowMemPTree, bool saveFormattedXML);
extern DALIADMIN_API void translateToXpath(const char *logicalfile, DfsXmlBranchKind tailType = DXB_File);

extern DALIADMIN_API void exportToFile(const char *path, const char *filename, bool safe = false);
Expand Down
15 changes: 15 additions & 0 deletions dali/daliadmin/daliadmin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ void usage(const char *exe)
printf(" mpping <server-ip> -- time MP connect\n");
printf(" daliping [ <num> ] -- time dali server connect\n");
printf(" getxref <destxmlfile> -- get all XREF information\n");
printf(" loadxml <srcxmlfile> [--lowmem[=<true|false]] -- use lowmem AtomPTree's\n"
" [--parseonly[=<true|false]] -- parse the xml file, don't load it into dali\n"
" [--savexml[=<true|false]] -- save and time the parsed xml\n");
printf(" dalilocks [ <ip-pattern> ] [ files ] -- get all locked files/xpaths\n");
printf(" unlock <xpath or logicalfile> <[path|file]> -- unlocks either matching xpath(s) or matching logical file(s), can contain wildcards\n");
printf(" validatestore [fix=<true|false>]\n"
Expand Down Expand Up @@ -226,6 +229,18 @@ int main(int argc, const char* argv[])
}
else if (strieq(cmd, "remotetest"))
remoteTest(params.item(1), false);
else if (strieq(cmd, "loadxml"))
{
bool useLowMemPTree = false;
bool saveFormatedTree = false;
bool parseOnly = getComponentConfigSP()->getPropBool("@parseonly");
if (!parseOnly)
{
useLowMemPTree = getComponentConfigSP()->getPropBool("@lowmem");
saveFormatedTree = getComponentConfigSP()->getPropBool("@savexml");
}
loadXMLTest(params.item(1), parseOnly, useLowMemPTree, saveFormatedTree);
}
else
{
UERRLOG("Unknown command %s",cmd);
Expand Down
2 changes: 1 addition & 1 deletion system/jlib/jhash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ bool HashTable::keyeq(const void *key1, const void *key2, int ksize) const

unsigned HashTable::hash(const void *key, int ksize) const
{
unsigned h = 0x811C9DC5;
unsigned h = fnvInitialHash32;
unsigned char *bp = (unsigned char *) key;
if (ksize<=0)
{
Expand Down
2 changes: 1 addition & 1 deletion system/jlib/jptree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3847,7 +3847,7 @@ unsigned CAtomPTree::queryHash() const
{
const char *_name = name.get();
size32_t nl = strlen(_name);
return isnocase() ? hashnc((const byte *) _name, nl, 0): hashc((const byte *) _name, nl, 0);
return isnocase() ? hashnc((const byte *) _name, nl, fnvInitialHash32): hashc((const byte *) _name, nl, fnvInitialHash32);
}
}

Expand Down
6 changes: 3 additions & 3 deletions system/jlib/jptree.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ protected:
virtual unsigned getHashFromElement(const void *e) const override;
virtual unsigned getHashFromFindParam(const void *fp) const override
{
return hashcz((const unsigned char *)fp, 0);
return hashcz((const unsigned char *)fp, fnvInitialHash32);
}
virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override
{
Expand Down Expand Up @@ -108,7 +108,7 @@ public:
// SuperHashTable definitions
virtual unsigned getHashFromFindParam(const void *fp) const override
{
return hashncz((const unsigned char *)fp, 0);
return hashncz((const unsigned char *)fp, fnvInitialHash32);
}
virtual bool matchesFindParam(const void *e, const void *fp, unsigned fphash) const override
{
Expand Down Expand Up @@ -844,7 +844,7 @@ public:
const char *myname = queryName();
assert(myname);
size32_t nl = strlen(myname);
return isnocase() ? hashnc((const byte *)myname, nl, 0): hashc((const byte *)myname, nl, 0);
return isnocase() ? hashnc((const byte *)myname, nl, fnvInitialHash32): hashc((const byte *)myname, nl, fnvInitialHash32);
}
virtual void setName(const char *_name) override;
virtual void setAttribute(const char *attr, const char *val, bool encoded) override;
Expand Down
3 changes: 0 additions & 3 deletions system/jlib/jsecrets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
#if defined(__clang__) || defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-nonliteral"

//httplib also generates warning about access outside of array bounds in gcc
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

#ifdef _USE_OPENSSL
Expand Down
2 changes: 1 addition & 1 deletion system/jlib/jstats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,7 @@ void StatisticsMapping::createMappings()
}

const unsigned * kinds = indexToKind.getArray();
hashcode = hashc((const byte *)kinds, indexToKind.ordinality() * sizeof(unsigned), 0x811C9DC5);
hashcode = hashc((const byte *)kinds, indexToKind.ordinality() * sizeof(unsigned), fnvInitialHash32);

//All StatisticsMapping objects are assumed to be static, and never destroyed.
const StatisticsMapping * existing = allStatsMappings[hashcode];
Expand Down
22 changes: 11 additions & 11 deletions system/jlib/jsuperhash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
//#define MY_TRACE_HASH

#ifdef MY_TRACE_HASH
int my_search_tot = 0;
int my_search_num = 0;
static unsigned my_search_tot = 0;
static unsigned my_search_num = 0;
#endif

//-- SuperHashTable ---------------------------------------------------
Expand Down Expand Up @@ -104,13 +104,13 @@ void SuperHashTable::dumpStats() const
{
#ifdef TRACE_HASH
if (tablecount && search_tot && search_num)
printf("Hash table %d entries, %d size, average search length %d(%d/%d) max %d\n", tablecount, tablesize,
(int) (search_tot/search_num), search_tot, search_num, search_max);
printf("Hash table %u entries, %u size, average search length %" I64F "u(%" I64F "u/%u) max %u\n", tablecount, tablesize,
search_tot/search_num, search_tot, search_num, search_max);
#endif
}

#ifdef TRACE_HASH
void SuperHashTable::note_searchlen(int len) const
void SuperHashTable::note_searchlen(unsigned len) const
{
search_tot += len;
search_num++;
Expand Down Expand Up @@ -151,8 +151,8 @@ unsigned SuperHashTable::doFind(unsigned findHash, const void * findParam) const
}
#ifdef MY_TRACE_HASH
my_search_num++;
if(my_search_num != 0)
printf("Hash table average search length %d\n", (int) (my_search_tot/my_search_num));
if (my_search_num != 0)
printf("Hash table average search length %u\n", my_search_tot/my_search_num);
#endif
#ifdef TRACE_HASH
note_searchlen(searchlen);
Expand Down Expand Up @@ -193,8 +193,8 @@ unsigned SuperHashTable::doFindElement(unsigned v, const void * findET) const
}
#ifdef MY_TRACE_HASH
my_search_num++;
if(my_search_num != 0)
printf("Hash table average search length %d\n", (int) (my_search_tot/my_search_num));
if (my_search_num != 0)
printf("Hash table average search length %u\n", my_search_tot/my_search_num);
#endif
#ifdef TRACE_HASH
note_searchlen(searchlen);
Expand Down Expand Up @@ -233,8 +233,8 @@ unsigned SuperHashTable::doFindNew(unsigned v) const
}
#ifdef MY_TRACE_HASH
my_search_num++;
if(my_search_num != 0)
printf("Hash table average search length %d\n", (int) (my_search_tot/my_search_num));
if (my_search_num != 0)
printf("Hash table average search length %u\n", my_search_tot/my_search_num);
#endif
#ifdef TRACE_HASH
note_searchlen(searchlen);
Expand Down
18 changes: 10 additions & 8 deletions system/jlib/jsuperhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include "jstring.hpp"
#include "jmutex.hpp"

constexpr unsigned fnvInitialHash32 = 0x811C9DC5;

extern jlib_decl unsigned hashc( const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashnc( const unsigned char *k, unsigned length, unsigned initval);
extern jlib_decl unsigned hashcz( const unsigned char *k, unsigned initval);
Expand Down Expand Up @@ -88,7 +90,7 @@ class jlib_decl SuperHashTable : public CInterface
void doKill(void);
void expand();
void expand(unsigned newsize);
void note_searchlen(int) const;
void note_searchlen(unsigned) const;

virtual void onAdd(void *et) = 0;
virtual void onRemove(void *et) = 0;
Expand All @@ -105,9 +107,9 @@ class jlib_decl SuperHashTable : public CInterface
unsigned tablesize;
unsigned tablecount;
#ifdef TRACE_HASH
mutable int search_tot;
mutable int search_num;
mutable int search_max;
mutable unsigned __int64 search_tot;
mutable unsigned search_num;
mutable unsigned search_max;
#endif
};

Expand Down Expand Up @@ -514,9 +516,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOf<HashKeyElement, constchar
HashKeyElement *hke = (HashKeyElement *) checked_malloc(sizeof(HashKeyElement)+l+1,-605);
memcpy((void *) (hke->keyPtr()), key, l+1);
if (nocase)
hke->hashValue = hashnc((const unsigned char *)key, l, 0);
hke->hashValue = hashnc((const unsigned char *)key, l, fnvInitialHash32);
else
hke->hashValue = hashc((const unsigned char *)key, l, 0);
hke->hashValue = hashc((const unsigned char *)key, l, fnvInitialHash32);
hke->linkCount = 0;
return hke;
}
Expand Down Expand Up @@ -609,9 +611,9 @@ class jlib_decl AtomRefTable : public SuperHashTableOf<HashKeyElement, constchar
virtual unsigned getHashFromFindParam(const void *fp) const
{
if (nocase)
return hashncz((const unsigned char *)fp, 0);
return hashncz((const unsigned char *)fp, fnvInitialHash32);
else
return hashcz((const unsigned char *)fp, 0);
return hashcz((const unsigned char *)fp, fnvInitialHash32);
}

virtual const void *getFindParam(const void *e) const
Expand Down

0 comments on commit b9c6195

Please sign in to comment.