I think my situation is not uncommon, but I've been having trouble coming up with a way to make my code efficient with large data sets.
My data set is a list of unique IDs (32-bit unsigned integers, if that matters) in which the ordering of the list is significant (i.e. the calling code expects the IDs to be kept in the order they were inserted).
The calling code is allowed to insert new (unique) IDs into the list at any position, and/or remove IDs from the list. The list can grow quite large (e.g. tens of thousands of IDs long).
Because the list is used to populate a QTableView, I need fast (ideally O(1)) lookups between a given unique-ID and its current position in the list, and vice-versa.
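For context, here's roughly where those lookups get hit on every repaint (an illustrative fragment only -- MyTableModel and its idMap member are hypothetical names, not my real code, and idMap stands for an IDToRowMap like the one in the example code below):

#include <QAbstractTableModel>
#include <QVariant>

// Illustrative fragment only -- "MyTableModel" and "idMap" are hypothetical
// names.  data() runs once per visible cell on every repaint, which is why
// the row -> ID direction of the lookup needs to be fast.
QVariant MyTableModel::data(const QModelIndex &index, int role) const
{
   if (role != Qt::DisplayRole) return QVariant();
   return QVariant(idMap.getIDAtRow((size_t) index.row()));
}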
My current code stores the list as a std::vector<uint32_t>, so that looking up the ID at the nth position is just a matter of indexing into the vector. (See int32_t getIDAtRow(size_t rowIdx) const in the example code below.)
For the reverse lookup, my code generates a std::unordered_map<uint32_t, size_t>, so that given a unique ID as a key, it can quickly return the row index where that unique ID is currently located. (See ssize_t getRowIndexForID(uint32_t id) const in the example code below.)
This all works fairly well when the data is static. However, if the calling code starts doing a lot of insertions or deletions near the beginning of the list, each insertion/deletion invalidates most of the entries in the reverse-lookup table, which then has to be regenerated, and that can get very slow. (The hypothetical driver after the example code below makes the worst case concrete.) I've tried various strategies to mitigate the performance problem (such as delaying the regeneration, or regenerating only the portion of the table at or after the modified row), but none of them are entirely satisfactory: I can still always find some calling pattern that results in slow performance due to excessive regeneration.
So my question is: is there some other data structure I can use here to get better performance for my bi-directional ID<->RowIndex lookups, one that remains efficient even when the IDs list is large and is being inserted into and/or removed from frequently? i.e. is there something that wouldn't require an O(N) reverse-table regeneration after every insert or delete?
Compilable toy example code is below, in case it helps explain the problem better.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>     // for atol()
#include <random>
#include <unordered_map>
#include <vector>
#include <sys/types.h> // for ssize_t (POSIX)

class IDToRowMap
{
public:
   IDToRowMap() {/* empty */}

   // append a new unique ID to the end of our ordered IDs list
   void appendID(uint32_t id)
   {
      if (getRowIndexForID(id) >= 0)
      {
         printf("Error, ID %u is already in the table!\n", id);
      }
      else
      {
         idsInRowOrder.push_back(id);
         idToRowIndex[id] = idsInRowOrder.size()-1;
      }
   }

   // insert a new unique ID into the middle of our ordered IDs list
   void insertIDAtRow(uint32_t id, size_t rowIdx)
   {
      if (getRowIndexForID(id) >= 0)
      {
         printf("Error, ID %u is already in the table!\n", id);
      }
      else if (rowIdx < idsInRowOrder.size())
      {
         idsInRowOrder.insert(idsInRowOrder.begin()+rowIdx, id);
         regenerate_reverse_index_starting_at(rowIdx);
      }
      else
      {
         appendID(id);
      }
   }

   // remove the specified ID from our list
   void removeID(uint32_t id)
   {
      const ssize_t rowIdx = getRowIndexForID(id);
      if (rowIdx >= 0)
      {
         idsInRowOrder.erase(idsInRowOrder.begin()+rowIdx);
         idToRowIndex.erase(id);
         regenerate_reverse_index_starting_at(rowIdx);
      }
      else
      {
         printf("Error, ID %u is not in the table!\n", id);
      }
   }

   // clear the list
   void clear()
   {
      idsInRowOrder.clear();
      idToRowIndex.clear();
   }

   // Returns the row index that (id) is at, or -1 if (id) isn't in the set
   ssize_t getRowIndexForID(uint32_t id) const
   {
      auto search = idToRowIndex.find(id);
      if (search == idToRowIndex.end()) return -1;
      return (ssize_t) search->second;
   }

   // Returns the ID at the given row, or -1 if (rowIdx) isn't a valid row index
   // (toy-example caveat: an actual ID of 0xFFFFFFFF would collide with the -1 sentinel)
   int32_t getIDAtRow(size_t rowIdx) const
   {
      return (rowIdx < idsInRowOrder.size()) ? (int32_t) idsInRowOrder[rowIdx] : -1;
   }

   void print() const
   {
      for (size_t row=0; row<idsInRowOrder.size(); row++)
      {
         const uint32_t id = idsInRowOrder[row];
         const ssize_t checkRowIndex = getRowIndexForID(id);
         const int32_t checkRowID = getIDAtRow((size_t) checkRowIndex);
         printf("row=%zu id=%u checkRowIndex=%zi checkRowID=%i", row, id, checkRowIndex, checkRowID);
         if (checkRowIndex != (ssize_t) row) printf(" ERROR:(checkRowIndex doesn't match row)");
         if (checkRowID != (int32_t) id) printf(" ERROR:(checkRowID doesn't match id)");
         printf("\n");
      }
   }

private:
   // Here's where it gets inefficient -- if I have to insert or remove IDs near the front
   // of the list, I end up regenerating the reverse index a lot, and if the list is long, it gets slow
   void regenerate_reverse_index_starting_at(size_t startIdx)
   {
      printf("Regenerating reverse index from row %zu to row %zu\n", startIdx, idsInRowOrder.size());
      for (size_t i=startIdx; i<idsInRowOrder.size(); i++) idToRowIndex[idsInRowOrder[i]] = i;
   }

   std::vector<uint32_t> idsInRowOrder;               // row-index -> ID
   std::unordered_map<uint32_t, size_t> idToRowIndex; // ID -> row-index (reverse lookup)
};
int main(int, char **)
{
   std::vector<uint32_t> someIDs;
   {
      while(someIDs.size()<50) someIDs.push_back((uint32_t) someIDs.size());
      std::random_device rd;  // Seed generator
      std::mt19937 gen(rd()); // Mersenne Twister engine
      std::shuffle(someIDs.begin(), someIDs.end(), gen); // put them in some random order
   }

   IDToRowMap map;
   for (auto a : someIDs) map.appendID(a);

   printf("Enter one of the following:\n");
   printf(" a 12345 - append ID 12345\n");
   printf(" i 12345 - insert ID 12345 at row #3\n");
   printf(" r 12345 - remove ID 12345\n");
   printf(" c       - clear the table\n");
   printf(" p       - print the table\n");
   printf("\n");

   char buf[128];
   while(fgets(buf, sizeof(buf), stdin))  // exit cleanly on EOF rather than spinning forever
   {
      switch(buf[0])
      {
         case 'a':
         {
            const uint32_t id = (uint32_t) atol(buf+1);  // skip the command letter; atol() skips the whitespace
            printf("Appending id %u\n", id);
            map.appendID(id);
         }
         break;

         case 'i':
         {
            const uint32_t id = (uint32_t) atol(buf+1);
            printf("Inserting id %u at row #3\n", id);
            map.insertIDAtRow(id, 3);
         }
         break;

         case 'r':
         {
            const uint32_t id = (uint32_t) atol(buf+1);
            printf("Removing id %u\n", id);
            map.removeID(id);
         }
         break;

         case 'c':
            map.clear();
            printf("Cleared!\n");
            break;

         case 'p':
            map.print();
            break;

         default:
            printf("Unknown command [%s]\n", buf);
            break;
      }
   }
   return 0;
}
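For reference, the worst-case calling pattern looks something like this (a hypothetical driver, not part of the toy program above; it assumes the IDToRowMap class is in scope, with the printf() inside regenerate_reverse_index_starting_at() commented out so console output doesn't dominate the timing):

// Hypothetical worst-case driver: every insert lands at row 0, so each call
// shifts the whole vector AND regenerates the entire reverse index.
// That's O(N) work per insert, O(N^2) for the loop as a whole.
int main(int, char **)
{
   IDToRowMap map;
   for (uint32_t id = 0; id < 50000; id++) map.insertIDAtRow(id, 0);
   return 0;
}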
EDIT: A LinkedHashSet-type class (which I do have a C++ version of, and which I find very useful in other contexts) does not address this problem, because looking up the nth item of a linked list is an O(N) operation, and is therefore inefficient with a large data set. (A minimal snippet illustrating that cost follows below.)
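To make that concrete, here is a minimal standalone snippet (not part of the program above) showing the cost: even if a hash map pointed directly at the list nodes, answering "what ID is at row n" would still mean walking the list.

#include <cstdint>
#include <cstdio>
#include <iterator>
#include <list>

// std::list (like the list inside a LinkedHashSet-type class) has no random
// access, so fetching the ID at row n means walking n nodes -- the
// std::next() call below is an O(n) pointer chase, not an O(1) index.
int main()
{
   std::list<uint32_t> ids = {42, 7, 19, 23, 5};
   const size_t rowIdx = 3;
   const uint32_t idAtRow = *std::next(ids.begin(), rowIdx);  // O(rowIdx), not O(1)
   printf("ID at row %zu is %u\n", rowIdx, idAtRow);
   return 0;
}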