I have been working on sorting a file in-place with heapsort. I'm very glad it is working. But the time it takes is just too much. So I was wondering on how I could improve it.
Here are my requirements:
- The file consists of 14B index entries. One index entry consists of 8B of the actual index and 6B of offset. I want to sort these entries after the first 8B. The remaining 6B will need to be dragged along but will not be required for sorting.
- The sorting has to occur in-place. No additional files may be created
- I need to able to tell the progress. I display a progress bar of the sorting while it runs.
- Some amount of RAM may be used to speed up sorting. The amount will be passed as a parameter
- You can parallize the sorting. But if you do there will be a limit of threads you can create which would also be passed in a parameter if you need it.
- The sorting does not need to be stable.
- You may use a different sorting algorithm
Before I show you my current code I want to add a few notes:
The reason I chose Heapsort over Quicksort is the fact that I want to have a progress bar. I ask you not to debate this requirement.
The files may be a lot larger than the available RAM. That's why you can't load the file entierly into RAM.
The first 8B of each entry are pretty much random and evenly distributed.
My current aproach constits of loading the first index entries into RAM until I reach the limit. And writing that data into the file after the sorting finishes.
Should you find any other issues like bad code style feel free to correct me too.
I use CSI escape sequences to display the progress bar. If you are not familiar with it just leave the code as it is. It is working as intended anyway and I just included it for the sake of completeness.
sortidx.h
#ifndef SORTIDX_H
#define SORTIDX_H
// Includes
#include <algorithm>
#include <atomic>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <thread>
#include "util.h"
// Functions
// Returns the parent index.
constexpr size_t getParent( size_t i ) {
return (i - 1) / 2;
}
// Returns the left child index.
constexpr size_t getLeft( size_t i ) {
return i * 2 + 1;
}
// Returns the right child index.
constexpr size_t getRight( size_t i ) {
return i * 2 + 2;
}
// Sorts an idx file. Using chachSize bytes of RAM to speed it up.
void sortIDX( std::string idxFile, size_t cacheSize, bool quiet );
// Reads the specified number of elements into the cache
void readIntoCache( size_t numElements );
// Writes the cache to the file
void writeFromCache();
// Turns the idx file into a heap (first step of heapsort)
void heapifyIDX( size_t heapifyLimit );
// Sorts the idx heap (second step of heapsort)
void sortIDXHeap( size_t numDataSets );
// Reads data at the specified location. Either from cache or from disk.
void readData( IndexEntry* entry, size_t pos );
// Writes data at the specified location. Either to cache or to disk.
void writeData( IndexEntry* entry, size_t pos );
// Checks whether a index is in the heap
bool isInHeap( size_t pos );
// Moves a element down the heap until it is at the right position
void orderHeap( IndexEntry &top, size_t posTop );
#endif
sortidx.cpp
#include "sortidx.h"
using namespace std;
streampos fileSize;
size_t numDataSets;
size_t limit;
atomic<size_t> pos;
fstream* file;
size_t arraySize = 0;
IndexEntry* cacheArray;
void readIntoCache( size_t numElements ) {
if ( arraySize != 0 )
writeFromArray();
arraySize = numElements;
cacheArray = new IndexEntry[arraySize];
file->seekg( 0 );
for ( size_t i = 0; i < arraySize; i++ ) {
file->read( (char*)(cacheArray + i), writeSize );
}
}
void writeFromCache() {
file->seekp( 0 );
for ( size_t i = 0; i < arraySize; i++ ) {
file->write( (char*)(cacheArray + i), writeSize );
}
arraySize = 0;
delete[] cacheArray;
}
void sortIDX( string idxFile, size_t cacheSize, bool quiet ) {
file = new fstream( idxFile, ios::in | ios::out | ios::binary | ios::ate );
fileSize = file->tellg();
numDataSets = fileSize / writeSize;
limit = numDataSets - 1;
const size_t localLimit = limit;
const size_t heapifyLimit = getParent( limit );
thread* sorterThread;
if ( !quiet )
cout << "Sorting index (may take a while)...\n\33[sLoading cache from file..." << flush;
cacheSize /= writeSize;
readIntoCache( min(cacheSize, numDataSets) );
sorterThread = new thread( heapifyIDX, heapifyLimit );
if ( !quiet ) {
cout << "\33[u";
initProgress( heapifyLimit + localLimit, false );
while ( pos <= heapifyLimit ) {
this_thread::sleep_for( chrono::milliseconds( defaultTimeout ) );
printProgress( (size_t)pos );
}
}
sorterThread->join();
delete sorterThread;
pos = 0;
sorterThread = new thread( sortIDXHeap, localLimit );
if ( !quiet ) {
while ( pos < localLimit ) {
this_thread::sleep_for( chrono::milliseconds( defaultTimeout ) );
printProgress( heapifyLimit + pos );
}
}
sorterThread->join();
delete sorterThread;
if ( !quiet )
cout << "\33[?25h\n\33[sSaving cache to file." << flush;
writeFromCache();
file->close();
delete file;
if ( !quiet )
cout << "\33[u\33[KDone!" << endl;
}
void heapifyIDX( size_t heapifyLimit ) {
IndexEntry top;
size_t posTop;
for ( pos = 0; pos <= heapifyLimit; pos++ ) {
posTop = heapifyLimit - pos;
readData( &top, posTop );
orderHeap( top, posTop );
}
}
void sortIDXHeap( size_t numDataSets ) {
IndexEntry last;
IndexEntry top;
size_t posLast;
size_t posTop;
for ( pos = 0; pos < numDataSets; pos++ ) {
posLast = numDataSets - pos;
posTop = 0;
limit = posLast - 1;
readData( &last, posTop );
readData( &top, posLast );
writeData( &last, posLast );
orderHeap( top, posTop );
}
}
void readData( IndexEntry* entry, size_t pos ) {
if ( pos < arraySize ) {
*entry = cacheArray[pos];
} else {
file->seekg( pos * writeSize );
file->read( (char*)entry, writeSize );
}
}
void writeData( IndexEntry* entry, size_t pos ) {
if ( pos < arraySize ) {
cacheArray[pos] = *entry;
} else {
file->seekp( pos * writeSize );
file->write( (char*)entry, writeSize );
}
}
bool isInHeap( size_t pos ) {
return pos <= limit;
}
void orderHeap( IndexEntry &top, size_t posTop ) {
static IndexEntry left;
static IndexEntry right;
static size_t posLeft;
static size_t posRight;
static bool swapped;
do {
posLeft = getLeft( posTop );
posRight = getRight( posTop );
if ( isInHeap( posLeft ) ) {
readData( &left, posLeft );
if ( isInHeap( posRight ) ) {
readData( &right, posRight );
if ( right > left ) {
if ( right > top ) {
writeData( &right, posTop );
posTop = posRight;
swapped = true;
} else {
swapped = false;
}
} else {
if ( left > top ) {
writeData( &left, posTop );
posTop = posLeft;
swapped = true;
} else {
swapped = false;
}
}
} else {
if ( left > top ) {
writeData( &left, posTop );
posTop = posLeft;
swapped = true;
} else {
swapped = false;
}
}
} else {
swapped = false;
}
} while ( swapped );
writeData( &top, posTop );
}
util.h
#ifndef UTIL_H
#define UTIL_H
// Includes
#include <iomanip>
#include <iostream>
#include <stddef.h>
#include <sstream>
#include <string>
#include <thread>
#include <unistd.h>
#include <sys/ioctl.h>
// Constants
constexpr size_t hashSize = 8;
constexpr size_t offsetSize = 6;
constexpr size_t writeSize = hashSize + offsetSize;
constexpr long long defaultTimeout = 100;
struct IndexEntry {
unsigned char hash[hashSize]; // First 64 bits of the hash
unsigned char position[offsetSize]; // Position of word in dictionary (48-bit little endian integer)
IndexEntry& operator=( const IndexEntry ©From );
bool operator>( const IndexEntry &lhs );
} __attribute__( (__packed__) );
// Functions
struct winsize getConsoleSize();
unsigned short getConsoleHeight();
unsigned short getConsoleWidth();
// Determines which byte postfix to use (0 = "B", 1 = "KiB", ...)
unsigned short getBytePower( std::streampos size );
// Returns the appropriate byte postfix
std::string getBytePowerPostfix( unsigned short power );
// Formats the size. If power is -1 it automatically detects the power
std::string getFormatedSize( std::streampos size, int power = -1 );
// Initializes the progress bar
void initProgress( std::streampos fileSize, bool withFileSize );
// Prints the progressbar
void printProgress( std::streampos currentPos );
#endif
util.cpp
#include "util.h"
using namespace std;
streampos totalFileSize;
unsigned short formatPower;
string fileSizeString;
bool renderWithFileSize;
IndexEntry& IndexEntry::operator=( const IndexEntry ©From ) {
size_t i;
for ( i = 0; i < hashSize; i++ )
hash[i] = copyFrom.hash[i];
for ( i = 0; i < offsetSize; i++ )
position[i] = copyFrom.position[i];
return *this;
}
bool IndexEntry::operator>( const IndexEntry &lhs ) {
for ( size_t i = 0; i < hashSize; i++ ) {
if ( hash[i] > lhs.hash[i] )
return true;
else if ( hash[i] < lhs.hash[i] )
return false;
}
return false;
}
struct winsize getConsoleSize() {
struct winsize size;
ioctl( STDOUT_FILENO, TIOCGWINSZ, &size );
return size;
}
unsigned short getConsoleHeight() {
return getConsoleSize().ws_row;
}
unsigned short getConsoleWidth() {
return getConsoleSize().ws_col;
}
unsigned short getBytePower( streampos size ) {
unsigned short power;
for ( power = 0; size >= 1000; power++ )
size = size >> 10;
return power;
}
string getBytePowerPostfix( unsigned short power ) {
static const string postfixes[] = { " B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" };
static constexpr size_t numPostfixes = sizeof( postfixes ) / sizeof( string );
if ( power > numPostfixes ) {
return string( "2^" ) + to_string( power * 10 ) + postfixes[0];
} else {
return postfixes[power];
}
}
std::string getFormatedSize( std::streampos size, int power ) {
unsigned short formatPower = (power <= -1) ? getBytePower( size ) : (unsigned short)power;
stringstream ss;
if ( power == 0 ) {
ss << setw( 3 ) << size << " ";
} else {
ss << setw( 7 ) << fixed << setprecision( 3 ) << double( size ) / double( 1 << (10 * power) );
}
ss << ' ' << getBytePowerPostfix( formatPower );
return ss.str();
}
void initProgress( streampos fileSize, bool withFileSize ) {
totalFileSize = fileSize;
formatPower = getBytePower( fileSize );
fileSizeString = getFormatedSize( fileSize, formatPower );
renderWithFileSize = withFileSize;
cout << "\33[?25l";
}
void printProgress( streampos currentPos ) {
int barWidth = getConsoleWidth() - (renderWithFileSize ? 35 : 9);
double progress = (double)currentPos / totalFileSize;
cout << "\33[s\33[K[";
int pos = barWidth * progress;
for ( int i = 0; i < barWidth; ++i ) {
if ( i < pos ) cout << '=';
else if ( i == pos ) cout << '>';
else cout << ' ';
}
cout << "] " << setw( 5 ) << fixed << setprecision( 1 ) << progress * 100.0 << '%';
if ( renderWithFileSize )
cout << ' ' << getFormatedSize( currentPos, formatPower ) << " / " << fileSizeString;
cout << "\33[u" << flush;
}
Example main.cpp
#include "sortidx.h"
int main() {
sortIDX( "indexFile.idx", 256 * 1024 * 1024, false );
}
main()with example usage. Currently it is just a jumbled set of functions. \$\endgroup\$C with Classesbut the style is not a modern C++ style which is much more declarative than C. I will go into detail in my review. \$\endgroup\$