I am writing a program that compresses and decompresses data using the Huffman Coding algorithm.
About compression: this program will export 2 files: header file (contains characters' frequency for re-constructing the Huffman Code Tree for decompressing data) and actual compressed data.
About decompression: After joining both files above, the program will decompress the data and write to file.
I am a beginner in C++ and would love to hear some feedback about my code style as well as my code design.
Thank you.
hzip.hpp (class for building Huffman Code Tree)
/*
* Author: Harry Nguyen
* Date: 23/05/2018
*/
#ifndef _HZIP_HPP
#define _HZIP_HPP
#include <string>
#include <vector>
#include <map>
#include <queue>
// Data type for a Huffman char
typedef unsigned short hchar_t;
// Data type for frequency
typedef unsigned int hfreq_t;
// Data type for frequency table
typedef std::map<hchar_t, hfreq_t> frmap_t;
// Data type for huffman code map
typedef std::map<hchar_t, std::string> hmap_t;
// Data type for a bit
typedef char bit_t;
const hchar_t PSEUDO_EOF = 256; // A virtual end of file
const hchar_t NOT_A_CHAR = 257; // Write to internal nodes
const bit_t MAX_NUMBER_OF_BITS = 8;
/* Convert string of bits ( "0" or "1")
* to real byte
* Return converted bytes in the form of strings
*/
std::string stobyte(std::string& sBits);
/*
* A Huffman node contains a character, its frequency
* and 2 children node
*/
struct NODE
{
hchar_t _character;
hfreq_t _freq;
NODE* _lChild;
NODE* _rChild;
NODE(hchar_t character, hfreq_t freq):
_character(character), _freq(freq),
_lChild(nullptr), _rChild(nullptr) {}
~NODE() {
delete _lChild;
delete _rChild;
}
};
// ADT for min heap implementation
struct Comparator
{
bool operator()(NODE* lChild, NODE* rChild)
{
return lChild->_freq > rChild->_freq;
}
};
class HZip
{
public:
HZip();
/*
* Build a frequency table from input string
* and return a frequency map.
*/
frmap_t buildFrequencyTable(const std::string& input);
/*
* Build a Huffman tree from frequency table
* and return the root.
*/
NODE* buildEncodingTree(frmap_t freqTable);
/*
* Get encoded map from the root of encoded tree
* and return encoded map.
*/
hmap_t getEncodedMap(NODE* encodingTree);
/*
* Read characters from input string
* and write bytes to output.
* -------------------------------------
* Used for compression
*/
void encodeData(std::string& input, const frmap_t freqMap,
hmap_t encodedMap, std::vector<std::string>& output);
/*
* Construct frequency table from the header
* of the input string.
* ------------------------------------------
* Used for decompression
*/
frmap_t headerProcessing(std::string& input);
/*
* Read bits by bits from the input string
* and write character to the output.
* ----------------------------------------
* Used for decompression
*/
void decodeData(std::string& input, NODE* encodingTree, std::string& output);
protected:
/*
* Build Huffman encoding map.
* ----------------------------
* Recursive approach.
*/
void buildEncodingMap(NODE* encodingTree, std::string sCode);
protected:
hmap_t p_encodedMap;
private:
std::priority_queue<NODE*, std::vector<NODE*>, Comparator> m_minHeap;
};
#endif
hzip.cpp
#include "hzip.hpp"
/* Convert string of bits to bytes
*/
std::string stobyte(std::string& sBits)
{
std::string sBytes;
std::size_t numberOfBits = 0;
bit_t bit = 0;
/* Iterate through string of bits
* If there are not enough 8 bits (to construct 1 byte)
* when reaching the end of string bits then
* 0 will be filled. That's the purpose of "+ numberOfBits"
*/
for(std::size_t i = 0; i < sBits.size() + numberOfBits; ++i) {
// Convert character in form of binary to real bits.
(sBits[i] == '0') ? (bit = 0) : (bit = 1);
// Initialise byteChar once
static char byteChar = 0;
// then get 1 bit
byteChar = (byteChar << 1) | bit;
++numberOfBits;
// If reaching 8 bits (1 byte)
if (numberOfBits == MAX_NUMBER_OF_BITS) {
// then add that byte to the string
sBytes += byteChar;
// and reset byteChar and numberOfBits for the next iteration
byteChar = 0;
numberOfBits = 0;
}
}
return sBytes;
}
HZip::HZip()
{}
frmap_t
HZip::buildFrequencyTable(const std::string& input)
{
frmap_t freqTable;
for (std::size_t i = 0; i < input.size(); ++i)
++freqTable[hchar_t(input[i])]; // cast input[i] from char to hchar_t
// push a PSEDO_EOF to frequency table with freq = 1
freqTable[PSEUDO_EOF] = 1;
return freqTable;
}
NODE*
HZip::buildEncodingTree(frmap_t freqTable)
{
// push each char and its frequency to min heap
for (auto it = freqTable.begin(); it != freqTable.end(); ++it)
m_minHeap.push(new NODE(it->first, it->second));
// push until there is just 1 node left (root)
while (m_minHeap.size() > 1) {
// get least frequency node
NODE* lChild = m_minHeap.top();
m_minHeap.pop();
// get next least frequency node
NODE* rChild = m_minHeap.top();
m_minHeap.pop();
/*
* Make a parent node with
* key is a char or a NOT_A_CHAR which indicates an internal node
* value is the sum of 2 least frequency
*/
NODE* parent = new NODE(NOT_A_CHAR, lChild->_freq + rChild->_freq);
// Link to its children
parent->_lChild = lChild;
parent->_rChild = rChild;
m_minHeap.push(parent);
}
// Top of min heap is the root of built tree
return m_minHeap.top();
}
/*
* Traverse binary tree recursively
* and push char and its huffman code to the map
*/
void
HZip::buildEncodingMap(NODE* encodingTree, std::string sCode)
{
if (!encodingTree)
return ;
// If leaf node
if (encodingTree->_character != NOT_A_CHAR)
p_encodedMap.insert(std::pair<hchar_t, std::string> (encodingTree->_character, sCode));
HZip::buildEncodingMap(encodingTree->_lChild, sCode + "0");
HZip::buildEncodingMap(encodingTree->_rChild, sCode + "1");
}
/*
* Get Huffman coding Map
*/
hmap_t
HZip::getEncodedMap(NODE* encodingTree)
{
HZip::buildEncodingMap(encodingTree, "");
delete encodingTree;
return p_encodedMap;
}
void
HZip::encodeData(std::string& input, const frmap_t freqMap,
hmap_t encodedMap, std::vector<std::string>& output)
{
/*
* Construct the header
* Format: {,<hchart>$<hfreq>...}
*/
std::string header;
header = "{";
for (auto it = freqMap.begin(); it != freqMap.end(); ++it)
header += "," + std::to_string(it->first) + "$" + std::to_string(it->second);
header += "}";
/*
* Reconstruct the document with string of '0' and '1'
*/
std::string sBits;
for (std::size_t i = 0; i < input.size(); ++i) {
sBits += encodedMap[hchar_t(input[i])];
}
// Put a PSEUDO_EOF at the end of the input string
sBits += encodedMap[PSEUDO_EOF];
// Vector output contains 2 parts: the header and actual compressed data
output.push_back(header);
output.push_back(stobyte(sBits));
}
frmap_t
HZip::headerProcessing(std::string& input)
{
frmap_t freqTable;
/* Get header */
std::size_t endOfHeader = input.find_first_of("}");
std::string header = input.substr(0, endOfHeader + 1);
/* Positions for constructing key/value from header */
std::size_t start = header.find_first_of("{,") + 2;
std::size_t end = start;
while (start < header.size()) {
end = header.find_first_of("$", start);
// get character from header
std::string character = header.substr(start, end - start);
hchar_t hCharacter = std::stoi(character); // convert string to integer
start = end + 1;
end = header.find_first_of(",}", start);
// get frequency from header
std::string freq = header.substr(start, end - start);
hfreq_t hFreq = std::stoi(freq); // convert char(freq) to integer
start = end + 1;
// Reconstruct freqTable
freqTable.insert(std::pair<hchar_t, hfreq_t> (hCharacter, hFreq));
}
return freqTable;
}
void
HZip::decodeData(std::string& input, NODE* rootOfEncodedTree, std::string& output)
{
// initialise current node
NODE* curNode = rootOfEncodedTree;
// flag for break 2 loops
bool flagBreak = false;
/* Get position right after the header */
std::size_t endOfHeader = input.find_first_of("}");
std::size_t pos = endOfHeader + 1;
while (pos < input.size()) {
/* Read 1 bit at a time from input */
for (bit_t j = MAX_NUMBER_OF_BITS - 1; j >= 0; --j) {
// Get bit
bit_t bit = (input[pos] >> j) & 0x01;
/* If bit = 0 then go to the left child
* else go to the right child
*/
(bit == 0) ?
(curNode = curNode->_lChild) : (curNode = curNode->_rChild);
// Reach the end of input
if (curNode->_character == PSEUDO_EOF) {
// turn on flagBreak for breaking the while loop
flagBreak = true;
// break the for loop
break;
}
// If reaching leaf node
else if (curNode->_character != NOT_A_CHAR) {
// print node's character and add to the output
output += curNode->_character;
// Scan from root again
curNode = rootOfEncodedTree;
}
}
// Used for break the outer loop (while loop)
if (flagBreak)
break;
// next position of input string
++pos;
}
}
hfile.hpp (Header file for file and string processing)
#ifndef _HFILE_HPP_
#define _HFILE_HPP_
#include "hzip.hpp"
#include <fstream>
#include <sstream>
#include <string>
const unsigned char HEADER = 0;
const unsigned char ACTUAL_COMPRESSED_DATA = 1;
/* Namespace for file extensions */
namespace h_extension {
const std::string hzip = ".hzip";
const std::string har = ".har";
}
/*
* Namespace for path manipulation
* ----------------------------------------
* Can be used for constructing the path of
* compressed and decompressed file
*/
namespace hfile {
std::string getFileName(const std::string& path_to_file);
std::string getParentDicrectory(const std::string& path_to_file);
/* Get the extension of file need to be compressed
* Example: raw file: test.txt -> .txt
* --------------------------------------------------
* Used for constructing the path to compressed file
*/
std::string getSourceFileExtension(const std::string& path_to_file);
/* Get the extension of original file
* Example: compressed file: test.txt.har -> txt
* ---------------------------------------------------
* Used for constructing the path to decompressed file
*/
std::string getOriginalFileExtension(const std::string& path_to_file);
std::string getSrarExtension(const std::string& path_to_file);
}
/* Compress inFile and split into 2 files:
* the header and actual compressed file.
* ---------------------------------------------------
* dest_file_path is the path to the directory you want to write
* your compressed file to, no need to include file name.
*/
void compress(const std::ifstream& inFile,
std::ofstream& outHeaderFile, std::ofstream& outDataFile,
const std::string& source_file_path,
const std::string& dest_file_path,
const std::string& extension);
/* Join 2 files inFile_1 and inFile_2 and write to outFile.
* -----------------------------------------------------------------------
* dest_file_path must include file name you want to create after joining
*/
void joinFile(const std::ifstream& inFile_1, const std::ifstream& inFile_2,
std::ofstream& outFile,
const std::string& source_file_path,
const std::string& dest_file_path);
/* Decompress inFile and write to outFile
* -------------------------------------------------------------------
* dest_file_path is the path to the directory you want to write
* your compressed file to, no need to include file name.
*/
void decompress(const std::ifstream& inFile, std::ofstream& outFile,
const std::string& source_file_path,
const std::string& dest_file_path);
#endif
hfile.cpp
#include "hfile.hpp"
std::string
hfile::getFileName(const std::string& path_to_file)
{
std::size_t start = 0;
/* In case there is no parent directory at all */
if (path_to_file.find_last_of("/") != std::string::npos)
start = path_to_file.find_last_of("/") + 1;
else
start = 0;
std::size_t end = path_to_file.find_first_of(".", start);
std::string file_name = path_to_file.substr(start, end - start);
return file_name;
}
std::string
hfile::getParentDicrectory(const std::string& path_to_file)
{
std::size_t end = 0;
/* In case there is no parent directory at all */
if(path_to_file.find_last_of("/") != std::string::npos)
end = path_to_file.find_last_of("/") + 1;
else
end = 0;
std::string parent_directory = path_to_file.substr(0, end);
return parent_directory;
}
std::string
hfile::getSourceFileExtension(const std::string& path_to_file)
{
std::size_t start = path_to_file.find_last_of(".");
std::size_t end = path_to_file.size();
std::string source_file_extension = path_to_file.substr(start, end - start);
return source_file_extension;
}
std::string
hfile::getOriginalFileExtension(const std::string& path_to_file)
{
std::size_t endOfPath = 0;
/* In case there is no parent directory at all */
if (path_to_file.find_last_of("/") != std::string::npos)
endOfPath = path_to_file.find_last_of("/");
else
endOfPath = 0;
// find the distance between 2 "." in the path
// example: test.txt.har -> 4 (.txt)
std::size_t start = path_to_file.find_first_of(".", endOfPath);
// get next position of "."
std::size_t end = path_to_file.find_first_of(".", start + 1);
std::string original_file_extension = path_to_file.substr(start, end - start);
return original_file_extension;
}
std::string
hfile::getSrarExtension(const std::string& path_to_file)
{
std::size_t start = path_to_file.find_last_of(".");
std::size_t end = path_to_file.size();
std::string srarExtension = path_to_file.substr(start, end - start);
return srarExtension;
}
void compress(const std::ifstream& inFile,
std::ofstream& outHeaderFile, std::ofstream& outDataFile,
const std::string& source_file_path,
const std::string& dest_file_path,
const std::string& extension)
{
/* Read whole string in inFile
* and store to inDoc
*/
std::stringstream buf;
buf << inFile.rdbuf();
std::string inDoc = buf.str();
/* Read more on Huffman coding for more info */
HZip haz;
frmap_t freqMap = haz.buildFrequencyTable(inDoc);
NODE* root = haz.buildEncodingTree(freqMap);
hmap_t encodedMap = haz.getEncodedMap(root);
// data vector contains header and actual compressed data
std::vector<std::string> data;
haz.encodeData(inDoc, freqMap, encodedMap, data);
/* Construct the path of header file
* header_file_path = dest_dir_path + "h@"+ source_filename +
* source_file_extension + huffman_extension
*/
std::string header_file_path = hfile::getParentDicrectory(dest_file_path) + "h@" +
hfile::getFileName(source_file_path) +
hfile::getSourceFileExtension(source_file_path) +
extension;
/* Write to header file */
outHeaderFile.open(header_file_path, std::ios::binary);
outHeaderFile << data[HEADER];
/* Construct the path of actual compressed data file
* similar to header_file_path, except for "d@" + filename instead of "h@"
*/
std::string data_file_path = hfile::getParentDicrectory(dest_file_path) + "d@" +
hfile::getFileName(source_file_path) +
hfile::getSourceFileExtension(source_file_path) +
extension;
/* Write to actual compressed file */
outDataFile.open(data_file_path, std::ios::binary);
outDataFile << data[ACTUAL_COMPRESSED_DATA];
}
void
decompress(const std::ifstream& inFile, std::ofstream& outFile,
const std::string& source_file_path,
const std::string& dest_file_path)
{
/* Read whole string in inFile
* and store to inDoc
*/
std::stringstream buf;
buf << inFile.rdbuf();
std::string inDoc = buf.str();
/* Read more on Huffman coding decompression for more info */
HZip haz;
frmap_t freqTable = haz.headerProcessing(inDoc);
NODE* root = haz.buildEncodingTree(freqTable);
std::string decompressedDoc;
haz.decodeData(inDoc, root, decompressedDoc);
// Free allocated root
delete root;
/* Construct the path of decompressed file
* decompressed_file_path = dest_dir_path + <filename> + source_originalfile_extension
*/
std::string decompressed_file_path = hfile::getParentDicrectory(dest_file_path) +
hfile::getFileName(source_file_path) +
hfile::getOriginalFileExtension(source_file_path);
/* Write to outFile */
outFile.open(decompressed_file_path, std::ios::binary);
outFile << decompressedDoc;
}
void
joinFile(const std::ifstream& inFile_1, const std::ifstream& inFile_2,
std::ofstream& outFile,
const std::string& source_file_path,
const std::string& dest_file_path)
{
/* Read whole inFile_1 and store to string inDoc_1 */
std::stringstream buf1;
buf1 << inFile_1.rdbuf();
std::string inDoc_1 = buf1.str();
/* Read whole inFile_2 and store to string inDoc_2 */
std::stringstream buf2;
buf2 << inFile_2.rdbuf();
std::string inDoc_2 = buf2.str();
// Join 2 strings
std::string outDoc = inDoc_1 + inDoc_2;
// Get raw file name (still contains "h@" and "d@")
std::string source_file_name = hfile::getFileName(source_file_path);
/* Remove "h@" or "d@" for getting actual file name */
std::size_t start = source_file_name.find_first_of("@") + 1;
std::size_t end = source_file_name.size();
std::string joined_file_name = source_file_name.substr(start, end - start);
/* joined_file_path = dest_dir + source_file_name +
* original_source_file_extension + srar_extension
*/
std::string joined_file_path = hfile::getParentDicrectory(dest_file_path) +
joined_file_name +
hfile::getOriginalFileExtension(source_file_path) +
hfile::getSrarExtension(source_file_path);
/* Write joined string to dest_file_path */
outFile.open(joined_file_path, std::ios::binary);
outFile << outDoc;
}