5
\$\begingroup\$

I am writing a program that compresses and decompresses data using the Huffman Coding algorithm.

About compression: this program will export 2 files: header file (contains characters' frequency for re-constructing the Huffman Code Tree for decompressing data) and actual compressed data.

About decompression: After joining both files above, the program will decompress the data and write to file.

I am a beginner in C++ and would love to hear some feedback about my code style as well as my code design.

Thank you.

hzip.hpp (class for building Huffman Code Tree)

/*
* Author: Harry Nguyen
* Date: 23/05/2018
*/ 

#ifndef _HZIP_HPP
#define _HZIP_HPP

#include <string>
#include <vector>
#include <map>
#include <queue>

// Data type for a Huffman char
typedef unsigned short hchar_t; 

// Data type for frequency 
typedef unsigned int hfreq_t;

// Data type for frequency table
typedef std::map<hchar_t, hfreq_t> frmap_t; 

// Data type for huffman code map
typedef std::map<hchar_t, std::string> hmap_t; 

// Data type for a bit
typedef char bit_t;

const hchar_t PSEUDO_EOF = 256; // A virtual end of file 
const hchar_t NOT_A_CHAR = 257; // Write to internal nodes
const bit_t MAX_NUMBER_OF_BITS = 8; 

/* Convert string of bits ( "0" or "1") 
*  to real byte 
*  Return converted bytes in the form of strings
*/
std::string stobyte(std::string& sBits);

/*
* A Huffman node contains a character, its frequency
* and 2 children node
*/
struct NODE
{
    hchar_t _character;
    hfreq_t _freq;

    NODE* _lChild;
    NODE* _rChild;

    NODE(hchar_t character, hfreq_t freq):
        _character(character), _freq(freq),
        _lChild(nullptr), _rChild(nullptr) {}

    ~NODE() {
        delete _lChild;
        delete _rChild;
    }
};

// ADT for min heap implementation
struct Comparator
{
    bool operator()(NODE* lChild, NODE* rChild) 
    {
        return lChild->_freq > rChild->_freq;
    }
};

class HZip
{
public:
    HZip();

    /*
    * Build a frequency table from input string
    * and return a frequency map.
    */
    frmap_t buildFrequencyTable(const std::string& input);

    /* 
    * Build a Huffman tree from frequency table
    * and return the root.
    */
    NODE* buildEncodingTree(frmap_t freqTable);

    /*
    * Get encoded map from the root of encoded tree
    * and return encoded map.
    */
    hmap_t getEncodedMap(NODE* encodingTree);

    /*
    * Read characters from input string 
    * and write bytes to output.
    * -------------------------------------
    * Used for compression
    */
    void encodeData(std::string& input, const frmap_t freqMap, 
                    hmap_t encodedMap, std::vector<std::string>& output);

    /*
    * Construct frequency table from the header 
    * of the input string.
    * ------------------------------------------
    * Used for decompression
    */
    frmap_t headerProcessing(std::string& input);

    /*
    * Read bits by bits from the input string
    * and write character to the output.
    * ----------------------------------------
    * Used for decompression
    */
    void decodeData(std::string& input, NODE* encodingTree, std::string& output);

protected:
    /*
    * Build Huffman encoding map.
    * ----------------------------
    * Recursive approach.
    */
    void buildEncodingMap(NODE* encodingTree, std::string sCode);

protected:
    hmap_t p_encodedMap;

private:
    std::priority_queue<NODE*, std::vector<NODE*>, Comparator> m_minHeap;
};

#endif

hzip.cpp

#include "hzip.hpp"

/* Convert string of bits to bytes
*/
std::string stobyte(std::string& sBits)
{
    std::string sBytes;

    std::size_t numberOfBits = 0;
    bit_t bit = 0;

    /* Iterate through string of bits 
    * If there are not enough 8 bits (to construct 1 byte) 
    * when reaching the end of string bits then
    * 0 will be filled. That's the purpose of "+ numberOfBits"
    */
    for(std::size_t i = 0; i < sBits.size() + numberOfBits; ++i) {

        // Convert character in form of binary to real bits.
        (sBits[i] == '0') ? (bit = 0) : (bit = 1);

        // Initialise byteChar once
        static char byteChar = 0;

        // then get 1 bit
        byteChar = (byteChar << 1) | bit;

        ++numberOfBits;

        // If reaching 8 bits (1 byte)
        if (numberOfBits == MAX_NUMBER_OF_BITS) {
            // then add that byte to the string
            sBytes += byteChar;

            // and reset byteChar and numberOfBits for the next iteration
            byteChar = 0;
            numberOfBits = 0;
        }
    }

    return sBytes;
}


HZip::HZip()
{}

frmap_t
HZip::buildFrequencyTable(const std::string& input)
{
    frmap_t freqTable;

    for (std::size_t i = 0; i < input.size(); ++i)
        ++freqTable[hchar_t(input[i])]; // cast input[i] from char to hchar_t 

    // push a PSEDO_EOF to frequency table with freq = 1
    freqTable[PSEUDO_EOF] = 1;

    return freqTable;
}

NODE* 
HZip::buildEncodingTree(frmap_t freqTable)
{
    // push each char and its frequency to min heap
    for (auto it = freqTable.begin(); it != freqTable.end(); ++it)
        m_minHeap.push(new NODE(it->first, it->second));

    // push until there is just 1 node left (root)
    while (m_minHeap.size() > 1) {
        // get least frequency node
        NODE* lChild = m_minHeap.top();
        m_minHeap.pop();

        // get next least frequency node
        NODE* rChild = m_minHeap.top();
        m_minHeap.pop();

        /*
        * Make a parent node with
        * key is a char or a NOT_A_CHAR which indicates an internal node
        * value is the sum of 2 least frequency
        */
        NODE* parent = new NODE(NOT_A_CHAR, lChild->_freq + rChild->_freq);

        // Link to its children
        parent->_lChild = lChild;
        parent->_rChild = rChild;

        m_minHeap.push(parent);
    }

    // Top of min heap is the root of built tree
    return m_minHeap.top();
}

/*
* Traverse binary tree recursively
* and push char and its huffman code to the map
*/
void 
HZip::buildEncodingMap(NODE* encodingTree, std::string sCode)
{
    if (!encodingTree) 
        return ;

    // If leaf node
    if (encodingTree->_character != NOT_A_CHAR) 
        p_encodedMap.insert(std::pair<hchar_t, std::string> (encodingTree->_character, sCode));

    HZip::buildEncodingMap(encodingTree->_lChild, sCode + "0");
    HZip::buildEncodingMap(encodingTree->_rChild, sCode + "1");
}

/*
* Get Huffman coding Map
*/
hmap_t
HZip::getEncodedMap(NODE* encodingTree)
{
    HZip::buildEncodingMap(encodingTree, "");
    delete encodingTree;
    return p_encodedMap;
}

void
HZip::encodeData(std::string& input, const frmap_t freqMap, 
                 hmap_t encodedMap, std::vector<std::string>& output)
{
    /*
    * Construct the header 
    * Format: {,<hchart>$<hfreq>...}
    */
    std::string header;
    header = "{";
    for (auto it = freqMap.begin(); it != freqMap.end(); ++it) 
        header += "," + std::to_string(it->first) + "$" + std::to_string(it->second);
    header += "}";

    /*
    * Reconstruct the document with string of '0' and '1'
    */
    std::string sBits;
    for (std::size_t i = 0; i < input.size(); ++i) {
        sBits += encodedMap[hchar_t(input[i])];
    }
    // Put a PSEUDO_EOF at the end of the input string
    sBits += encodedMap[PSEUDO_EOF];

    // Vector output contains 2 parts: the header and actual compressed data
    output.push_back(header);
    output.push_back(stobyte(sBits));
}

frmap_t
HZip::headerProcessing(std::string& input)
{
    frmap_t freqTable;

    /* Get header */
    std::size_t endOfHeader = input.find_first_of("}");
    std::string header = input.substr(0, endOfHeader + 1);

    /* Positions for constructing key/value from header */
    std::size_t start = header.find_first_of("{,") + 2;
    std::size_t end = start;

    while (start < header.size()) {
        end = header.find_first_of("$", start);
        // get character from header
        std::string character = header.substr(start, end - start); 
        hchar_t hCharacter = std::stoi(character); // convert string to integer
        start = end + 1;


        end = header.find_first_of(",}", start);
        // get frequency from header
        std::string freq = header.substr(start, end - start);
        hfreq_t hFreq = std::stoi(freq); // convert char(freq) to integer
        start = end + 1;

        // Reconstruct freqTable
        freqTable.insert(std::pair<hchar_t, hfreq_t> (hCharacter, hFreq));
    }

    return freqTable;
}

void 
HZip::decodeData(std::string& input, NODE* rootOfEncodedTree, std::string& output)
{
    // initialise current node 
    NODE* curNode = rootOfEncodedTree;

    // flag for break 2 loops
    bool flagBreak = false;

    /* Get position right after the header */
    std::size_t endOfHeader = input.find_first_of("}");
    std::size_t pos = endOfHeader + 1;

    while (pos < input.size()) {
        /* Read 1 bit at a time from input */
        for (bit_t j = MAX_NUMBER_OF_BITS - 1; j >= 0; --j) {
            // Get bit
            bit_t bit = (input[pos] >> j) & 0x01;

            /* If bit = 0 then go to the left child
            * else go to the right child
            */
            (bit == 0) ? 
                (curNode = curNode->_lChild) : (curNode = curNode->_rChild);

            // Reach the end of input
            if (curNode->_character == PSEUDO_EOF) {
                // turn on flagBreak for breaking the while loop
                flagBreak = true;

                // break the for loop
                break;
            }

            // If reaching leaf node
            else if (curNode->_character != NOT_A_CHAR) {
                // print node's character and add to the output
                output += curNode->_character;

                // Scan from root again
                curNode = rootOfEncodedTree;
            }
        }

        // Used for break the outer loop (while loop)
        if (flagBreak)
            break;

        // next position of input string
        ++pos;
    }
}

hfile.hpp (Header file for file and string processing)

#ifndef _HFILE_HPP_
#define _HFILE_HPP_

#include "hzip.hpp"

#include <fstream>
#include <sstream>
#include <string>

const unsigned char HEADER = 0;
const unsigned char ACTUAL_COMPRESSED_DATA = 1;

/* Namespace for file extensions */
namespace h_extension {
    const std::string hzip = ".hzip";
    const std::string har = ".har";
}

/*
*  Namespace for path manipulation
*  ----------------------------------------
*  Can be used for constructing the path of 
*  compressed and decompressed file
*/
namespace hfile {
    std::string getFileName(const std::string& path_to_file);
    std::string getParentDicrectory(const std::string& path_to_file);

    /* Get the extension of file need to be compressed
    *  Example: raw file: test.txt -> .txt
    *  --------------------------------------------------
    *  Used for constructing the path to compressed file
    */
    std::string getSourceFileExtension(const std::string& path_to_file);

    /* Get the extension of original file 
    *  Example: compressed file: test.txt.har -> txt
    *  ---------------------------------------------------
    * Used for constructing the path to decompressed file
    */ 
    std::string getOriginalFileExtension(const std::string& path_to_file);

    std::string getSrarExtension(const std::string& path_to_file);
}

/* Compress inFile and split into 2 files: 
*          the header and actual compressed file.
*  ---------------------------------------------------
*  dest_file_path is the path to the directory you want to write 
*  your compressed file to, no need to include file name.
*/
void compress(const std::ifstream& inFile, 
              std::ofstream& outHeaderFile, std::ofstream& outDataFile, 
              const std::string& source_file_path, 
              const std::string& dest_file_path,  
              const std::string& extension);


/* Join 2 files inFile_1 and inFile_2 and write to outFile.
*  -----------------------------------------------------------------------
*  dest_file_path must include file name you want to create after joining
*/
void joinFile(const std::ifstream& inFile_1, const std::ifstream& inFile_2, 
              std::ofstream& outFile, 
              const std::string& source_file_path,
              const std::string& dest_file_path);


/* Decompress inFile and write to outFile 
*  -------------------------------------------------------------------
*  dest_file_path is the path to the directory you want to write 
*  your compressed file to, no need to include file name.
*/
void decompress(const std::ifstream& inFile, std::ofstream& outFile, 
                const std::string& source_file_path, 
                const std::string& dest_file_path);

#endif

hfile.cpp

#include "hfile.hpp"

std::string 
hfile::getFileName(const std::string& path_to_file)
{
    std::size_t start = 0;

    /* In case there is no parent directory at all */
    if (path_to_file.find_last_of("/") != std::string::npos) 
        start = path_to_file.find_last_of("/") + 1;
    else 
        start = 0;

    std::size_t end = path_to_file.find_first_of(".", start);

    std::string file_name = path_to_file.substr(start, end - start);

    return file_name;
}

std::string 
hfile::getParentDicrectory(const std::string& path_to_file) 
{
    std::size_t end = 0;

    /* In case there is no parent directory at all */
    if(path_to_file.find_last_of("/") != std::string::npos)
        end = path_to_file.find_last_of("/") + 1;
    else 
        end = 0;

    std::string parent_directory = path_to_file.substr(0, end);

    return parent_directory;
}

std::string
hfile::getSourceFileExtension(const std::string& path_to_file) 
{
    std::size_t start = path_to_file.find_last_of(".");
    std::size_t end = path_to_file.size();

    std::string source_file_extension = path_to_file.substr(start, end - start);

    return source_file_extension;
}

std::string 
hfile::getOriginalFileExtension(const std::string& path_to_file) 
{
    std::size_t endOfPath = 0;

    /* In case there is no parent directory at all */
    if (path_to_file.find_last_of("/") != std::string::npos) 
        endOfPath = path_to_file.find_last_of("/");
    else
        endOfPath = 0;

    // find the distance between 2 "." in the path
    // example: test.txt.har -> 4 (.txt)
    std::size_t start = path_to_file.find_first_of(".", endOfPath);

    // get next position of "." 
    std::size_t end = path_to_file.find_first_of(".", start + 1);

    std::string original_file_extension = path_to_file.substr(start, end - start);
    return original_file_extension;
}

std::string 
hfile::getSrarExtension(const std::string& path_to_file)
{
    std::size_t start = path_to_file.find_last_of(".");
    std::size_t end = path_to_file.size();

    std::string srarExtension = path_to_file.substr(start, end - start);

    return srarExtension;
}

void compress(const std::ifstream& inFile, 
              std::ofstream& outHeaderFile, std::ofstream& outDataFile, 
              const std::string& source_file_path, 
              const std::string& dest_file_path, 
              const std::string& extension) 
{
    /* Read whole string in inFile 
    *  and store to inDoc
    */
    std::stringstream buf;
    buf << inFile.rdbuf();
    std::string inDoc = buf.str();

    /* Read more on Huffman coding for more info */
    HZip haz;
    frmap_t freqMap = haz.buildFrequencyTable(inDoc);
    NODE* root = haz.buildEncodingTree(freqMap);
    hmap_t encodedMap = haz.getEncodedMap(root);

    // data vector contains header and actual compressed data
    std::vector<std::string> data;

    haz.encodeData(inDoc, freqMap, encodedMap, data);

    /* Construct the path of header file
    *  header_file_path = dest_dir_path + "h@"+ source_filename + 
    *                     source_file_extension + huffman_extension
    */
    std::string header_file_path = hfile::getParentDicrectory(dest_file_path) + "h@" +
                                   hfile::getFileName(source_file_path) + 
                                   hfile::getSourceFileExtension(source_file_path) +
                                   extension;

    /* Write to header file */
    outHeaderFile.open(header_file_path, std::ios::binary);
    outHeaderFile << data[HEADER];

    /* Construct the path of actual compressed data file
    *  similar to header_file_path, except for "d@" + filename instead of "h@"
    */
    std::string data_file_path = hfile::getParentDicrectory(dest_file_path) + "d@" +
                                 hfile::getFileName(source_file_path) + 
                                 hfile::getSourceFileExtension(source_file_path) +
                                 extension;

    /* Write to actual compressed file */
    outDataFile.open(data_file_path, std::ios::binary);
    outDataFile << data[ACTUAL_COMPRESSED_DATA];
}

void 
decompress(const std::ifstream& inFile, std::ofstream& outFile, 
           const std::string& source_file_path,
           const std::string& dest_file_path)
{
    /* Read whole string in inFile 
    *  and store to inDoc
    */
    std::stringstream buf;
    buf << inFile.rdbuf();
    std::string inDoc = buf.str();

    /* Read more on Huffman coding decompression for more info */
    HZip haz;
    frmap_t freqTable = haz.headerProcessing(inDoc);
    NODE* root = haz.buildEncodingTree(freqTable);

    std::string decompressedDoc;
    haz.decodeData(inDoc, root, decompressedDoc);

    // Free allocated root
    delete root;

    /* Construct the path of decompressed file
    *  decompressed_file_path = dest_dir_path + <filename> + source_originalfile_extension
    */
    std::string decompressed_file_path = hfile::getParentDicrectory(dest_file_path) + 
                                         hfile::getFileName(source_file_path) + 
                                         hfile::getOriginalFileExtension(source_file_path);

    /* Write to outFile */
    outFile.open(decompressed_file_path, std::ios::binary);
    outFile << decompressedDoc;
}

void 
joinFile(const std::ifstream& inFile_1, const std::ifstream& inFile_2, 
         std::ofstream& outFile, 
         const std::string& source_file_path,
         const std::string& dest_file_path)
{
    /* Read whole inFile_1 and store to string inDoc_1 */
    std::stringstream buf1;
    buf1 << inFile_1.rdbuf();
    std::string inDoc_1 = buf1.str();

    /* Read whole inFile_2 and store to string inDoc_2 */
    std::stringstream buf2;
    buf2 << inFile_2.rdbuf();
    std::string inDoc_2 = buf2.str();

    // Join 2 strings
    std::string outDoc = inDoc_1 + inDoc_2;

    // Get raw file name (still contains "h@" and "d@")
    std::string source_file_name = hfile::getFileName(source_file_path);

    /* Remove "h@" or "d@" for getting actual file name */
    std::size_t start = source_file_name.find_first_of("@") + 1;
    std::size_t end = source_file_name.size();
    std::string joined_file_name = source_file_name.substr(start, end - start);

    /* joined_file_path = dest_dir + source_file_name + 
    *                     original_source_file_extension + srar_extension 
    */
    std::string joined_file_path = hfile::getParentDicrectory(dest_file_path) + 
                                   joined_file_name + 
                                   hfile::getOriginalFileExtension(source_file_path) +
                                   hfile::getSrarExtension(source_file_path);

    /* Write joined string to dest_file_path */
    outFile.open(joined_file_path, std::ios::binary);
    outFile << outDoc;
}
\$\endgroup\$
3
  • \$\begingroup\$ It's great that you're learning about compression, but this is not encryption. It's very important that you don't mix them up. \$\endgroup\$ Commented May 27, 2018 at 8:43
  • \$\begingroup\$ @Reinderien Thank you! Because I divide the compressed files into 2 files, and I guess that the compressed file could not be decompressed if there is no header. So I thought it was kind of encryption. \$\endgroup\$ Commented May 27, 2018 at 8:52
  • \$\begingroup\$ It's better than transmitting something plaintext, but only slightly. This would still be vulnerable to attack. If you want to understand why, I suggest that you open that as a separate question on the SE security site. \$\endgroup\$ Commented May 27, 2018 at 9:01

2 Answers 2

3
\$\begingroup\$

A few style notes:

        (sBits[i] == '0') ? (bit = 0) : (bit = 1);

The above is better as the following.

        bit = sBits[i] == '0' ? 0 : 1;

In several places in your code you assign to a variable and then immediately return that variable. It's a bit extraneous.

std::string 
hfile::getSrarExtension(const std::string& path_to_file)
{
    std::size_t start = path_to_file.find_last_of(".");
    std::size_t end = path_to_file.size();

    std::string srarExtension = path_to_file.substr(start, end - start);

    return srarExtension;
}

Could be:

std::string 
hfile::getSrarExtension(const std::string& path_to_file)
{
    std::size_t start = path_to_file.find_last_of(".");
    std::size_t end = path_to_file.size();

    return path_to_file.substr(start, end - start);
}

Your typedefs are better expressing using using.

// Data type for a Huffman char
typedef unsigned short hchar_t; 

// Data type for frequency 
typedef unsigned int hfreq_t;

// Data type for frequency table
typedef std::map<hchar_t, hfreq_t> frmap_t; 

// Data type for huffman code map
typedef std::map<hchar_t, std::string> hmap_t; 

// Data type for a bit
typedef char bit_t;

Becomes:

// Data type for a Huffman char
using hchar_t = unsigned short; 

// Data type for frequency 
using hfreq_t = hfreq_t;

// Data type for frequency table
using frmap_t = std::map<hchar_t, hfreq_t>; 

// Data type for huffman code map
using hmap_t = std::map<hchar_t, std::string>; 

// Data type for a bit
using bit_t = char;
\$\endgroup\$
3
\$\begingroup\$

I’m going to do the review in logical order from the top down, rather than in source order. So the highest-level stuff will be first, then we’ll drill down into the lower-level stuff.

First a couple of general observations:

  • There is quite a bit of stuff in your code that is unnecessary, because it is already provided by the standard library. In particular, the path manipulation stuff is entirely superfluous (and dangerous!).
  • There is a lack of encapsulation; basically everything is hangin’ out in the open. You need more types, to wrap more stuff, and keep more internal details as internal details. Mostly this is an aesthetic issue, but it can lead to headaches in more complex programs. And it can also lead to very real dangers in this particular case because….
  • You don’t use smart pointers. This is not acceptable in modern C++.
  • For bit sequences, you use std::string, full of 0 and 1. That is… suboptimal, to say the least. It is also error-prone (what if there is a 2 in the string?). If you want a dynamic bitset, use a dynamic bitset type. The standard library has std::vector<bool>, but there are better options out there, like in Boost. But you may not need a fully dynamic bitset, because the size of your bitsets are limited.

Okay, let’s start with the top-level functions.

Top-level functions

So your library interface is basically:

void compress(in_stream, header_out_stream, data_out_stream, source_file_path, dest_file_path, extension);
void joinFile(in1_stream, in2_stream, out_stream, source_file_path, dest_file_path);
void decompress(in_stream, out_stream, source_file_path, dest_file_path);

This seems a bit… overcomplicated.

If all I want to do is compress a file, why can’t I just do:

compress(in_path, out_dir, filename);

So I could do:

compress("data/stuff.txt", "path/to/out", "stuff.compressed");

… and that would compress data/stuff.txt to path/to/out/[email protected] and path/to/out/[email protected].

Why should I also have to create three streams, and separately decide on a file extension?

It’s a similar idea for the join function. Why can’t I just do:

join("path/to/[email protected]", "path/to/[email protected]", "output/path/optional-file-name");

Or even:

join("path/to/compressed/data", "stuff.data", "output/path/optional-file-name");

… and it will look for path/to/compressed/data/[email protected] and path/to/compressed/data/[email protected] automatically, and raise an error if one (or both) is missing.

Let’s go back to the compress() function. What is the minimal interface needed? Well, it’s probably what I showed above: just the input filename, the output directory, and the output filename without the prefixes. Okay, but what if you want more control, for whatever reason? For example, what if you want to put the header and the data output files in different directories?

For that, all you’d need is a second function that just takes three streams. The one that takes paths could even call the stream-based function internally:

auto compresss(std::istream& src, std::ostream& header, std::ostream& data) -> void;

auto compresss(std::filesystem::path const& src, std::filesystem::path const& outdir, std::string_view filename) -> void
{
    auto const header_path = /* generate header path */;
    auto const data_path   = /* generate data path */;

    compress(
        std::ifstream{src, std::ios_base::binary},
        std::ofstream{header_path, std::ios_base::binary},
        std::ofstream{data_path, std::ios_base::binary}
    );
}

Now, if the user wants, they can do stuff like:

compress(
    open_input_stream_from_url("https://whatev.com/data.txt",),
    std::ofstream{"private/save-header-locally.txt", std::ios_base::binary},
    open_stream_to_send_via_email("[email protected]")
);

Or just about anything else you can imagine.

In particular, manually specifying that the streams must be file streams is unnecessarily over-restrictive. Forcing the user to provide those streams in addition to file names is too burdensome. Either-or is fine: they can provide just the filenames, or just the streams. Technically, even the filenames is unnecessary, but you want a default where d@ and h@ prefixes are used, so fine.

Another alternative is to drop the overload with paths entirely and just have the streams, then have a set of functions that compute the paths. For example;

struct compression_paths
{
    std::filesystem::path src;
    std::filesystem::path header;
    std::filesystem::path data;
};

// The simplest case!
auto compute_compression_paths(std::filesystem::path src) -> compression_paths
{
    auto paths = compression_paths{};
    paths.src = std::move(src);

    auto const dir = paths.src.parent_path();
    auto const base = paths.src.filename().string();

    paths.header = dir / ("h@" + base);
    paths.data   = dir / ("d@" + base);

    return paths;
}

auto compute_compression_paths(std::filesystem::path src, std::filesystem::path const& dest) -> compression_paths
{
    auto paths = compression_paths{};
    paths.src = std::move(src);

    if (is_directory(paths.src))
    {
        // The destination is a directory, so compute the filenames
        // using the source filename.

        auto const base = paths.src.filename().string();

        paths.header = dest / ("h@" + base);
        paths.data   = dest / ("d@" + base);
    }
    else
    {
        // The destination is not a directory (it's a file or
        // something else or non-existent), so compute the filename
        // using the last part of the destination path.

        auto const dir = dest.parent_path();
        auto const base = dest.filename().string();

        paths.header = dir / ("h@" + base);
        paths.data   = dir / ("d@" + base);
    }

    return paths;
}

// And similarly to compute the *de*compression path:
auto compute_compression_paths(std::filesystem::path src) -> compression_paths
{
    // "src" could by the header or the data or neither; we'll check.
    auto filename = src.filename().string();

    auto const is_header = filename.starts_with("h@");
    if (not is_header and not filename.starts_with("d@"))
        throw cant_figure_out_filename{};
    filename.erase(0, 2); // erase the "?@";

    auto temp = src;
    return compression_paths{
        .src = src.parent_path() / filename,
        .header = is_header ? std::move(src) : temp.replace_filename("h@" + filename),
        .data = not is_header ? std::move(src) : temp.replace_filename("d@" + filename)
    }
}

// ... etc.

// Usage:
auto const [src, header, data] = compute_compression_paths("data/src.dat");
compress(src, header, data);
// produces: data/[email protected], data/[email protected]

Now let’s dive into the actual code:

void compress(const std::ifstream& inFile, 
              std::ofstream& outHeaderFile, std::ofstream& outDataFile, 
              const std::string& source_file_path, 
              const std::string& dest_file_path, 
              const std::string& extension) 
{
    /* Read whole string in inFile 
    *  and store to inDoc
    */
    std::stringstream buf;
    buf << inFile.rdbuf();
    std::string inDoc = buf.str();

This is not a great way to read a whole file into a string, because it stores a copy of the entire file within buf… and then copies it so you have another copy of the entire file within inDoc. Remember, buf is not destroyed until the end of the function.

There are two ways to improve this.

First, you could move the string out of buf into inDoc, like so:

    auto const inDoc = std::move(buf).str();

The other, and probably better, option is to stop using strings everywhere and start using string views:

    auto const inDoc = buf.view();

But an even better option is to have a dedicated function for reading an entire file into a string (or, better, a vector<byte>, because it is binary data, not a string), because this is not a trivial task. I’ve written articles on how to do this, because it’s so complex. For now, it is enough just to create a dedicated read_file() function that takes a path or stream (or overloads with both), and then, at your leisure, you can optimize that (very useful, and very reusable) operation.

    /* Read more on Huffman coding for more info */
    HZip haz;
    frmap_t freqMap = haz.buildFrequencyTable(inDoc);
    NODE* root = haz.buildEncodingTree(freqMap);
    hmap_t encodedMap = haz.getEncodedMap(root);

    // data vector contains header and actual compressed data
    std::vector<std::string> data;

    haz.encodeData(inDoc, freqMap, encodedMap, data);

There is a lot in here that’s just… not good.

For starters, all that verbiage seems excessive. All you want is the encoding map, and the coded data. All you need is the raw data. So all you should have to write is:

    auto const [encodedMap, encodesData] = HZip::encode(inDoc);

That’s five error-prone lines compressed to one (almost) idiot-proof line. (It’s only almost idiot-proof because you could mix up the order of the outputs. Even that can be avoided by simply capturing the output object and calling .encodedMap and .data on it as needed.)

All that monkeying around to build the encoding tree should be internal to the HZip class. There is no need to subject outside users to that.

Indeed, HZip could just have a constructor that takes a std::span<std::byte>, and does all the work, then you just do this:

    auto const haz = HZip{inDoc};

    // If you actually need them:
    auto&& encodedMap = haz.encodedMap();
    auto&& encodedData = haz.encodedData();

Another sin in the code above is the use of “out” parameters. That is generally an anti-pattern. The arguments to a function should be inputs, and the outputs should be the return values. Thanks to move semantics, returning even expensive constructs like vectors of strings is pretty much free. So two-step code like this:

    std::vector<std::string> data;

    haz.encodeData(inDoc, freqMap, encodedMap, data);

… becomes:

    auto const data = haz.encodeData(inDoc, freqMap, encodedMap);

… which has multiple benefits:

  1. it’s shorter
  2. there is no way data can be accidentally uninitialized; and
  3. data can be const.

But the worst sin in this code is the use of owning raw pointers. If I were grading code, and I saw an owning raw pointer escape from an interface, that’s an immediate “F”; not even bothering to look any further, hard fail.

I mean, just look at the mess you have going on here. In decompress(), you call NODE* root = haz.buildEncodingTree(freqTable);, then later delete root;. Okay… but in compress() you call NODE* root = haz.buildEncodingTree(freqMap);, but then never call delete. Is that an error? It turns out: probably not… because in compress() you call haz.getEncodedMap(root), which, despite absolutely no indication in its interface, internally calls delete on the pointer. (And why does .getEncodedMap() delete the pointer you give it? That just makes no sense. It’s completely capricious. If I call database.get_fields(row), I would be horrified and furious if the function silently deleted the row out from under me.)

We’ll get back to all this stuff when we look at the interface of HZip. For now, let’s move on.

    /* Construct the path of header file
    *  header_file_path = dest_dir_path + "h@"+ source_filename + 
    *                     source_file_extension + huffman_extension
    */
    std::string header_file_path = hfile::getParentDicrectory(dest_file_path) + "h@" +
                                   hfile::getFileName(source_file_path) + 
                                   hfile::getSourceFileExtension(source_file_path) +
                                   extension;

    /* Write to header file */
    outHeaderFile.open(header_file_path, std::ios::binary);
    outHeaderFile << data[HEADER];

Now, I really don’t understand the point of forcing the user to create the file stream and pass it to the function. What is the thinking there? Do you want the user to have the file stream after the operation, for whatever reason? Then why not just return it?

Requiring the user to provide the file stream opens the door to a wholly unnecessary failure condition: what if the user creates the file stream, opens a file, and then passes it to compress()? Do you know what happens when you try to re-open an open file?

That problem vanishes, and the interface gets simpler, when you just don’t require the streams be passed as input.

An important note: all that path manipulation you are doing there is a really bad idea. Paths are confoundingly non-portable. And it can be EXTREMELY dangerous to try to “guess” what a “good” path might look like. If a user gives you a path and you “tweak” it some way, it may be harmless… or you may just wipe out some critical part of their operating system. If the user themselves gave you a path to be written to that, used unmodified and as given, damages their system, that’s on them… but if you fuck with the path and it destroys their system, that’s definitely on you. You should avoid fucking with paths completely if possible. You should generally never modify or create a path on your own, always take paths directly from the user and treat them as sacrosanct; use them exactly as given, without modification.

And especially, especially, especially, NEVER treat paths as raw strings. There is a lot that can go wrong; anything from encoding issues to god-only-knows-what. (I don’t do Windows, but I’ve heard of errors arising from the fact that Windows paths, by default, are not Unicode. Trying to use a UTF-8 encoded string as a path could lead to grief.) This:

    std::string header_file_path = hfile::getParentDicrectory(dest_file_path) + "h@" +
                                   hfile::getFileName(source_file_path) + 
                                   hfile::getSourceFileExtension(source_file_path) +
                                   extension;

… is a definite no-no.

Let’s just consider a piece of all that:

auto path = hfile::getParentDicrectory(input) + "h@";

Let’s see what happens with that.

As you can see, there are some unexpected results. In standard (POSIX) path notation foo and foo/ are both the same directory (assuming foo is a directory). But your code behaves differently between them. That could potentially be dangerous

And that’s even making the assumption that / is the path separator. Hello, Windows!

Don’t. Fuck. With. Paths.

Use the proper type for paths, std::filesystem::path, use only the functions provided by that type, and do only the absolute, bare minimum manipulation required—which must be clearly communicated to the user.

As a bonus, you can throw out all your path manipulation (which is wrong and dangerous, as I just demonstrated). No more need for:

  • hfile::getFileName()
  • getParentDicrectory() (spelling!)
  • getSourceFileExtension()

As for:

  • getOriginalFileExtension()
  • getSrarExtension()

… these can be handled by calling .stem() or .extension() multiple times (and checking the results!).

Lastly, before leaving this high-level view of things, I need to point out that you use std::string everywhere, for basically everything: paths, filenames, raw binary data. The whole point of C++, basically, is that it is strongly-typed. Using appropriate types goes a long way to not only avoiding bugs, but even to increasing efficiency.

For example, all paths should be std::filesystem::path. I think I’ve explained why clearly enough above. Pure filenames—not paths, but just the filename part—can be strings, but they must be made into a proper path before doing any filesystem operations. (But be very careful converting raw strings to paths! Strange things can happen if you are not careful!)

Raw binary data should also not be stored in strings. Strings are for character data: human-readable text. You should use std::vector<std::byte> (or std::array<std::byte, N> or std::span<std::byte, ???> as needed) for raw binary data. I realize that makes things a little less ergonomic when dealing with IOstreams… yes, we all know IOstreams are poorly designed. They are particularly horrible when dealing with binary I/O. That problem is being worked on. In the meantime, you just have to replace stuff like outHeaderFile << data[HEADER]; with outHeaderFile.write(static_cast<char const*>(data[HEADER].data()), data[HEADER].size());. For input, there are many options. The simplest (but not the most efficient) is to read everything into a string(stream), then into the proper container.

The HZip class (and friends)

std::string stobyte(std::string& sBits)

This function should really be an internal detail of HZip, so I am treating it as such. On the other hand, if you want to think of it as an external, generally useful utility function, that’s fine too.

Before anything else, this function really needs a better interface. There is no reason to require an actual string as input; you can use a string view. And there is really, really no reason for it to be taking a non-const reference.

As for the implementation:

std::string stobyte(std::string& sBits)
{
    std::string sBytes;

Okay, so before starting anything you already know how many bytes you’re going to get. It will be the number of characters in sBits divided by 8, and if there is any remainder, an extra byte. Since you already know that, you can reserve that memory in the output, to avoid repeated reallocations as the it grows:

constexpr auto stobyte(std::string_view sBits) -> std::vector<std::byte>
{
    // Number of bits in a byte. You could static assert this actually
    // matches the value in numeric_limits<unsigned char>::digits.
    constexpr auto bit_width = 8uz;

    auto const number_of_bytes = sBits / bit_width;
    auto const leftover_bits = sBits % bit_width;

    auto sBytes = std::vector<std::byte>();
    sBytes.reserve(number_of_bytes + ((leftover_bits == 0) ? 0uz : 1uz));

Instead of .reserve(), you could also use .resize(), since you know you’re going to be filling the whole reserved amount anyway. The cost is that the vector’s data will be zeroed first… but the benefit is that we can avoid any the size-changing bookkeeping later. But put a pin in all of this, because we may just make it all go away later.

    for(std::size_t i = 0; i < sBits.size() + numberOfBits; ++i) {

Writing raw loops in modern C++ is an anti-pattern. In particular, this is not a very well-written loop. The static variable hidden within is particularly problematic, because it means if anyone tries running this function concurrently… who knows what the hell results you’ll get. You also use a potentially-signed char for bit manipulation. Not wise. And this line:

        (sBits[i] == '0') ? (bit = 0) : (bit = 1);

… is just a crime against humanity.

The modern way is to use ranges. In this case, you want to take the input string, chunk it into 8-character chunks, and then transform each 8-character chunk into a byte. The last chunk might be less than 8 characters, in which case we zero-pad.

So:

auto sBytes = sBits
    | std::views::chunk(bit_width)
    | std::views::transform(chunk_to_byte)
    | std::ranges::to<std::vector>()
;

Where:

auto const chunk_to_byte = [](auto&& bits)
{
    auto byte = std::byte{};

    std::ranges::for_each(bits,
        [&byte](auto bit)
        {
            byte << 1;
            byte |= (bit == '1') ? 1uz : 0uz;
        }
    );

    // Zero pad if necessary.
    if (std::ranges::size(bits) < bit_width)
        byte << (bit_width - std::ranges::size(bits);

    return byte;
};

Put altogether:

// stobyte is still a terrible name! Why not bit_string_to_bytes?
constexpr auto stobyte(std::string_view sBits) -> std::vector<std::byte>
{
    constexpr auto bit_width = 8uz;

    auto const chunk_to_byte = [](auto&& bits)
    {
        auto byte = std::byte{};

        std::ranges::for_each(bits,
            [&byte](auto bit)
            {
                byte << 1;
                byte |= (bit == '1') ? 1uz : 0uz;
            }
        );

        // Zero pad if necessary.
        if (std::ranges::size(bits) < bit_width)
            byte << (bit_width - std::ranges::size(bits);

        return byte;
    };

    return sBits
        | std::views::chunk(bit_width)
        | std::views::transform(chunk_to_byte)
        | std::ranges::to<std::vector>()
    ;
}

Note that the memory reservation is done automatically, because the source range (sBits) knows its size, and both views::chunk and views::transform can propagate that information. So this code is not only significantly shorter… and safer… it’s also much more efficient than the original code.

But do note there is no error checking in any of the above, which may be fine for an internal detail function (because the checks should have been done beforehand), but is definitely not the case for a general utility function. Adding error checking is not hard:

constexpr auto stobyte(std::string_view sBits) -> std::vector<std::byte>
{
    if (not std::ranges::all_of(sBits, [](auto bit) { return bit == '0' or bit == '1'; }))
        throw some_error{};

    // Rest of function:
    constexpr auto bit_width = 8uz;

    return sBits
        | std::views::chunk(bit_width)
        | std::views::transform([](auto&& bits)
            {
                auto byte = std::byte{};
                std::ranges::for_each(bits, [&byte](auto bit) { byte << 1; byte |= (bit == '1') ? 1uz : 0uz; });

                if (std::ranges::size(bits) < bit_width)
                    byte << (bit_width - std::ranges::size(bits);

                return byte;
            })
        | std::ranges::to<std::vector>()
    ;
}

Next up is:

struct NODE
{
    hchar_t _character;
    hfreq_t _freq;

    NODE* _lChild;
    NODE* _rChild;

    NODE(hchar_t character, hfreq_t freq):
        _character(character), _freq(freq),
        _lChild(nullptr), _rChild(nullptr) {}

    ~NODE() {
        delete _lChild;
        delete _rChild;
    }
};

There are a lot of things going wrong here.

For starters, the name. More on that later.

The node owns two child nodes… as raw pointers. That’s another no-no.

And because of the above, you obviously need a destructor to clean up… except now you’re violating the rule of five.

And finally, this class should be a member of HZip, because it makes no sense on its own.

Let’s clean this up.

First, we put the class inside of `HZip. Then we use smart pointers. That makes the destructor pointless, and when we drop it, no we no longer violate the rule of five:

class HZip
{
    using hchar_t = unsigned short; 
    using hfreq_t = unsigned int;

    struct node_t
    {
        hchar_t _character;
        hfreq_t _freq;

        std::unique_ptr<node_t> _lChild;
        std::unique_ptr<node_t> _rChild;

        node_t(hchar_t character, hfreq_t freq):
            _character(character), _freq(freq),
            _lChild(nullptr), _rChild(nullptr) {}
    };

There is no need to initialize the two smart pointers anymore; they default-initialize correctly. So you can do this:

class HZip
{
    using hchar_t = unsigned short; 
    using hfreq_t = unsigned int;

    struct node_t
    {
        hchar_t _character;
        hfreq_t _freq;

        std::unique_ptr<node_t> _lChild;
        std::unique_ptr<node_t> _rChild;

        node_t(hchar_t character, hfreq_t freq):
            _character(character), _freq(freq)
        {}
    };

And in fact, since everything is public, there is really no need for the constructor at all. Regular aggregate initialization will work:

class HZip
{
    using hchar_t = unsigned short; 
    using hfreq_t = unsigned int;

    struct node_t
    {
        hchar_t _character;
        hfreq_t _freq;

        std::unique_ptr<node_t> _lChild;
        std::unique_ptr<node_t> _rChild;
    };

You can also use member initializers, for clarity and safety:

class HZip
{
    using hchar_t = unsigned short; 
    using hfreq_t = unsigned int;

    struct node_t
    {
        hchar_t _character = {};
        hfreq_t _freq = {}; // or maybe should default to 1?

        std::unique_ptr<node_t> _lChild = {};
        std::unique_ptr<node_t> _rChild = {};
    };

Now the comparator should not only be a member of HZip, it should be a member of the node. Also, the comparison operator should be both const, and ideally, noexcept:

class HZip
{
    using hchar_t = unsigned short; 
    using hfreq_t = unsigned int;

    struct node_t
    {
        struct comparator
        {
            constexpr auto operator<(std::unique_ptr<node_t> const& lChild, std::unique_ptr<node_t> const& rChild) const noexcept
            {
                return lChild->_freq > rChild->_freq;
            }
        };

        hchar_t _character = {};
        hfreq_t _freq = {}; // or maybe should default to 1?

        std::unique_ptr<node_t> _lChild = {};
        std::unique_ptr<node_t> _rChild = {};
    };

And now we get to the beast: HZip.

Let’s get the small problems out of the way first.

First, if the default constructor does nothing, you should not do this:

HZip::HZip()
{}

You should do this:

HZip::HZip() = default;

But if the default constructor does nothing and there are no other constructors… then why bother declaring it at all?

Next: protected. Avoid. It serves literally no purpose here, because HZip cannot be safely inherited from (it has no virtual destructor).

Now, let’s get to the meat of the problem. HZip is bloated. It does too much. It contains too much.

Let’s start with the frequency table. That does not need to be contained within HZip (or, if it is, it should be as a member type).

Why? Well, consider this. Right now, your frequency table is a std::map<unsigned short, unsigned int>. Is that the best choice? Probably not. std::map is powerfully heavyweight. And you don’t really need it. You are just mapping a fairly small number of values. What is it, 258 total? There are probably significantly better options, like std::vector<std::tuple<unsigned short, unsigned int>>, or std::flat_map<unsigned short, unsigned int> or maybe even std::array<unsigned int, 258>.

Whatever it is, it should be wrapped in a sensible interface, so the internal type can be swapped out at will.

The sensible interface just needs to look somewhat like std::map (or its siblings) though it does not need to be able to add/remove arbitrary key-values or any of that other stuff. And it needs a way to give it some data so it can compute the frequencies, and maybe adjust/reset frequencies.

With the frequency table as its own type, generating it can be as simple as:

auto const ft = frequency_table{data};

… which uses a constructor like this:

frequency_table::frequency_table(std::span<std::byte> bytes)
{
    // Let's assume the internal data type is:
    // std::array<unsigned short, 258> _table = {};

    std::ranges::for_each(bytes, [](auto b) { ++_table[to_integer<unsigned int>(b)]; });

    _table[256] = 1;
}

… which is basically your buildFrequencyTable() function, encapsulated so that you can change the frequency table internal type whenever you please.

Aside from the flexibility of being able to change the internal representation, the main reason you want the frequency table to be its own type is because it both generates and is generated from the header data, which can be separated from the actual data (compressed or uncompressed).

Next is the Huffman tree. The constructor for this could take a frequency table, and generate the tree.

Once again, the issue boils down to “what is the best internal type for a Huffman tree”. You are using (basically) a tree of separately-allocated NODE*. Is that the best choice? Maybe. Or, once again, maybe because you know the total number of elements is limited, some other type might be better. That is why you put everything in an encapsulated class, so you can tweak it later without changing the rest of the code.

And finally, the actual Huffman code map. Now this probably doesn’t need to be its own type, because it really is the heart and soul of HZip. It is probably the only data member that HZip needs.

Side note: is there really a purpose to m_minHeap being a data member of HZip?

But even for the Huffman code map, you have to ask: what if I want to change the internal representation? Right now you are using std::map<unsigned short, std::string>, which is… yikes. Once again, you have a small, fixed set of keys. And is a string of ones and zeros really the best way to represent a sequence of bits?

Or maybe there’s a different way of looking at it: maybe the tree should be the only thing in a HZip object, and the code mapping would be generated on the fly whenever encoding is done, and discarded afterwards.

What I’m getting at is that right now, HZip is a mess. It’s interface is convoluted, and it’s exposing its internal guts all over the place. It seems to me what you really want is a class that takes a frequency table and generates the Huffman tree, either encode or decode. So basically, an interface something like this:

class huffman
{
public:
    huffman(frequency_table const&); // generates tree

    auto encode(std::span<std::byte>, std::ostream&);
    auto encode(std::istream&, std::ostream&);

    auto decode(std::span<std::byte>, std::ostream&);
    auto decode(std::istream&, std::ostream&);
};

So now your compress() function would look more like:

auto compress(std::istream& src, std::ostream& header, std::ostream& data)
{
    auto const src_data = read_stream_data(src);

    auto const ft = frequency_table{src_data};

    // Write the header.
    generate_header(ft, header);

    // Write the data.
    auto huf = huffman{ft};
    huf.encode(src_data, data);
}

Note: all the internal Huffman encoding shenanigans are now completely encapsulated. compress() doesn’t need to know anything about trees or code maps or any other crap. The only reason it needs to be even this complex is because we need the frequency table data twice: once for the header, and once for the encoding. If all we were doing was encoding, it could just be:

auto compress(std::istream& src, std::ostream& data)
{
    
    auto const src_data = read_stream_data(src);

    huffman{frequency_table{src_data}}.encode(src_data, data);
}

And similarly for decompress:

auto decompress(std::istream& header, std::istream& data, std::ostream& out)
{
    auto const ft = decode_header(header);

    huffman{ft}.decode(data, out);
}

The point is that all the internal details of Huffman encoding should be just that: internal details. Don’t let them spill out of the class. Don’t force outside users to have to know they have to generate maps, tables, and trees. Even the frequency table could be an internal detail… except you specifically want it to go into a separate header file.

Header file issues

You have a lot of stuff in your header files that really shouldn’t be there, among other problems. Let’s start at the top.

#ifndef _HZIP_HPP
#define _HZIP_HPP

Or:

#ifndef _HFILE_HPP_
#define _HFILE_HPP_

These are illegal identifiers. The rules for identifiers are a little complex, so a lot of people just say “don’t start identifiers with underscores”. That’s technically too broad, but in this case, starting an identifier with an underscore followed by a capital letter is always illegal.

typedef unsigned short hchar_t; 

This is technically not illegal in standard C++, but I believe POSIX bans top-level identifiers that end in _t (or possibly just top-level type aliases).

In any case, all your stuff should be in a namespace. Don’t pollute the global namespace.

const hchar_t PSEUDO_EOF = 256; // A virtual end of file 
const hchar_t NOT_A_CHAR = 257; // Write to internal nodes
const bit_t MAX_NUMBER_OF_BITS = 8; 

And also:

const unsigned char HEADER = 0;
const unsigned char ACTUAL_COMPRESSED_DATA = 1;

Do not use SCREAMING_SNAKE_CASE. It is conventionally reserved for macros. (See also NL.9.)

This also applies to NODE.

typedef unsigned int hfreq_t;

Don’t try to avoid negative values by using unsigned.

namespace h_extension {
    const std::string hzip = ".hzip";
    const std::string har = ".har";
}

This is not a good way to make global constants. What happens is that every translation unit will get its own copy of these strings.

What you want is to use inline:

namespace h_extension {
    inline const std::string hzip = ".hzip";
    inline const std::string har = ".har";
}

But even better would be to use constexpr if you can, because constexpr is implicitly inline (and const), along with having other benefits. Unfortunately, you can”t do this with string constants for complicated reasons… but you don’t really need string constants in any case. You can just use string views:

namespace h_extension {
    constexpr std::string_view hzip = ".hzip";
    constexpr std::string_view har = ".har";
}

Finally, as mentioned previously, you have a lot of stuff loose in your headers that just doesn’t need to be there. Some of it would be better encapsulated within a class. For example, all of this:

// Data type for a Huffman char
typedef unsigned short hchar_t; 

// Data type for frequency 
typedef unsigned int hfreq_t;

// Data type for frequency table
typedef std::map<hchar_t, hfreq_t> frmap_t; 

// Data type for huffman code map
typedef std::map<hchar_t, std::string> hmap_t; 

// Data type for a bit
typedef char bit_t;

const hchar_t PSEUDO_EOF = 256; // A virtual end of file 
const hchar_t NOT_A_CHAR = 257; // Write to internal nodes
const bit_t MAX_NUMBER_OF_BITS = 8; 

/* Convert string of bits ( "0" or "1") 
*  to real byte 
*  Return converted bytes in the form of strings
*/
std::string stobyte(std::string& sBits);

/*
* A Huffman node contains a character, its frequency
* and 2 children node
*/
struct NODE
{
    hchar_t _character;
    hfreq_t _freq;

    NODE* _lChild;
    NODE* _rChild;

    NODE(hchar_t character, hfreq_t freq):
        _character(character), _freq(freq),
        _lChild(nullptr), _rChild(nullptr) {}

    ~NODE() {
        delete _lChild;
        delete _rChild;
    }
};

// ADT for min heap implementation
struct Comparator
{
    bool operator()(NODE* lChild, NODE* rChild) 
    {
        return lChild->_freq > rChild->_freq;
    }
};

… should be inside of HZip.

And these functions:

/*
*  Namespace for path manipulation
*  ----------------------------------------
*  Can be used for constructing the path of 
*  compressed and decompressed file
*/
namespace hfile {
    std::string getFileName(const std::string& path_to_file);
    std::string getParentDicrectory(const std::string& path_to_file);

    /* Get the extension of file need to be compressed
    *  Example: raw file: test.txt -> .txt
    *  --------------------------------------------------
    *  Used for constructing the path to compressed file
    */
    std::string getSourceFileExtension(const std::string& path_to_file);

    /* Get the extension of original file 
    *  Example: compressed file: test.txt.har -> txt
    *  ---------------------------------------------------
    * Used for constructing the path to decompressed file
    */ 
    std::string getOriginalFileExtension(const std::string& path_to_file);

    std::string getSrarExtension(const std::string& path_to_file);
}

… are just implementation details, and so shouldn’t be in the header at all.

\$\endgroup\$

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.