From 39a11ec88952b077be88b677f08db9dd91a83ac8 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 26 May 2026 08:58:27 +0000 Subject: [PATCH] Fix Huffman edge cases and add CMake build with tests - Handle empty and single-symbol inputs; assign code "0" for one symbol - Use unsigned byte reads, validate non-ASCII input, and fix last-byte padding trim - Add destructor, I/O error handling, and safer decode with null checks - Add CMakeLists.txt, CTest roundtrip script, and README build instructions Co-authored-by: Xinyu Fu --- .gitignore | 1 + CMakeLists.txt | 21 ++ README.md | 17 ++ huffman.cpp | 623 +++++++++++++++++++++++++++----------------- huffman.h | 55 ++-- huffmanCoding.cpp | 29 ++- huffmanDecoding.cpp | 21 +- tests/run_tests.sh | 62 +++++ 8 files changed, 548 insertions(+), 281 deletions(-) create mode 100644 CMakeLists.txt create mode 100755 tests/run_tests.sh diff --git a/.gitignore b/.gitignore index 259148f..de7f0f3 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ *.exe *.out *.app +build/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..de7da9f --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.10) +project(HuffmanCoding LANGUAGES CXX) + +if(NOT CMAKE_CXX_COMPILER) + set(CMAKE_CXX_COMPILER g++) +endif() + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +add_executable(huffmanCoding huffmanCoding.cpp huffman.cpp) +add_executable(huffmanDecoding huffmanDecoding.cpp huffman.cpp) + +enable_testing() + +set(HUFFMAN_TEST_SCRIPT ${CMAKE_SOURCE_DIR}/tests/run_tests.sh) +add_test( + NAME huffman_roundtrip + COMMAND bash ${HUFFMAN_TEST_SCRIPT} $ $ +) diff --git a/README.md b/README.md index 1d00e13..7ee4ae3 100644 --- a/README.md +++ b/README.md @@ -21,3 +21,20 @@ The programs can compress and decompress text file consisting of 128 ASCII chara 1. Open file, recreate Huffman Tree based on Huffman Encode Table; 2. Decode the file based on the tree and store it to the desired file. + +## Build and test + +```bash +cmake -S . -B build -DCMAKE_CXX_COMPILER=g++ +cmake --build build +ctest --test-dir build --output-on-failure +``` + +Run manually: + +```bash +./build/huffmanCoding input.txt output.huf +./build/huffmanDecoding output.huf restored.txt +``` + +Input is limited to 7-bit ASCII bytes (0–127). Empty files and bytes >= 128 are rejected with an error message. diff --git a/huffman.cpp b/huffman.cpp index 00548e7..4eb2044 100644 --- a/huffman.cpp +++ b/huffman.cpp @@ -1,237 +1,386 @@ -#include "huffman.h" - -void huffman::create_node_array() -{ - for (int i = 0; i < 128; i++) - { - node_array[i] = new huffman_node; - node_array[i]->id = i; - node_array[i]->freq = 0; - } -} - -void huffman::traverse(node_ptr node, string code) -{ - if (node->left == NULL && node->right == NULL) - { - node->code = code; - } - else - { - traverse(node->left, code + '0'); - traverse(node->right, code + '1'); - } -} - -int huffman::binary_to_decimal(string& in) -{ - int result = 0; - for (int i = 0; i < in.size(); i++) - result = result * 2 + in[i] - '0'; - return result; -} - -string huffman::decimal_to_binary(int in) -{ - string temp = ""; - string result = ""; - while (in) - { - temp += ('0' + in % 2); - in /= 2; - } - result.append(8 - temp.size(), '0'); //append '0' ahead to let the result become fixed length of 8 - for (int i = temp.size() - 1; i >= 0; i--) - { - result += temp[i]; - } - return result; -} - -inline void huffman::build_tree(string& path, char a_code) -{//build a new branch according to the inpue code and ignore the already existed branches - node_ptr current = root; - for (int i = 0; i < path.size(); i++) - { - if (path[i] == '0') - { - if (current->left == NULL) - current->left = new huffman_node; - current = current->left; - } - else if (path[i] == '1') - { - if (current->right == NULL) - current->right = new huffman_node; - current = current->right; - } - } - current->id = a_code; //attach id to the leaf -} - -huffman::huffman(string in, string out) -{ - in_file_name = in; - out_file_name = out; - create_node_array(); -} -void huffman::create_pq() -{ - in_file.open(in_file_name, ios::in); - in_file.get(id); - while (!in_file.eof()) - { - node_array[id]->freq++; - in_file.get(id); - } - in_file.close(); - for (int i = 0; i < 128; i++) - { - if (node_array[i]->freq) - { - pq.push(node_array[i]); - } - } -} - -void huffman::create_huffman_tree() -{ - priority_queue, compare> temp(pq); - while (temp.size() > 1) - {//create the huffman tree with highest frequecy characher being leaf from bottom to top - root = new huffman_node; - root->freq = 0; - root->left = temp.top(); - root->freq += temp.top()->freq; - temp.pop(); - root->right = temp.top(); - root->freq += temp.top()->freq; - temp.pop(); - temp.push(root); - } -} - -void huffman::calculate_huffman_codes() -{ - traverse(root, ""); -} - -void huffman::coding_save() -{ - in_file.open(in_file_name, ios::in); - out_file.open(out_file_name, ios::out | ios::binary); - string in = "", s = ""; - - in += (char)pq.size(); //the first byte saves the size of the priority queue - priority_queue, compare> temp(pq); - while (!temp.empty()) - {//get all characters and their huffman codes for output - node_ptr current = temp.top(); - in += current->id; - s.assign(127 - current->code.size(), '0'); //set the codes with a fixed 128-bit string form[000¡­¡­1 + real code] - s += '1'; //'1' indicates the start of huffman code - s.append(current->code); - in += (char)binary_to_decimal(s.substr(0, 8)); - for (int i = 0; i < 15; i++) - {//cut into 8-bit binary codes that can convert into saving char needed for binary file - s = s.substr(8); - in += (char)binary_to_decimal(s.substr(0, 8)); - } - temp.pop(); - } - s.clear(); - - in_file.get(id); - while (!in_file.eof()) - {//get the huffman code - s += node_array[id]->code; - while (s.size() > 8) - {//cut into 8-bit binary codes that can convert into saving char needed for binary file - in += (char)binary_to_decimal(s.substr(0, 8)); - s = s.substr(8); - } - in_file.get(id); - } - int count = 8 - s.size(); - if (s.size() < 8) - {//append number of 'count' '0' to the last few codes to create the last byte of text - s.append(count, '0'); - } - in += (char)binary_to_decimal(s); //save number of 'count' at last - in += (char)count; - - out_file.write(in.c_str(), in.size()); - in_file.close(); - out_file.close(); -} - -void huffman::recreate_huffman_tree() -{ - in_file.open(in_file_name, ios::in | ios::binary); - unsigned char size; //unsigned char to get number of node of humman tree - in_file.read(reinterpret_cast(&size), 1); - root = new huffman_node; - for (int i = 0; i < size; i++) - { - char a_code; - unsigned char h_code_c[16]; //16 unsigned char to obtain the binary code - in_file.read(&a_code, 1); - in_file.read(reinterpret_cast(h_code_c), 16); - string h_code_s = ""; - for (int i = 0; i < 16; i++) - {//obtain the oringinal 128-bit binary string - h_code_s += decimal_to_binary(h_code_c[i]); - } - int j = 0; - while (h_code_s[j] == '0') - {//delete the added '000¡­¡­1' to get the real huffman code - j++; - } - h_code_s = h_code_s.substr(j + 1); - build_tree(h_code_s, a_code); - } - in_file.close(); -} - -void huffman::decoding_save() -{ - in_file.open(in_file_name, ios::in | ios::binary); - out_file.open(out_file_name, ios::out); - unsigned char size; //get the size of huffman tree - in_file.read(reinterpret_cast(&size), 1); - in_file.seekg(-1, ios::end); //jump to the last one byte to get the number of '0' append to the string at last - char count0; - in_file.read(&count0, 1); - in_file.seekg(1 + 17 * size, ios::beg); //jump to the position where text starts - - vector text; - unsigned char textseg; - in_file.read(reinterpret_cast(&textseg), 1); - while (!in_file.eof()) - {//get the text byte by byte using unsigned char - text.push_back(textseg); - in_file.read(reinterpret_cast(&textseg), 1); - } - node_ptr current = root; - string path; - for (int i = 0; i < text.size() - 1; i++) - {//translate the huffman code - path = decimal_to_binary(text[i]); - if (i == text.size() - 2) - path = path.substr(0, 8 - count0); - for (int j = 0; j < path.size(); j++) - { - if (path[j] == '0') - current = current->left; - else - current = current->right; - if (current->left == NULL && current->right == NULL) - { - out_file.put(current->id); - current = root; - } - } - } - in_file.close(); - out_file.close(); -} +#include "huffman.h" + +#include + +void huffman::create_node_array() +{ + for (int i = 0; i < 128; i++) + { + node_array[i] = new huffman_node; + node_array[i]->id = static_cast(i); + node_array[i]->freq = 0; + } +} + +void huffman::destroy_tree(node_ptr node) +{ + if (!node) + return; + destroy_tree(node->left); + destroy_tree(node->right); + delete node; +} + +huffman::huffman(const std::string& in, const std::string& out) + : root(nullptr), in_file_name(in), out_file_name(out) +{ + create_node_array(); +} + +huffman::~huffman() +{ + if (root) + { + destroy_tree(root); + for (int i = 0; i < 128; i++) + { + if (!node_array[i]->freq) + delete node_array[i]; + } + } + else + { + for (int i = 0; i < 128; i++) + delete node_array[i]; + } +} + +void huffman::traverse(node_ptr node, const std::string& code) +{ + if (!node) + return; + if (node->left == nullptr && node->right == nullptr) + node->code = code; + else + { + traverse(node->left, code + '0'); + traverse(node->right, code + '1'); + } +} + +int huffman::binary_to_decimal(const std::string& in) +{ + int result = 0; + for (size_t i = 0; i < in.size(); i++) + result = result * 2 + (in[i] - '0'); + return result; +} + +std::string huffman::decimal_to_binary(int in) +{ + std::string temp; + std::string result; + while (in) + { + temp += static_cast('0' + in % 2); + in /= 2; + } + result.append(8 - temp.size(), '0'); + for (int i = static_cast(temp.size()) - 1; i >= 0; i--) + result += temp[static_cast(i)]; + return result; +} + +void huffman::build_tree(const std::string& path, unsigned char a_code) +{ + node_ptr current = root; + for (size_t i = 0; i < path.size(); i++) + { + if (path[i] == '0') + { + if (current->left == nullptr) + current->left = new huffman_node; + current = current->left; + } + else if (path[i] == '1') + { + if (current->right == nullptr) + current->right = new huffman_node; + current = current->right; + } + } + current->id = a_code; +} + +bool huffman::create_pq() +{ + in_file.open(in_file_name.c_str(), std::ios::in | std::ios::binary); + if (!in_file) + { + std::cerr << "Error: cannot open input file: " << in_file_name << std::endl; + return false; + } + + unsigned char ch = 0; + while (in_file.read(reinterpret_cast(&ch), 1)) + { + if (ch >= 128) + { + std::cerr << "Error: input contains non-ASCII byte (>= 128): 0x" + << std::hex << static_cast(ch) << std::dec << std::endl; + in_file.close(); + return false; + } + node_array[ch]->freq++; + } + + in_file.close(); + + for (int i = 0; i < 128; i++) + { + if (node_array[i]->freq) + pq.push(node_array[i]); + } + return true; +} + +bool huffman::create_huffman_tree() +{ + if (pq.empty()) + return false; + + std::priority_queue, compare> temp(pq); + if (temp.size() == 1) + { + root = temp.top(); + return true; + } + + while (temp.size() > 1) + { + root = new huffman_node; + root->freq = 0; + root->left = temp.top(); + root->freq += temp.top()->freq; + temp.pop(); + root->right = temp.top(); + root->freq += temp.top()->freq; + temp.pop(); + temp.push(root); + } + return true; +} + +void huffman::calculate_huffman_codes() +{ + if (pq.size() == 1) + { + root->code = "0"; + return; + } + traverse(root, ""); +} + +bool huffman::coding_save() +{ + in_file.open(in_file_name.c_str(), std::ios::in | std::ios::binary); + if (!in_file) + { + std::cerr << "Error: cannot open input file: " << in_file_name << std::endl; + return false; + } + + out_file.open(out_file_name.c_str(), std::ios::out | std::ios::binary); + if (!out_file) + { + std::cerr << "Error: cannot open output file: " << out_file_name << std::endl; + in_file.close(); + return false; + } + + std::string in; + std::string s; + const unsigned char symbol_count = static_cast(pq.size()); + in += static_cast(symbol_count); + + std::priority_queue, compare> temp(pq); + while (!temp.empty()) + { + node_ptr current = temp.top(); + in += static_cast(current->id); + s.assign(127 - current->code.size(), '0'); + s += '1'; + s.append(current->code); + in += static_cast(binary_to_decimal(s.substr(0, 8))); + for (int i = 0; i < 15; i++) + { + s = s.substr(8); + in += static_cast(binary_to_decimal(s.substr(0, 8))); + } + temp.pop(); + } + s.clear(); + + unsigned char ch = 0; + while (in_file.read(reinterpret_cast(&ch), 1)) + { + s += node_array[ch]->code; + while (s.size() > 8) + { + in += static_cast(binary_to_decimal(s.substr(0, 8))); + s = s.substr(8); + } + } + + const int count = static_cast(8 - s.size()); + if (s.size() < 8) + s.append(static_cast(count), '0'); + in += static_cast(binary_to_decimal(s)); + in += static_cast(count); + + out_file.write(in.c_str(), static_cast(in.size())); + in_file.close(); + out_file.close(); + return true; +} + +bool huffman::recreate_huffman_tree() +{ + in_file.open(in_file_name.c_str(), std::ios::in | std::ios::binary); + if (!in_file) + { + std::cerr << "Error: cannot open input file: " << in_file_name << std::endl; + return false; + } + + unsigned char size = 0; + if (!in_file.read(reinterpret_cast(&size), 1)) + { + std::cerr << "Error: compressed file is empty or truncated." << std::endl; + in_file.close(); + return false; + } + + if (root) + { + destroy_tree(root); + root = nullptr; + } + root = new huffman_node; + + for (int i = 0; i < size; i++) + { + unsigned char a_code = 0; + unsigned char h_code_c[16]; + if (!in_file.read(reinterpret_cast(&a_code), 1) || + !in_file.read(reinterpret_cast(h_code_c), 16)) + { + std::cerr << "Error: truncated code table in compressed file." << std::endl; + in_file.close(); + return false; + } + + std::string h_code_s; + for (int k = 0; k < 16; k++) + h_code_s += decimal_to_binary(h_code_c[k]); + + size_t j = 0; + while (j < h_code_s.size() && h_code_s[j] == '0') + j++; + if (j < h_code_s.size()) + h_code_s = h_code_s.substr(j + 1); + else + h_code_s.clear(); + + build_tree(h_code_s, a_code); + } + + in_file.close(); + return true; +} + +bool huffman::decoding_save() +{ + in_file.open(in_file_name.c_str(), std::ios::in | std::ios::binary); + if (!in_file) + { + std::cerr << "Error: cannot open input file: " << in_file_name << std::endl; + return false; + } + + out_file.open(out_file_name.c_str(), std::ios::out | std::ios::binary); + if (!out_file) + { + std::cerr << "Error: cannot open output file: " << out_file_name << std::endl; + in_file.close(); + return false; + } + + unsigned char size = 0; + if (!in_file.read(reinterpret_cast(&size), 1)) + { + std::cerr << "Error: compressed file is empty or truncated." << std::endl; + in_file.close(); + out_file.close(); + return false; + } + + in_file.seekg(-1, std::ios::end); + unsigned char count0 = 0; + if (!in_file.read(reinterpret_cast(&count0), 1)) + { + std::cerr << "Error: compressed file is truncated." << std::endl; + in_file.close(); + out_file.close(); + return false; + } + + in_file.seekg(static_cast(1 + 17 * size), std::ios::beg); + + std::vector text; + unsigned char textseg = 0; + while (in_file.read(reinterpret_cast(&textseg), 1)) + text.push_back(textseg); + + if (text.empty()) + { + std::cerr << "Error: no encoded data in compressed file." << std::endl; + in_file.close(); + out_file.close(); + return false; + } + + node_ptr current = root; + for (size_t i = 0; i + 1 < text.size(); i++) + { + std::string path = decimal_to_binary(text[i]); + if (i == text.size() - 2) + path = path.substr(0, 8 - count0); + + for (size_t j = 0; j < path.size(); j++) + { + if (path[j] == '0') + { + if (!current->left) + { + std::cerr << "Error: invalid Huffman code in compressed data." << std::endl; + in_file.close(); + out_file.close(); + return false; + } + current = current->left; + } + else + { + if (!current->right) + { + std::cerr << "Error: invalid Huffman code in compressed data." << std::endl; + in_file.close(); + out_file.close(); + return false; + } + current = current->right; + } + + if (current->left == nullptr && current->right == nullptr) + { + out_file.put(static_cast(current->id)); + current = root; + } + } + } + + in_file.close(); + out_file.close(); + return true; +} diff --git a/huffman.h b/huffman.h index 6361140..d953d39 100644 --- a/huffman.h +++ b/huffman.h @@ -1,21 +1,21 @@ #ifndef HUFFMAN_H #define HUFFMAN_H -#include + +#include #include +#include #include -#include -using namespace std; struct huffman_node { - char id; //character - int freq; //frequency of the character - string code; //huffman code for the character + unsigned char id; + int freq; + std::string code; huffman_node* left; huffman_node* right; huffman_node() - {//constructer - left = right = NULL; + { + left = right = nullptr; } }; typedef huffman_node* node_ptr; @@ -23,34 +23,37 @@ typedef huffman_node* node_ptr; class huffman { protected: - node_ptr node_array[128]; //array for 128characters in the Ascii Table - fstream in_file, out_file; - node_ptr child, parent, root; - char id; - string in_file_name, out_file_name; + node_ptr node_array[128]; + std::fstream in_file, out_file; + node_ptr root; + std::string in_file_name, out_file_name; + class compare - {//a object funtion to set comparing rule of priority queue + { public: bool operator()(const node_ptr& c1, const node_ptr& c2) const { return c1->freq > c2->freq; } }; - priority_queue, compare> pq; //priority queue of frequency from high to low - void create_node_array(); - void traverse(node_ptr, string); //traverse the huffman tree and get huffman code for a character - int binary_to_decimal(string&); //convert a 8-bit 0/1 string of binary code to a decimal integer - string decimal_to_binary(int); //convert a decimal integer to a 8-bit 0/1 string of binary code - inline void build_tree(string&, char); //build the huffman tree according to information from file + std::priority_queue, compare> pq; + + void create_node_array(); + void destroy_tree(node_ptr node); + void traverse(node_ptr node, const std::string& code); + int binary_to_decimal(const std::string& in); + std::string decimal_to_binary(int in); + void build_tree(const std::string& path, unsigned char a_code); public: - huffman(string, string); - void create_pq(); - void create_huffman_tree(); + huffman(const std::string& in, const std::string& out); + ~huffman(); + bool create_pq(); + bool create_huffman_tree(); void calculate_huffman_codes(); - void coding_save(); - void decoding_save(); - void recreate_huffman_tree(); + bool coding_save(); + bool recreate_huffman_tree(); + bool decoding_save(); }; #endif diff --git a/huffmanCoding.cpp b/huffmanCoding.cpp index 0d53859..d04e5ba 100644 --- a/huffmanCoding.cpp +++ b/huffmanCoding.cpp @@ -1,19 +1,28 @@ +#include #include + #include "huffman.h" -using namespace std; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { if (argc != 3) { - cout << "Usage:\n\t huffmanCoding inputfile outputfile" << endl; - exit(1); + std::cout << "Usage:\n\t huffmanCoding inputfile outputfile" << std::endl; + return EXIT_FAILURE; } + huffman h(argv[1], argv[2]); - h.create_pq(); - h.create_huffman_tree(); + if (!h.create_pq()) + return EXIT_FAILURE; + if (!h.create_huffman_tree()) + { + std::cerr << "Error: input file is empty." << std::endl; + return EXIT_FAILURE; + } h.calculate_huffman_codes(); - h.coding_save(); - cout << endl; - return 0; -} \ No newline at end of file + if (!h.coding_save()) + return EXIT_FAILURE; + + std::cout << "Compression finished." << std::endl; + return EXIT_SUCCESS; +} diff --git a/huffmanDecoding.cpp b/huffmanDecoding.cpp index c539805..e3010d6 100644 --- a/huffmanDecoding.cpp +++ b/huffmanDecoding.cpp @@ -1,17 +1,22 @@ +#include #include + #include "huffman.h" -using namespace std; -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { if (argc != 3) { - cout << "Usage:\n\t huffmanDecoding inputfile outputfile" << endl; - exit(1); + std::cout << "Usage:\n\t huffmanDecoding inputfile outputfile" << std::endl; + return EXIT_FAILURE; } + huffman h(argv[1], argv[2]); - h.recreate_huffman_tree(); - h.decoding_save(); - cout << endl; - return 0; + if (!h.recreate_huffman_tree()) + return EXIT_FAILURE; + if (!h.decoding_save()) + return EXIT_FAILURE; + + std::cout << "Decompression finished." << std::endl; + return EXIT_SUCCESS; } diff --git a/tests/run_tests.sh b/tests/run_tests.sh new file mode 100755 index 0000000..cc7c636 --- /dev/null +++ b/tests/run_tests.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -ne 2 ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +ENCODER="$1" +DECODER="$2" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" +WORK_DIR="$(mktemp -d)" +trap 'rm -rf "$WORK_DIR"' EXIT + +roundtrip() { + local input="$1" + local tag="$2" + local compressed="${WORK_DIR}/${tag}.bin" + local decoded="${WORK_DIR}/${tag}.out" + + "$ENCODER" "$input" "$compressed" + "$DECODER" "$compressed" "$decoded" + diff -q "$input" "$decoded" +} + +fail_encode() { + local input="$1" + local tag="$2" + local compressed="${WORK_DIR}/${tag}.bin" + + if "$ENCODER" "$input" "$compressed" 2>/dev/null; then + echo "Expected encoder failure for: $input" >&2 + return 1 + fi + return 0 +} + +echo "Test: sample text roundtrip" +roundtrip "${ROOT_DIR}/original/OriginalFile.txt" sample + +echo "Test: single-character file" +printf 'aaaa' > "${WORK_DIR}/single.txt" +roundtrip "${WORK_DIR}/single.txt" single + +echo "Test: two-character file" +printf 'ab' > "${WORK_DIR}/two.txt" +roundtrip "${WORK_DIR}/two.txt" two + +echo "Test: upper ASCII (0x7F)" +printf '\x7f' > "${WORK_DIR}/del.txt" +roundtrip "${WORK_DIR}/del.txt" del + +echo "Test: non-ASCII byte rejected" +printf '\xff' > "${WORK_DIR}/invalid.txt" +fail_encode "${WORK_DIR}/invalid.txt" invalid + +echo "Test: empty file rejected" +touch "${WORK_DIR}/empty.txt" +fail_encode "${WORK_DIR}/empty.txt" empty + +echo "All tests passed."