diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fe4511..1f71572 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,13 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) add_executable(freq_counter src/freq_counter/main.cpp src/lib/utils.cpp) -add_executable(compressor src/compressor/main.cpp src/lib/huffman_tree.cpp src/lib/bit_stream.cpp src/lib/utils.cpp) +add_executable(compressor + src/compressor/main.cpp + src/compressor/compressor.cpp + src/lib/huffman_tree.cpp + src/lib/bit_stream.cpp + src/lib/utils.cpp +) target_include_directories(freq_counter PRIVATE ${CMAKE_SOURCE_DIR}/include) target_include_directories(compressor PRIVATE ${CMAKE_SOURCE_DIR}/include) diff --git a/code_test.cpp b/code_test.cpp new file mode 100644 index 0000000..012a6d3 --- /dev/null +++ b/code_test.cpp @@ -0,0 +1,15 @@ +#include +#include + +// Um loop simples para testar palavras-chave +int main() { + std::vector vec; + for (int i = 0; i < 10; ++i) { + vec.push_back(i); + } + + if (!vec.empty()) { + std::cout << "O vetor nao esta vazio!" << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/codigo_comprimido.huff b/codigo_comprimido.huff new file mode 100644 index 0000000..4dd3575 Binary files /dev/null and b/codigo_comprimido.huff differ diff --git a/codigo_restaurado.cpp b/codigo_restaurado.cpp new file mode 100644 index 0000000..012a6d3 --- /dev/null +++ b/codigo_restaurado.cpp @@ -0,0 +1,15 @@ +#include +#include + +// Um loop simples para testar palavras-chave +int main() { + std::vector vec; + for (int i = 0; i < 10; ++i) { + vec.push_back(i); + } + + if (!vec.empty()) { + std::cout << "O vetor nao esta vazio!" 
<< std::endl; + } + return 0; +} \ No newline at end of file diff --git a/compressor b/compressor new file mode 100644 index 0000000..7a68841 Binary files /dev/null and b/compressor differ diff --git a/contador_frequencia b/contador_frequencia new file mode 100644 index 0000000..5e054fd Binary files /dev/null and b/contador_frequencia differ diff --git a/examples/small_texts/README.md b/examples/small_texts/README.md deleted file mode 100644 index e799938..0000000 --- a/examples/small_texts/README.md +++ /dev/null @@ -1 +0,0 @@ -Small texts to test compression. \ No newline at end of file diff --git a/include/compressor/compressor.h b/include/compressor/compressor.h new file mode 100644 index 0000000..5187231 --- /dev/null +++ b/include/compressor/compressor.h @@ -0,0 +1,18 @@ +#pragma once +#include +#include +#include + +namespace compressor { + +// Compress: se freq_table_path vazio, conta bytes do input. +// Se freq_table_path não for vazio, tenta usar a tabela (suporta tokens multi-char). 
+void compress_file(const std::string &inputPath, + const std::string &outputPath, + const std::string &freq_table_path = ""); + +// Decompress: lê arquivo compactado e escreve output +void decompress_file(const std::string &inputPath, + const std::string &outputPath); + +} // namespace compressor diff --git a/src/compressor/compressor.cpp b/src/compressor/compressor.cpp new file mode 100644 index 0000000..f3f555e --- /dev/null +++ b/src/compressor/compressor.cpp @@ -0,0 +1,209 @@ +#include "../../include/compressor/compressor.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../include/huffman/huffman_tree.h" +#include "../../include/huffman/bit_stream.h" + +using namespace std; + +namespace compressor { + +// ================================================================================= +// FUNÇÕES AUXILIARES +// ================================================================================= + +// Lê um arquivo inteiro para uma string. +static string readFileToString(const string &path) { + ifstream in(path, ios::binary); + if (!in) throw runtime_error("Nao foi possivel abrir o arquivo de entrada: " + path); + ostringstream ss; + ss << in.rdbuf(); + return ss.str(); +} + +// Conta a frequência de cada byte individualmente. +static map countByteFreqs(const string &data) { + map freqs; + for (unsigned char c : data) { + string s(1, static_cast(c)); + freqs[s]++; + } + return freqs; +} + +// Converte sequências de escape (ex: "\\n") para seus caracteres reais (ex: '\n'). 
// Converts the escape sequences used in frequency-table files back into raw
// characters: "\n", "\r", "\t" and "\\" become newline, carriage return, tab
// and a single backslash.
//
// Any other backslash pair (for example "\x") is copied through unchanged so
// that a hand-edited table never loses characters silently. A lone backslash
// at the very end of the input is also kept as-is.
static std::string unescape_chars(const std::string& s) {
    std::string unescaped;
    unescaped.reserve(s.size());
    for (size_t i = 0; i < s.length(); ++i) {
        // Ordinary character, or a trailing backslash with nothing after it.
        if (s[i] != '\\' || i + 1 >= s.length()) {
            unescaped += s[i];
            continue;
        }
        const char code = s[++i];  // character right after the backslash
        switch (code) {
            case 'n':  unescaped += '\n'; break;
            case 'r':  unescaped += '\r'; break;
            case 't':  unescaped += '\t'; break;
            case '\\': unescaped += '\\'; break;
            default:
                // Unknown escape: keep both characters instead of dropping them.
                unescaped += '\\';
                unescaped += code;
                break;
        }
    }
    return unescaped;
}

// Reads a frequency table file in which every line is "symbol:count".
//
// The LAST ':' on a line is the separator, so the symbol itself may contain
// colons. Symbols are passed through unescape_chars() before being stored.
// Returns an empty map when `path` is empty.
// Throws std::runtime_error when the file cannot be opened.
//
// Malformed lines are skipped rather than reported: lines without a
// separator, lines with an empty symbol, and lines whose count is negative,
// out of range, or followed by anything other than whitespace.
//
// NOTE(review): the map's value type is reconstructed as uint64_t from the
// count variable used below -- confirm it matches what HuffmanTree::build expects.
static std::map<std::string, uint64_t> readFreqTable(const std::string& path) {
    std::map<std::string, uint64_t> freqs;
    if (path.empty()) return freqs;

    std::ifstream in(path, std::ios::binary);
    if (!in) throw std::runtime_error("Nao foi possivel abrir o arquivo de tabela de frequencia: " + path);

    // The file is opened in binary mode, so a CRLF table leaves '\r' at the
    // end of each count; it is treated as ignorable whitespace.
    static const char* const kBlank = " \t\r";

    std::string line;
    while (std::getline(in, line)) {
        const size_t pos = line.rfind(':');
        // No separator at all (this also covers empty lines) or an empty symbol.
        if (pos == std::string::npos || pos == 0) continue;

        const std::string val = line.substr(pos + 1);
        const size_t first = val.find_first_not_of(kBlank);
        // stoull would silently wrap a negative number to a huge count.
        if (first == std::string::npos || val[first] == '-') continue;

        uint64_t count = 0;
        size_t consumed = 0;
        try {
            count = std::stoull(val, &consumed);
        } catch (const std::exception&) {
            continue;  // not a number, or too large for unsigned long long
        }
        // Reject counts followed by junk such as "12abc".
        if (val.find_first_not_of(kBlank, consumed) != std::string::npos) continue;

        freqs[unescape_chars(line.substr(0, pos))] = count;
    }
    return freqs;
}

// Splits `text` into tokens by greedy matching.
//
// At each position the first entry of `symbols_sorted` that matches the text
// at that position becomes the next token. The caller decides the priority by
// ordering the list (longest symbols first gives longest-match behaviour).
// Empty symbols are ignored. When nothing matches, the single byte at that
// position is emitted as its own token, so the concatenation of the returned
// tokens always equals `text`.
static std::vector<std::string> tokenize_greedy(const std::string& text,
                                                const std::vector<std::string>& symbols_sorted) {
    std::vector<std::string> out;
    size_t i = 0;
    while (i < text.size()) {
        const std::string* match = nullptr;
        for (const std::string& sym : symbols_sorted) {
            // compare() clamps the window to the remaining text, so a symbol
            // longer than what is left can never compare equal.
            if (!sym.empty() && text.compare(i, sym.size(), sym) == 0) {
                match = &sym;
                break;
            }
        }
        if (match != nullptr) {
            out.push_back(*match);
            i += match->size();
        } else {
            out.emplace_back(size_t{1}, text[i]);
            ++i;
        }
    }
    return out;
}


// =================================================================================
// MAIN FUNCTIONS
// =================================================================================

// Main entry point for compressing a file.
+void compress_file(const string &inputPath, const string &outputPath, const string &freq_table_path) { + string data = readFileToString(inputPath); + map freqs = readFreqTable(freq_table_path); + vector tokens; + + // Se uma tabela de frequência foi fornecida, usa a tokenização inteligente. + if (!freqs.empty()) { + vector symbols; + for (const auto &p : freqs) { + symbols.push_back(p.first); + } + sort(symbols.begin(), symbols.end(), [](const string &a, const string &b) { + if (a.size() != b.size()) return a.size() > b.size(); + return a < b; + }); + + tokens = tokenize_greedy(data, symbols); + } else { + // Senão, faz a compressão simples por byte. + freqs = countByteFreqs(data); + for (unsigned char c : data) { + tokens.emplace_back(string(1, static_cast(c))); + } + } + + // Constrói a Árvore de Huffman. + HuffmanTree tree; + tree.build(freqs); + + ofstream out(outputPath, ios::binary); + if (!out) throw runtime_error("Nao foi possivel abrir o arquivo de saida: " + outputPath); + + // Salva a árvore no início do arquivo. + tree.serialize(out); + + // Escreve os dados comprimidos bit a bit. + BitOutputStream bout(out); + auto codes = tree.getCodes(); + + for (const auto &tok : tokens) { + auto it = codes.find(tok); + if (it == codes.end()) { + throw runtime_error("Nao foi encontrado codigo para o token durante a compressao. Token: '" + tok + "'"); + } + for (bool b : it->second) { + bout.writeBit(b); + } + } + bout.flush(); + out.close(); + + cout << "Compressed " << inputPath << " -> " << outputPath << endl; +} + +// Função principal para descomprimir um arquivo. +void decompress_file(const string &inputPath, const string &outputPath) { + ifstream in(inputPath, ios::binary); + if (!in) throw runtime_error("Nao foi possivel abrir o arquivo comprimido: " + inputPath); + + // Lê a árvore do início do arquivo. 
+ HuffmanTree tree; + tree.deserialize(in); + + auto root = tree.getRoot(); + if (!root) { return; } + + ofstream out(outputPath, ios::binary); + if (!out) throw runtime_error("Nao foi possivel abrir o arquivo de saida: " + outputPath); + + // Lê os bits e percorre a árvore para decodificar. + BitInputStream bin(in); + auto node = root; + while (true) { + int b = bin.readBit(); + if (b == -1) break; + + node = (b == 0) ? node->left : node->right; + + if (node->isLeaf()) { + out << node->symbol; + node = root; + } + } + + out.close(); + in.close(); +} + +} // namespace compressor \ No newline at end of file diff --git a/src/compressor/main.cpp b/src/compressor/main.cpp index 35fcac8..deebf25 100644 --- a/src/compressor/main.cpp +++ b/src/compressor/main.cpp @@ -1,15 +1,52 @@ -#include -#include -#include -#include "../../include/huffman/bit_stream.h" -#include "../../include/huffman/huffman_tree.h" - -int main(int argc, char** argv){ - if(argc < 2){ - std::cerr << "Usage: compressor -c|-d ...\n"; - return 1; - } - // This is a stub. Implement parsing -c (compress) and -d (decompress) and call appropriate functions. - std::cout << "Compressor stub. 
Implement command-line parsing and logic.\n"; - return 0; -} +#include +#include "../../include/compressor/compressor.h" + +using namespace std; + +static void printUsage() { + cerr << "Usage:\n" + << " compressor -c [-f freq_table] -i -o \n" + << " compressor -d -i -o \n"; +} + +int main(int argc, char** argv) { + if(argc < 2) { printUsage(); return 1; } + + bool doCompress = false, doDecompress = false; + string freqFile, inputFile, outputFile; + + for(int i=1;i " << outputFile << endl; + } else if(doDecompress) { + if(inputFile.empty() || outputFile.empty()) { printUsage(); return 1; } + compressor::decompress_file(inputFile, outputFile); + cout << "Decompressed " << inputFile << " -> " << outputFile << endl; + } else { + printUsage(); + return 1; + } + } catch(const exception &ex) { + cerr << "Error: " << ex.what() << "\n"; + return 2; + } + + return 0; +} diff --git a/src/freq_counter/main.cpp b/src/freq_counter/main.cpp index d29bfaa..330db89 100644 --- a/src/freq_counter/main.cpp +++ b/src/freq_counter/main.cpp @@ -6,7 +6,26 @@ #include #include "../../include/huffman/huffman_tree.h" #include +#include +// Função para escapar caracteres especiais +std::string escape_chars(const std::string& s) { + std::string escaped; + for (char c : s) { + if (c == '\n') { + escaped += "\\n"; + } else if (c == '\r') { + escaped += "\\r"; + } else if (c == '\t') { + escaped += "\\t"; + } else if (c == '\\') { + escaped += "\\\\"; + } else { + escaped += c; + } + } + return escaped; +} //Function to help in case the loop find a char in the file std::string charToString(char c) { @@ -131,9 +150,9 @@ int main(int argc, char** argv) { } // Formato do map simbolo:fequencia - for (const auto& pair : freqs) { - (*out_stream) << pair.first << ":" << pair.second << "\n"; - } + for (const auto& pair : freqs) { + (*out_stream) << escape_chars(pair.first) << ":" << pair.second << "\n"; +} if (out_file.is_open()) { out_file.close(); diff --git a/src/lib/huffman_tree.cpp 
b/src/lib/huffman_tree.cpp index 560d884..8750137 100644 --- a/src/lib/huffman_tree.cpp +++ b/src/lib/huffman_tree.cpp @@ -3,7 +3,7 @@ #include #include -void serialize_helper(std::shared_ptr node, BitOutputStream& bit_out) { +void HuffmanTree::serialize_helper(std::shared_ptr node, BitOutputStream& bit_out) const { if (node == nullptr) { return; } @@ -27,7 +27,7 @@ void serialize_helper(std::shared_ptr node, BitOutputStream& bit_out) { } } -std::shared_ptr deserialize_helper(BitInputStream& bit_in) { +std::shared_ptr HuffmanTree::deserialize_helper(BitInputStream& bit_in) { int bit = bit_in.readBit(); if (bit == -1) { return nullptr; diff --git a/tabela_cpp.freq b/tabela_cpp.freq new file mode 100644 index 0000000..b32839d --- /dev/null +++ b/tabela_cpp.freq @@ -0,0 +1,53 @@ +\n:14 +\r:14 + :71 +!:2 +":2 +#:2 +(:5 +):5 ++:2 +-:1 +.:2 +/:2 +0:3 +1:1 +::6 +;:6 +<:8 +=:1 +>:3 +O:1 +U:1 +_:1 +a:12 +b:1 +c:5 +cout:1 +e:9 +endl:1 +for:1 +h:2 +i:7 +if:1 +include:2 +int:3 +iostream:1 +k:1 +l:3 +m:4 +n:2 +o:5 +p:6 +r:4 +return:1 +s:6 +std:3 +t:5 +u:1 +v:7 +vector:2 +y:1 +z:1 +{:3 +}:3