/* * huffman - Encode/Decode files using Huffman encoding. * Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., */ /* * 针对可打印字符的huffman编解码. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <assert.h> #include "huffman.h" #ifdef WIN32 #include <winsock2.h> #include <winsock.h> #include <malloc.h> #define alloca _alloca #else #include <netinet/in.h> #include <sys/socket.h> #include <sys/types.h> #endif // --------------------------------------------------------------------------------------- /* 描述huffman树的节点 */ typedef struct huffman_node_tag { unsigned char isLeaf; // 当前节点是否为叶子节点 unsigned long count; // 权(字符出现的次数) struct huffman_node_tag *parent; // 当前节点的父亲 union { /* * 如果改节点为叶子节点, 则有unsigned char symbol字段, * 如果该节点不是叶子节点, 则具有左孩子右孩子字段. */ struct { struct huffman_node_tag *zero; // 左孩子 struct huffman_node_tag *one; // 右孩子 }; unsigned char symbol; // 存储的字符. }; } huffman_node; /* 编码 */ typedef struct huffman_code_tag { /* * numbits 这个字段是用来记录从leaf--->root一共走了多少步. * 也就是叶子节点所含字符的编码长度 * bits 是用来存储编码的, 但是它是以Byte为单位的, 而编码是 * 以bit为单位的, 所以需要根据 numbits 去从 bits 里提取出前 * numbits 个bit。 这才是真正的编码. * * 比如 * numbits=4 * bits = 0110 1011 * 那么我们应该提取bits的低4位 1011 * * 注意 numbits 和 bits是有关系的, bits一定不可能超过 numbits/8 */ /* The length of this code in bits. */ unsigned long numbits;/* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */ /* The bits that make up this code. The first bit is at position 0 in bits[0]. The second bit is at position 1 in bits[0]. The eighth bit is at position 7 in bits[0]. The ninth bit is at position 0 in bits[1]. */ unsigned char *bits; /* 用来存储编码String */ } huffman_code; int main(int argc, char** argv) { /* char memory = 0; char compress = 1; int opt; const char *file_in = NULL; const char *file_out = NULL; */ //FILE *in = stdin; //FILE *out = stdout; FILE *in = NULL; FILE *out = NULL; FILE *out2 = NULL; in = fopen("test.txt", "r+"); out = fopen("test1.txt", "w+"); //out2 = fopen("test2.txt", "w+"); int result = 0; result = huffman_encode_file(in, out); //result = huffman_decode_file(out, out2); printf("result : %d/n", result); system("pause"); return 1; } // --------------------------------------------------------------------------------------- /** * bit-->byte转换, 不足8位则补齐8位. */ static unsigned long numbytes_from_numbits(unsigned long numbits) { return numbits / 8 + (numbits % 8 ? 1 : 0); } /* * get_bit returns the ith bit in the bits array * in the 0th position of the return value. * 取出第i位(从低位往高位排) */ static unsigned char get_bit(unsigned char *bits, unsigned long i) { return ( bits[i/8] >> (i%8) ) & 1; } /** * 反转数组的前((numbits/8)+1)个字符 * */ static void reverse_bits(unsigned char* bits, unsigned long numbits) { unsigned long numbytes = numbytes_from_numbits(numbits); // 所占字节数. unsigned char *tmp = (unsigned char*)calloc(numbytes, sizeof(unsigned char)); // 分配内存 unsigned long curbit; // index -- 当前位 long curbyte = 0; // 当前是byte的index memset(tmp, 0, numbytes); // 将tmp指向的buffer清零 /* * */ for(curbit=0; curbit<numbits; ++curbit) { unsigned int bitpos = curbit % 8; // 当前byte里的index if(curbit>0 && curbit%8==0) { // ++curbyte; } /* * 按位 OR * */ tmp[curbyte] |= ( get_bit(bits, numbits-curbit-1) << bitpos ); } memcpy(bits, tmp, numbytes); } /* * new_code builds a huffman_code from a leaf in * a Huffman tree. * 对指定的叶子进行编码. */ static huffman_code* new_code(const huffman_node* leaf) { /* Build the huffman code by walking up to * the root node and then reversing the bits, * since the Huffman code is calculated by * walking down the tree. */ unsigned long numbits = 0; unsigned char *bits = NULL; huffman_code *p; while(leaf!=NULL && leaf->parent!=NULL) { huffman_node *parent = leaf->parent; // 当前活动节点的双亲 unsigned long cur_byte = numbits / 8; // 当前byte unsigned char cur_bit = (unsigned char)(numbits % 8); // 当前byte里的当前bit /* If we need another byte to hold the code, then allocate it. */ if(cur_bit == 0) { // 满了一个byte就需要重新分配内存 size_t newSize = cur_byte + 1; bits = (char*)realloc(bits, newSize); bits[newSize - 1] = 0; /* Initialize the new byte. */ } /* * ************************************************************************ * 需要注意的是右孩子的处理, 因为左孩子已经是0, 初始分配bits的时候就已经把每位都设置为0了. * ************************************************************************ */ /* If a one must be added then or it in. If a zero * must be added then do nothing, since the byte * was initialized to zero. */ if(leaf == parent->one) { /* * 右孩子的话就需要把当前位置为1 */ bits[cur_byte] |= (1<<cur_bit); } ++numbits; leaf = parent; } if( bits != 0) { /* * 如果编码里头含有1, 则需要进行反转, 如果全为0, 反转后和反转前是一样的, 就没必要反转了 */ reverse_bits(bits, numbits); } p = (huffman_code*)malloc(sizeof(huffman_code)); p->numbits = numbits; /* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */ p->bits = bits; /* 用来存储编码的区间 */ return p; } /** * 创建一个孤立的"叶子"节点, 该节点为一个单独的树 * symbol : 该叶子节点的权,即字符 */ static huffman_node* new_leaf_node(unsigned char symbol) { huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) ); p->isLeaf = 1; p->symbol = symbol; p->count = 0; p->parent = 0; return p; } /** * 创建节点(该节点不是叶子节点) * * count : 字符出现的次数 * zero : 左孩子 * one : 右兄弟 * * return : 返回一个节点对象 */ static huffman_node* new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one) { huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) ); p->isLeaf = 0; p->count = count; p->zero = zero; p->one = one; p->parent = 0; return p; } /** * 释放树占用的内存空间 */ static void free_huffman_tree(huffman_node *subtree) { if(subtree == NULL) return; if( !(subtree->isLeaf) ) { /* * 先序遍历进行递归调用 */ free_huffman_tree( subtree->zero ); free_huffman_tree( subtree->one ); } free( subtree ); } /** * free code */ static void free_code(huffman_code* p) { free(p->bits); free(p); } // ---------------------------------------------------------------------------------------- #define MAX_SYMBOLS 256 typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS]; /* */ typedef huffman_code* SymbolEncoder[MAX_SYMBOLS]; /* */ /* */ static void free_encoder(SymbolEncoder *pSE) { unsigned long i; for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*pSE)[i]; if( p ) free_code(p); } } /* */ static void init_frequencies(SymbolFrequencies *pSF) { memset(*pSF, 0, sizeof(SymbolFrequencies) ); /* 清零 */ } // ---------------------------------------------------------------------------------------- typedef struct buf_cache_tag { /* * 该结构主要描述了两个部分, 一个是cache, 一个是bufout. * cache是一个临时存储数据的buffer, cache会将数据写往bufout区间, * bufout类似一个仓库, 会一直存储cache写入的数据. * cache可以多次网bufout内写数据, bufout会一直保存这些数据. * bufout是一个动态的buffer, cache每一次往bufout内写数据的时候bufout都需要realloc一次. */ unsigned char *cache; // 指向真正存储数据的buffer unsigned int cache_len; // buffer的长度, 初始的时候就可以设置 cache的大小的 unsigned int cache_cur; // 数据结尾处(或者说是光标位置) unsigned char **pbufout; /* * cache要写数据就往这个空间内写(类似一个动态仓库, 一定是动态的) * (*pbufout)就是真实的存储区 */ unsigned int *pbufoutlen; // 仓库的大小 } buf_cache; /* 初始化一个buf_cache */ static int init_cache(buf_cache *pc, unsigned int cache_size, unsigned char **pbufout, unsigned int *pbufoutlen) { assert(pc && pbufout && pbufoutlen); if(!pbufout || !pbufoutlen) return 1; pc->cache = (unsigned char*)malloc(cache_size); // 分配存储空间 pc->cache_len = cache_size; // pc->cache_cur = 0; // 光标从0开始 pc->pbufout = pbufout; // *pbufout = NULL; // pc->pbufoutlen = pbufoutlen; *pbufoutlen = 0; // return (pc->cache==NULL 0 : 1); } /* 释放buf_cache */ static void free_cache(buf_cache* pc) { assert( pc ); if( pc->cache != NULL) { /* * 我觉得这里没有必要free( pc->cache );, 直接执行pc->cache = NULL;就可以保证 * 逻辑上清cache了, 至于真实的存储区内是否仍然有数据并没有意义. */ free( pc->cache ); pc->cache = NULL; } } /* * 将cache内的数据写到pbufout中去, 并清洗cache * 成功则返回0, 失败返回1. */ static int flush_cache(buf_cache* pc) { assert( pc ); if(pc->cache_cur > 0) // 确定cache_cur有平移, 这样才能确定cache内有数据. 才可以flush { /* 当前要写的数据长度为pc->cache_cur, 原来的bufout里头本身还有*(pc->pbufoutlen)长度的数据 */ unsigned int newlen = pc->cache_cur + *(pc->pbufoutlen); /* 需要重新为*(pc->pbufout)分配空间, tmp为这个新buffer的首地址 */ unsigned char* tmp = realloc(*(pc->pbufout), newlen); if( !tmp ) return 1; /* 追加到pbufout结尾处, 而不是覆盖 */ memcpy(tmp + *(pc->pbufoutlen), pc->cache, pc->cache_cur); *pc->pbufout = tmp; // pbufout指针重定位到新的扩大了的内存区. *pc->pbufoutlen = newlen; // 重新计算pbufoutlen pc->cache_cur = 0; // cache逻辑上清零 } return 0; } /* 写cache */ static int write_cache(buf_cache* pc, const void *to_write, unsigned int to_write_len) { unsigned char* tmp; assert(pc && to_write); assert(pc->cache_len >= pc->cache_cur); /* If trying to write more than the cache will hold * flush the cache and allocate enough space immediately, * that is, don't use the cache. */ if(to_write_len > pc->cache_len - pc->cache_cur) { /* * to_write_len : 需要往cache内写的数据长度 * pc->cache_len-pc->cache_cur : cache内剩余空间 * 如果cache存储能力不够则先清洗cache, 再将数据直接写到 * pbufout中去, 不使用cache. */ unsigned int newlen; flush_cache( pc ); newlen = *pc->pbufoutlen + to_write_len; tmp = realloc(*pc->pbufout, newlen); if( !tmp ) return 1; memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len); *pc->pbufout = tmp; *pc->pbufoutlen = newlen; } else { /* * Write the data to the cache * 如果cache存储能力足够,则往cache内追加数据且将当前光标移动到新的数据尾部. */ memcpy(pc->cache+pc->cache_cur, to_write, to_write_len); pc->cache_cur += to_write_len; } return 0; } // ------------------------------------------------------------------------------------- /* * 计算FILE对象内的各个字符出现的频率 * 扫描FILE对象, */ static unsigned int get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in) { int c; unsigned int total_count = 0; // FILE对象内的字符总数 init_frequencies( pSF ); /* Set all frequencies to 0. */ /* Count the frequency of each symbol in the input file. */ while( (c=fgetc(in)) != EOF ) { unsigned char uc = c; if( !(*pSF)[uc] ) // 如果这个字符没有出现过则为这个字符建立一个叶子 { /* * 这里设计的相当有意思, * fgetc会获得当前光标所在的字符, 这个字符必然是一个char, 也就是一个 * unsinged int型数. * 我们前面有定义 * #define MAX_SYMBOLS 256 * typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS]; * typedef huffman_code* SymbolEncoder[MAX_SYMBOLS]; * 这里我们采用SymbolFrequencies[uc]就存储c这个字符. * 也就是说 * SymbolFrequencies[0]==0x00 * SymbolFrequencies[1]==0x01 * ...... * SymbolFrequencies[65]==A ( A==65 :) ) * SymbolFrequencies[66]==B ( B==66 :) ) * ...... * 起初在typedf SymbolFrequencies的时候, 我着实没看懂, 实在太巧妙了. */ (*pSF)[uc] = new_leaf_node( uc ); } /* * 这个自加也非常巧妙, 遇到uc字符则将第uc个huffman_node的count自加 * 实际就是一个字符一个字符读下去,, * 读到了A就 ++(pSF['A'].count) == ++(pSF[65].count) * 读到了B就 ++(pSF['B'].count) == ++(pSF[66].count) * 真TNND的牛X! */ ++( (*pSF)[uc]->count ); ++total_count; } return total_count; } /* 计算buffer内各个字符的频率,和get_symbol_frequencies函数同理 */ static unsigned int get_symbol_frequencies_from_memory(SymbolFrequencies *pSF, const unsigned char *bufin, unsigned int bufinlen) { unsigned int i; unsigned int total_count = 0; /* Set all frequencies to 0. */ init_frequencies(pSF); /* Count the frequency of each symbol in the input file. */ for(i = 0; i < bufinlen; ++i) { unsigned char uc = bufin[i]; if( !(*pSF)[uc] ) { (*pSF)[uc] = new_leaf_node(uc); } ++(*pSF)[uc]->count; ++total_count; } return total_count; } /* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. * * 两个huffman_node进行对比, 以count作为比较依据. * 即对比两个不同 symbol 出现的频率 * 非叶子节点通通排在后面 */ static int SFComp(const void *p1, const void *p2) { const huffman_node *hn1 = *(const huffman_node**)p1; const huffman_node *hn2 = *(const huffman_node**)p2; /* Sort all NULLs to the end. */ if(hn1 == NULL && hn2 == NULL) return 0; if(hn1 == NULL) return 1; if(hn2 == NULL) return -1; if(hn1->count > hn2->count) return 1; else if(hn1->count < hn2->count) return -1; return 0; } /* * build_symbol_encoder builds a SymbolEncoder by walking * down to the leaves of the Huffman tree and then, * for each leaf, determines its code. * * 递归方式为每个叶子节点编码 */ static void build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSE) { if(subtree == NULL) return; if( subtree->isLeaf ) { (*pSE)[subtree->symbol] = new_code( subtree ); } else { build_symbol_encoder(subtree->zero, pSE); build_symbol_encoder(subtree->one, pSE); } } /* * calculate_huffman_codes turns pSF into an array * with a single entry that is the root of the * huffman tree. The return value is a SymbolEncoder, * which is an array of huffman codes index by symbol value. * * 为每个node编码. 这个函数比较重要, 精华就是在这个函数里头的for循环. 哈哈 * 整个tree的建立全都依赖这个函数 */ static SymbolEncoder* calculate_huffman_codes(SymbolFrequencies * pSF) { unsigned int i = 0; unsigned int n = 0; huffman_node *m1 = NULL, *m2 = NULL; SymbolEncoder *pSE = NULL; /* * Sort the symbol frequency array by ascending frequency. * 快速排序例程进行排序 * 以symbol频率为关键字做升序排列 * 有symbol的节点都会按升序排列, 没有symbol的节点会统一排在后面, * 通过一个for就能计算出symbol的个数了. */ qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp); /* * Get the number of symbols. * 计算huffman树中的字符数, 这个实现可读性不够好 */ for(n = 0; (n<MAX_SYMBOLS) && (*pSF)[n]; ++n) ; /* * Construct a Huffman tree. This code is based * on the algorithm given in Managing Gigabytes * by Ian Witten et al, 2nd edition, page 34. * Note that this implementation uses a simple * count instead of probability. */ for(i = 0; i < (n-1); ++i) { /* Set m1 and m2 to the two subsets of least probability. */ m1 = (*pSF)[0]; m2 = (*pSF)[1]; /* Replace m1 and m2 with a set {m1, m2} whose probability * is the sum of that of m1 and m2. * 这个算法有优化的余地的, 因为n在一直减小. * 将最小的两个元素合并后得到一个一个节点为m12, 此时m1,m2已经建立起来了关系. * 这个m12的地址又被pSF[0]存储, 循环直至整个Tree建立成功. * 指针在这里运用的实在太巧妙了. * 这一行代码就是建树, 靠,NBA! */ (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2); (*pSF)[1] = NULL; /* * Put newSet into the correct count position in pSF. * 这里应该可以再进行优化, 是否有必要再进行排序, 或者被排序的数组过长了. * 实际上每循环一次n都减少了一次 */ qsort((*pSF), n, sizeof((*pSF)[0]), SFComp); }/* for完毕的时候就求出了root, pSF[0]就是root, 后面的元素都是NULL * 而树通过for循环里头的 * (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2); * 已经建立完成了*/ /* Build the SymbolEncoder array from the tree. */ pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder)); memset(pSE, 0, sizeof(SymbolEncoder)); build_symbol_encoder((*pSF)[0], pSE); return pSE; } /* * Write the huffman code table. The format is: * 4 byte code count in network byte order. * 4 byte number of bytes encoded * (if you decode the data, you should get this number of bytes) * code1 * ... * codeN, where N is the count read at the begginning of the file. * Each codeI has the following format: * 1 byte symbol, 1 byte code bit length, code bytes. * Each entry has numbytes_from_numbits code bytes. * The last byte of each code may have extra bits, if the number of * bits in the code is not a multiple of 8. * * 编码后的格式 : * 0-3个byte是FILE内出现的不同字符个数(几不同的字符个数) * 4-7个byte是FILE内出现的全部字符个数(所有的字符) * 8-X是真正的编码后值 * */ static int write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count) { unsigned long i, count = 0; /* * Determine the number of entries in se * 计算 SymbolEncoder 内具有编码值的元素个数. * 即有几种字符 */ for(i = 0; i < MAX_SYMBOLS; ++i) if( (*se)[i] ) ++count; /* * Write the number of entries in network byte order. * 将字符种数写入到文件头部, 即[0, 3]一共4个字节 */ //i = htonl( count ); i = count; if(fwrite(&i, sizeof(i), 1, out) != 1) return 1; /* * Write the number of bytes that will be encoded. * 将字符个数追加到[4,7]一共4个字节 */ //symbol_count = htonl(symbol_count); symbol_count = symbol_count; if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1) return 1; /* * Write the entries. */ for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*se)[i]; if( p != NULL ) { /* * 每个单元分为三个部分 : * symbol -- 字符 * numbits -- 叶子走到root需要的步数 * bits -- 叶子走到root的方式(即最终的编码, 比如说0101) */ unsigned int numbytes; /* Write the 1 byte symbol. */ fputc((unsigned char)i, out); /* Write the 1 byte code bit length. */ fputc(p->numbits, out); /* Write the code bytes. 她这个注释就没有说是几byte, 值得思考一下 */ numbytes = numbytes_from_numbits( p->numbits ); /* 将叶子走到root的方式写进去, 这个方式会被整理为byte格式, 不够就补0 */ if(fwrite(p->bits, 1, numbytes, out) != numbytes) return 1; } } return 0; } /* * Allocates memory and sets *pbufout to point to it. The memory * contains the code table. * * 以指定的格式将编码后的数据写入到cache中去, 实际是写到pbufout中去了. * */ static int write_code_table_to_memory(buf_cache *pc, SymbolEncoder *se, unsigned int symbol_count) { unsigned long i, count = 0; /* Determine the number of entries in se. */ for(i = 0; i < MAX_SYMBOLS; ++i) { if((*se)[i]) { ++count; // 计算不同字符的个数 } } /* Write the number of entries in network byte order. */ //i = htonl(count); i = count; if( write_cache(pc, &i, sizeof(i)) ) // 前四个字节是memory内所有字符数 return 1; /* Write the number of bytes that will be encoded. */ //symbol_count = htonl(symbol_count); symbol_count = symbol_count; if( write_cache(pc, &symbol_count, sizeof(symbol_count)) ) // 4-8字节是不同字符个数 return 1; /* Write the entries. */ for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*se)[i]; if( p ) { /* * 对于每次循环来说, 如果p不为NULL, 则将该字符对应的编码写入到cache内. * 存储格式为三个字节作为一个单位. * byte0 --- 字符本身 * byte1 --- 该字符编码后的码值长度(即2进制的位数) * byte2 --- 该字符对应的码值 */ unsigned int numbytes; /* * The value of i is < MAX_SYMBOLS (256), so it can * be stored in an unsigned char. * 将i转换为char型, 可以对应到字符集 */ unsigned char uc = (unsigned char)i; /* * Write the 1 byte symbol. * 将字符写到cache内 */ if(write_cache(pc, &uc, sizeof(uc))) return 1; /* * Write the 1 byte code bit length. * 将叶子节点到root所需要经过的步数写到cache内, 也就是编码的长度 * 这个数据是为了解码使用的. */ uc = (unsigned char)p->numbits; if(write_cache(pc, &uc, sizeof(uc))) return 1; /* * Write the code bytes. * 将编码值对齐并写如到cache内 * 事先必须知道编码由几位组成, 如果编码为9位, 那么就需要2个byte来存储这个码值 * 如果编码为4位, 那么就需要1个byte来存储了, */ numbytes = numbytes_from_numbits(p->numbits); if(write_cache(pc, p->bits, numbytes)) return 1; } } return 0; } /* * read_code_table builds a Huffman tree from the code * in the in file. This function returns NULL on error. * The returned value should be freed with free_huffman_tree. * * */ static huffman_node* read_code_table(FILE* in, unsigned int *pDataBytes) { huffman_node *root = new_nonleaf_node(0, NULL, NULL); unsigned int count; /* * Read the number of entries. * (it is stored in network byte order). * 获得字符种数, 前2个byte就是出现的字符种数 */ if( fread(&count, sizeof(count), 1, in) != 1 ) { free_huffman_tree( root ); return NULL; } //count = ntohl(count); count = count; /* * Read the number of data bytes this encoding represents. * 一个有多少个字符 */ if( fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1 ) { free_huffman_tree(root); return NULL; } //*pDataBytes = ntohl(*pDataBytes); *pDataBytes = *pDataBytes; /* Read the entries. */ while(count-- > 0) { int c; unsigned int curbit; unsigned char symbol; unsigned char numbits; unsigned char numbytes; unsigned char *bytes; huffman_node *p = root; if( (c=fgetc(in)) == EOF ) { free_huffman_tree( root ); return NULL; } symbol = (unsigned char)c; if( (c=fgetc(in)) == EOF ) { free_huffman_tree( root ); return NULL; } numbits = (unsigned char)c; numbytes = (unsigned char)numbytes_from_numbits( numbits ); bytes = (unsigned char*)malloc( numbytes ); if( fread(bytes, 1, numbytes, in) != numbytes ) { free(bytes); free_huffman_tree(root); return NULL; } /* * Add the entry to the Huffman tree. The value * of the current bit is used switch between * zero and one child nodes in the tree. New nodes * are added as needed in the tree. */ for(curbit = 0; curbit < numbits; ++curbit) { if(get_bit(bytes, curbit)) { if(p->one == NULL) { p->one = curbit == (unsigned char)(numbits-1) ? new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->one->parent = p; } p = p->one; } else { if(p->zero == NULL) { p->zero = curbit == (unsigned char)(numbits - 1) ? new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->zero->parent = p; } p = p->zero; } } free(bytes); } return root; } /* * 将数据从buf读到bufout中, 成功则返回0, 其他则返回1. * pindex -- 拷贝的起点 */ static int memread(const unsigned char* buf, unsigned int buflen, unsigned int *pindex, void* bufout, unsigned int readlen) { assert(buf && pindex && bufout); assert(buflen >= *pindex); // 错误 if(buflen < *pindex) return 1; if(readlen + *pindex >= buflen) return 1; memcpy(bufout, buf + *pindex, readlen); *pindex += readlen; return 0; } /* * 从编码后的buf内读数据. */ static huffman_node* read_code_table_from_memory(const unsigned char* bufin, unsigned int bufinlen, unsigned int *pindex, unsigned int *pDataBytes) { huffman_node *root = new_nonleaf_node(0, NULL, NULL); unsigned int count; /* * Read the number of entries. * (it is stored in network byte order). * 读取 */ if( memread(bufin, bufinlen, pindex, &count, sizeof(count)) ) { free_huffman_tree(root); return NULL; } //count = ntohl(count); count = count; /* Read the number of data bytes this encoding represents. */ if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes))) { free_huffman_tree(root); return NULL; } //*pDataBytes = ntohl(*pDataBytes); *pDataBytes = *pDataBytes; /* Read the entries. */ while( (count--) > 0 ) { unsigned int curbit; unsigned char symbol; unsigned char numbits; unsigned char numbytes; unsigned char *bytes; huffman_node *p = root; if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol))) { free_huffman_tree(root); return NULL; } if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits))) { free_huffman_tree(root); return NULL; } numbytes = (unsigned char)numbytes_from_numbits(numbits); bytes = (unsigned char*)malloc(numbytes); if(memread(bufin, bufinlen, pindex, bytes, numbytes)) { free(bytes); free_huffman_tree(root); return NULL; } /* * Add the entry to the Huffman tree. The value * of the current bit is used switch between * zero and one child nodes in the tree. New nodes * are added as needed in the tree. */ for(curbit = 0; curbit < numbits; ++curbit) { if(get_bit(bytes, curbit)) { if(p->one == NULL) { p->one = ( curbit==(unsigned char)(numbits - 1) ) ? new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->one->parent = p; } p = p->one; } else { if(p->zero == NULL) { p->zero = curbit == (unsigned char)(numbits - 1) new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->zero->parent = p; } p = p->zero; } } free(bytes); } return root; } /* * 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作 * 也就是不将编码强制为byte类型了, 而是直接写入到out中. */ static int do_file_encode(FILE* in, FILE* out, SymbolEncoder *se) { unsigned char curbyte = 0; unsigned char curbit = 0; int c; while( (c = fgetc(in)) != EOF) { //printf("c=%c /n", c); unsigned char uc = (unsigned char)c; huffman_code *code = (*se)[uc]; unsigned long i; for(i = 0; i < code->numbits; ++i) { //printf("i=%d /n", i); /* Add the current bit to curbyte. */ curbyte |= get_bit(code->bits, i) << curbit; /* If this byte is filled up then write it * out and reset the curbit and curbyte. */ if(++curbit == 8) { /* * 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作 * 也就是不将编码强制为byte类型了, 而是直接写入到out中. */ fputc(curbyte, out); printf("curbyte=%d/n", curbyte); // printf("curbyte=%d /n", curbyte); curbyte = 0; curbit = 0; } } printf("/n"); } /* * If there is data in curbyte that has not been * output yet, which means that the last encoded * character did not fall on a byte boundary, * then output it. */ if(curbit > 0) fputc(curbyte, out); return 0; } /* * 对memory进行编码. * * pc -- cache * bufin -- 待编码的buffer * bufinlen -- buffer长, 即字符个数 * se -- */ static int do_memory_encode(buf_cache *pc, const unsigned char *bufin, unsigned int bufinlen, SymbolEncoder *se) { unsigned char curbyte = 0; // unsigned char curbit = 0; unsigned int i; /* 对 bufin 内的字符依次循环 */ for(i = 0; i < bufinlen; ++i) { unsigned char uc = bufin[i]; huffman_code *code = (*se)[uc]; // 取出第i个字符的编码 unsigned long i; /* 对第i个字符编码长度进行循环 */ for(i = 0; i < code->numbits; ++i) { /* * Add the current bit to curbyte. * 依次取出 */ curbyte |= ( get_bit(code->bits, i) << curbit ); /* * If this byte is filled up then write it * out and reset the curbit and curbyte */ if(++curbit == 8) { /* * 满了一个字节则写cache * */ if(write_cache(pc, &curbyte, sizeof(curbyte))) return 1; curbyte = 0; curbit = 0; } } } /* * If there is data in curbyte that has not been * output yet, which means that the last encoded * character did not fall on a byte boundary, * then output it. */ return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0; } /* * huffman_encode_file huffman encodes in to out. * 对FILE对象进行编码, 将*in编码后写入*out. */ int huffman_encode_file(FILE *in, FILE *out) { SymbolFrequencies sf; SymbolEncoder *se; huffman_node *root = NULL; int rc; unsigned int symbol_count; /* * Get the frequency of each symbol in the input file. * 将FILE中的数据写入到SymbolFrequencies中, 计算出了各个字符出现频率, * 也计算出了FILE对象内的总字符数. */ symbol_count = get_symbol_frequencies(&sf, in); /* * Build an optimal table from the symbolCount. * 建树完毕, root就是sf[0], sf[1]=sf[2]=...sf[MAX_SYMBOLS]=NULL了 */ se = calculate_huffman_codes( &sf ); root = sf[0]; /* * Scan the file again and, using the table * previously built, encode it into the output file. */ rewind( in ); // 将文件指针重新指向一个流的开头 /* * 将编码写如文件流 */ rc = write_code_table(out, se, symbol_count); if(rc == 0) rc = do_file_encode(in, out, se); /* Free the Huffman tree. */ free_huffman_tree( root ); free_encoder(se); return rc; } /** * 对FILE进行解码 */ int huffman_decode_file(FILE *in, FILE *out) { huffman_node *root; huffman_node *p; int c; unsigned int data_count; /* Read the Huffman code table. */ root = read_code_table(in, &data_count); if( !root ) return 1; /* Decode the file. */ p = root; while(data_count>0 && (c=fgetc(in))!=EOF) { unsigned char byte = (unsigned char)c; unsigned char mask = 1; while(data_count > 0 && mask) { p = ( (byte&mask)? p->one : p->zero ); mask <<= 1; if( p->isLeaf ) { fputc(p->symbol, out); p = root; --data_count; } } } free_huffman_tree( root ); return 0; } // -------------------------------------------------------------------------------------- #define CACHE_SIZE 1024 /* * 对memory编码需要考虑到数据源的速度和bufout的接收速度不匹配, cache * cache是为了连接不同速度的设备而设计的 */ /** * 对buffer进行编码 * */ int huffman_encode_memory(const unsigned char *bufin, unsigned int bufinlen, unsigned char **pbufout, unsigned int *pbufoutlen) { SymbolFrequencies sf; SymbolEncoder *se; huffman_node *root = NULL; int rc; unsigned int symbol_count; // memory中的字符个数 buf_cache cache; // /* Ensure the arguments are valid. 检测参数合法性 */ if(!pbufout || !pbufoutlen) return 1; if( init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen) ) return 1; /* * Get the frequency of each symbol in the input memory * 计算bufin内各个字符出现的频率, 并求得bufin内存储的字符个数. */ symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen); /* * Build an optimal table from the symbolCount. * 为每个Node编码, 如果这个Node的symbol为NULL, 则不编码了. */ se = calculate_huffman_codes( &sf ); root = sf[0]; // root来拉, 哈哈, 逻辑树出来了. /* * Scan the memory again and, using the table * previously built, encode it into the output memory. * 将se内的数据统统的写入到cache中克. */ rc = write_code_table_to_memory(&cache, se, symbol_count); if(rc == 0) { /* * 为什么write_code_table_to_memory成功之后还要要执行一次do_memory_encode? * */ rc = do_memory_encode(&cache, bufin, bufinlen, se); } /* Flush the cache. */ flush_cache( &cache ); /* Free the Huffman tree. */ free_huffman_tree( root ); free_encoder( se ); free_cache( &cache ); return rc; } /** * 对bufin进行解码. 将解码后的数据写入到bufout中. */ int huffman_decode_memory(const unsigned char *bufin, unsigned int bufinlen, unsigned char **pbufout, unsigned int *pbufoutlen) { huffman_node *root, *p; unsigned int data_count; unsigned int i = 0; unsigned char *buf; unsigned int bufcur = 0; /* Ensure the arguments are valid. */ if(!pbufout || !pbufoutlen) return 1; /* Read the Huffman code table. */ root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count); if(!root) return 1; buf = (unsigned char*)malloc(data_count); /* Decode the memory. */ p = root; for(; i < bufinlen && data_count > 0; ++i) { unsigned char byte = bufin[i]; unsigned char mask = 1; while(data_count > 0 && mask) { p = byte & mask ? p->one : p->zero; mask <<= 1; if(p->isLeaf) { buf[bufcur++] = p->symbol; p = root; --data_count; } } } free_huffman_tree(root); *pbufout = buf; *pbufoutlen = bufcur; return 0; } /* -------------------------------------------------------------------------------- 不妨假设待编码的buffer为 "ABCAADC" 手工分析可得该树的形状 : 当然也可以也可以将这个树沿y方向翻转180度 root // / / A * // / / C * // / / B D 现在我们知道的两个事实是 : 这个buffer内的字符数为 : symbol_count = 7 ( "ABCAADC"一个有7个字符 ) 这个buffer内出现的字符种数为 : count = 4 ( 只出现了ABCD四种字符 ) 接下来人工分析各个字符 : symbol | count | bits --------|-----------|--------------------- A | 3 | 0000 0000 B | 1 | 0000 0110 C | 2 | 0000 0010 D | 1 | 0000 0111 我们设置左边为0, 右边为1. bits为从叶子节点走到root的路径. 分析完毕后, 需要实现整个编码过程. 编码过程暂时跳过. 假设成功编码完毕, 需要把编码后的数据写入到bufout内. bufout内的 0-3个byte为字符种数 count 4-7个byte为字符个数 symbol_count 然后是遍历SymbolEncoder, 依次对每种字符进行编码(我们这个例子只进行4次编码) 我们对每种字符都会进行编码, 每个字符编码后的输出不妨称为frame 那么这个frame是由三个部分组成的: (这个我们可以肯定一个char肯定是1byte的) symbol (1byte) -- 字符 (这个我们可以肯定就算这个树根本没有分支, 永远只有左/右孩子, 那也了不起是是256的深度) numbits (1byte) -- 叶子走到root需要的步数 bits (1byte) -- 叶子走到root的方式(即最终的编码, 比如说011) 开始我对这个bites到底占了多少个byte很是怀疑, 因为我不知道从叶子走到root 到底耗费了几步. 这里需要好好研究一下, 最好和最差情况. 暂时假设是个变化的byte吧. 但是有一点bites跟numbits是有关系的, 所以只要知道numbits还是可以知道bites占据了 多少byte的, 也知道bits到底是有几位. byte content --------------------------------------------- 0-3(4) count 4-7(4) symbol_count A占xa个byte frame_struct B占xb个byte frame_struct C占xc个byte frame_struct D占xd个byte frame_struct X X 这个X是do_file_encode函数写到bufout中去的数据, 那么这个数据是什么呢 实际上它是循环的把出现的字符的bits写到bufout中, 根据这个数据,解码的时候就可以依次的找到第0,1,2...个位置出现的是什么字符了 -------------------------------------------------------------------------------- */ |
|