diff --git a/xmldumps-backup/mwbzutils/filter.c b/xmldumps-backup/mwbzutils/filter.c new file mode 100644 index 0000000..87a3bcf --- /dev/null +++ b/xmldumps-backup/mwbzutils/filter.c @@ -0,0 +1,537 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "bzlib.h" +/* + TODO: + see if all files end in with newline or without + stub files have it, page content don't. UGH. so this is for + page content files only, fine. +*/ + +typedef struct { + int check_header; + int check_footer; + int header_skipped; +} filter_t; + +typedef struct { + int fin; /* input file descriptor */ + int fout; /* output file descriptor */ + char *inbuf; /* input buffer for reads from input file */ + char *outbuf; /* output buffer for decompressed data from input buffer */ + int bufsize; + bz_stream *stream; + + /* total bytes read from input */ + unsigned int prev_total_read; + /* whether input file eof has been reached */ + int input_eof; + /* whether bz2 compressed stream has reached an end */ + int bz2_eof; + /* info about status of skipping header and footer in the data */ + filter_t *filter; +} decompr_t; + +void display_bytes(char *buffer, int length) { + int i=0; + + fprintf(stderr, "buffer contents (%d bytes): <<", length); + for (i=0; i>\n"); + fprintf(stderr, "buffer contents additional 70 bytes beyond: <<"); + for (i=length; i>\n"); +} + +void usage(char *message) { + char * help = +"Usage: filter --file |--help\n" +"Given a list of bz2-compressed files, uncompress and write the contents to a specified\n" +"file, or if none is specified, then to stdout,\n" +"removing headers and footers from files such that the final output has the header\n" +"from the first file and the footer from the last file.\n" +"Header is everything through '' tag, footer is tag plus the following newline.\n" +"Exits with 0 on success, 1 on error.\n\n" +"Flags:\n" +" -v, --verbose print the state of the bz2 stream buffer often\n" +" -h, --help Show this help message\n\n" +"Arguments:\n" +" -b, --bufsize size of input and output buffers\n" +" -i, --infiles comma-separated list of files to be uncompressed\n" +" -o, --outfile file to be written\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} + +void display_decompr(decompr_t *decompr, int verbose) { + int i=0; + int length = 0; + + if (verbose) { + fprintf(stderr, "bz2 stream info: next_in:%ld, avail_in:%d, total_in_hi32:%d, total_in_lo32:%d," + "next_out:%ld, avail_out:%d, total_out_hi32:%d, total_out_lo32:%d\n", + decompr->stream->next_in, decompr->stream->avail_in, decompr->stream->total_in_hi32, decompr->stream->total_in_lo32, + decompr->stream->next_out, decompr->stream->avail_out, decompr->stream->total_out_hi32, decompr->stream->total_out_lo32); + fprintf(stderr, "buffer info: inbuf:%ld, outbuf: %ld\n", decompr->inbuf, decompr->outbuf); + + length = decompr->stream->next_out - decompr->outbuf; + if (!length) { + fprintf(stderr, "No outbut buffer contents (length 0)\n"); + } + else { + display_bytes(decompr->outbuf, decompr->stream->next_out - decompr->outbuf); + } + } +} + +bz_stream *alloc_bz2_stream() { + bz_stream *bz2_strm = NULL; + + bz2_strm = (bz_stream *) malloc(sizeof(bz_stream)); + if (bz2_strm == NULL) { + fprintf(stderr," failed to allocate memory for bz2 stream\n"); + exit(1); + } + /* init the bz_stream structures; these mean use regular malloc/free */ + bz2_strm->bzalloc = NULL; + bz2_strm->bzfree = NULL; + bz2_strm->opaque = NULL; + return(bz2_strm); +} + +void init_files(char *infilename, char *outfilename, int *fin, int *fout) { + /* this function modifies *ifn and *fout! */ + + *fin = open(infilename, O_RDONLY); + if (*fin == -1) { + fprintf(stderr, "failed to open input file %s\n", infilename); + exit(1); + } + if (outfilename != NULL) { + *fout = open(outfilename, O_WRONLY|O_CREAT|O_TRUNC, 0644); + if (*fout == -1) { + fprintf(stderr, "failed to open output file %s\n", outfilename); + exit(1); + } + } + else { + *fout = 1; + } +} + +char *init_buf(int bufsize) { + char *buf = NULL; + + buf = (char *)malloc((size_t)bufsize); + if (buf == NULL) { + fprintf(stderr,"failed to allocate %d for buffer\n", bufsize); + exit(1); + } +} + +filter_t *alloc_filter() { + filter_t *filter_info = NULL; + filter_info = (filter_t *)malloc(sizeof(filter_t)); + if (filter_info == NULL) { + fprintf(stderr,"failed to allocate filter info struct\n"); + exit(1); + } + filter_info->check_header = 0; + filter_info->check_footer = 0; + filter_info->header_skipped = 0; + return filter_info; +} + +decompr_t *init_decompr(char *infile, char *outfile, int bufsize) { + char *inbuf = NULL; + char *outbuf = NULL; + bz_stream *bz2_strm = NULL; + int fin = 0; + int fout = 0; + + int bz2_verbosity = 0; + int bz2_small = 0; + + filter_t *filterinfo = NULL; + decompr_t *decomprinfo = NULL; + + bz2_strm = alloc_bz2_stream(); + BZ2_bzDecompressInit(bz2_strm, bz2_verbosity, bz2_small); + + inbuf = init_buf(bufsize); + outbuf = init_buf(bufsize); + init_files(infile, outfile, &fin, &fout); + + filterinfo = alloc_filter(); + + decomprinfo = (decompr_t *)malloc(sizeof(decompr_t)); + if (decomprinfo == NULL) { + fprintf(stderr,"failed to allocate %d for decompression info\n", sizeof(decompr_t)); + exit(1); + } + decomprinfo->inbuf = inbuf; + decomprinfo->outbuf = outbuf; + decomprinfo->fin = fin; + decomprinfo->fout = fout; + decomprinfo->stream = bz2_strm; + decomprinfo->bufsize = bufsize; + decomprinfo->prev_total_read = 0; + decomprinfo->input_eof = 0; + decomprinfo->bz2_eof = 0; + decomprinfo->filter = filterinfo; + return(decomprinfo); +} + +void reinit_decompr_stream(decompr_t *decompr) { + /* + if the decompr bz2 stream is set to NULL, + alloc a new one and init it; this also + includes resetting the pointers for data + being written FROM the stream only. + */ + + int bz2_verbosity = 0; + int bz2_small = 0; + bz_stream *bz2_strm = NULL; + + bz2_strm = alloc_bz2_stream(); + BZ2_bzDecompressInit(bz2_strm, bz2_verbosity, bz2_small); + + /* mark the entire output buffer as available for use */ + bz2_strm->next_out = decompr->outbuf; + /* num bytes available in which to place uncompressed data */ + bz2_strm->avail_out = decompr->bufsize; + + decompr->stream = bz2_strm; + decompr->bz2_eof = 0; +} + +int filter(int fout, char *data, int length, filter_t *filterinf, int last) { + /* + transform the available data as desired, write it + to the supplied file descriptor, move any unused bytes + of data to the beginning of the buffer, return the number + of bytes left unused + if 'last' is true we keep no bytes for next buffer but write them all, + since there will be no more buffers of data + */ + int unused = 0; + char *where = NULL; + int hdrmarklen = 12; + int ftrmarklen = 13; + + /* we are supposed to skip the header */ + if (filterinf->check_header) { + if (!filterinf->header_skipped) { + if (length < hdrmarklen) { + /* not enough contents to search in the buffer, just leave what's in there + in there to accumulate */ + return unused; + } + + /* length of data in buffer enough that we have to check its contents */ + + /* end of header found, we write only the stuff after it */ + where = memmem((const void *)data, (size_t)length, (const void *)"\n", (size_t)hdrmarklen); + if (where != NULL) { + write(fout, where + hdrmarklen, length - (where - data) - hdrmarklen); + filterinf->header_skipped++; + return unused; + } + else if (last) { + /* never did find the header end marker in this data. oh well. + toss this very last buffer too */ + return unused; + } + else { + unused = hdrmarklen; + /* there's no end of header marker here but the marker could be split over this and + the next buffer of data. write the last so many bytes to the beginning of the buffer + and report that */ + memmove((void *)data, (const void *)data + (length - unused), (size_t)unused); + return unused; + } + } + } + /* either we're not checking for the header at all. or we are checking but + we've already encountered it */ + if ((!filterinf->check_header || + (filterinf->check_header && filterinf->header_skipped)) && + filterinf->check_footer) { + /* not enough contents to search the buffer */ + if (length < ftrmarklen) { + /* no more data coming, write what we have, otherwise we want to keep + these measly bytes and stuff more things onto them so we can check for the footer*/ + if (last) { + write(fout, data, length); + } + return unused; + } + + /* it would be nice to be clever here and only check for the footer if we are near + the end of file, mmmmmeeeeeeehhhh + this would entail knowing where we are in the file, (can be done) + and knowing how big the file is (one stat at some point could help) + assuming that a stat can be done and etc. for now FIXME always do the check + */ + /* footer. we only check for it at the end of the buffer. */ + if (!strncmp(data + length - ftrmarklen, "\n", (size_t)ftrmarklen)) { + write(fout, data, length - ftrmarklen); + memmove((void *)data, (const void *)data + (length - unused), (size_t)unused); + return unused; + } + else if (last) { + /* no footer. no more data coming. write it all. */ + write(fout, data, length - unused); + return unused; + } + else { + /* there's no footer marker here but the marker could be split over this and + the next buffer of data. write the last so many bytes to the beginning of the buffer + and report that */ + unused = ftrmarklen; + write(fout, data, length - unused); + memmove((void *)data, (const void *)data + (length - unused), (size_t)unused); + return unused; + } + } + else { + /* we love the whole buffer, use it */ + write(fout, data, length); + return unused; + } +} + +void consume_data(decompr_t *decompressor, int last) { + int unused = 0; + + /* the only way to know if decompressed bytes were put into + the output buffer: compare the total bytes written + now (low 32 bits) to the total written before the decompress call + it's ok to compare only the lower 32 bits, we're not going to + read 2<<32 bytes of data in one block */ + if (decompressor->stream->total_out_lo32 - decompressor->prev_total_read) { + /* there is data available, so write it */ + unused = filter(decompressor->fout, decompressor->outbuf, + decompressor->bufsize - decompressor->stream->avail_out, + decompressor->filter, last); + + /* keep track of total bytes read so far (low 32 bits), so we can compare after the next read */ + decompressor->prev_total_read = decompressor->stream->total_out_lo32; + if (unused) { + decompressor->stream->next_out = decompressor->outbuf + unused; + decompressor->stream->avail_out = decompressor->bufsize - unused; + } + else { + /* mark the entire output buffer as available for use */ + decompressor->stream->next_out = decompressor->outbuf; + decompressor->stream->avail_out = decompressor->bufsize; + } + } +} + +void bz2decompr(decompr_t *decompressor, int verbose) { + int res; + + display_decompr(decompressor, verbose); + res = BZ2_bzDecompress(decompressor->stream); + display_decompr(decompressor, verbose); + + consume_data(decompressor, res == BZ_STREAM_END); + if (res == BZ_STREAM_END) { + res = BZ2_bzDecompressEnd(decompressor->stream); + if (res != BZ_OK) { + fprintf(stderr, "failed to free bz2 stream resources\n"); + } + decompressor->stream = NULL; + decompressor->bz2_eof = 1; + } + else if (res < 0) { + fprintf(stderr, "error %d after BZ2_bzDecompress\n", res); + decompressor->stream = NULL; + decompressor->bz2_eof = 1; + } +} + +void init_bz2_stream(bz_stream *stream, char *inbuf, char *outbuf, int bufsize) { + /* next unread byte of input, + may be updated by bz2 lib to location after end of input buffer */ + stream->next_in = inbuf; + /* num compressed bytes unread */ + stream->avail_in = 0; + /* next address where decompressed bytes are to be placed, + may be updated by bz2 lib to location after end of output buffer */ + stream->next_out = outbuf; + /* num bytes available in which to place uncompressed data */ + stream->avail_out = bufsize; +} + +int get_input(decompr_t *decompressor) { + if (decompressor->input_eof) return 0; + + if (!decompressor->stream->avail_in) { + /* all the buffer is used, start over */ + decompressor->stream->next_in = decompressor->inbuf; + } + return read(decompressor->fin, decompressor->stream->next_in, + decompressor->bufsize - decompressor->stream->avail_in); +} + +void handle_extra_input(int multistream, decompr_t *decompressor, int bytes_read) { + if (multistream) { + /* multistream is ok */ + /* can we be losing some of what is in the stream for decompression somehow? CHECK FIXME */ + reinit_decompr_stream(decompressor); + } + else if (bytes_read) { + /* bz2 decompress found end of stream. this means there + is garbage data after the stream end. whine and refuse to proceed. */ + if (decompressor->bz2_eof) { + fprintf(stderr, "garbage data found after end of bz2 stream.\n"); + close(decompressor->fin); + if (decompressor->fout != 1) + close(decompressor->fout); + exit(1); + } + } +} + +void decompress_data(decompr_t *decompressor, int bytes_read, int verbose) { + if (bytes_read) { + /* input avail to bz2: what we read plus what it left from + the previous decompression */ + decompressor->stream->avail_in += bytes_read; + bz2decompr(decompressor, verbose); + } + else { + /* no bytes read, but there may be bytes in decompression stream + to process */ + decompressor->input_eof = 1; + bz2decompr(decompressor, verbose); + } +} + +int decompress(char *infilename, char *outfilename, int bufsize, int multistream, + int skiphdr, int skipftr, int verbose) { + decompr_t *decompressor = NULL; + int bytes_read = 0; + + decompressor = init_decompr(infilename, outfilename, bufsize); + init_bz2_stream(decompressor->stream, decompressor->inbuf, + decompressor->outbuf, decompressor->bufsize); + + decompressor->filter->check_header = skiphdr; + decompressor->filter->check_footer = skipftr; + + /* process all the input */ + /* while (!decompressor->input_eof) { */ + while (!(decompressor->input_eof && decompressor->bz2_eof)) { + printf("while loop\n"); + display_decompr(decompressor, verbose); + bytes_read = get_input(decompressor); + printf("bytes read is %d\n", bytes_read); + handle_extra_input(multistream, decompressor, bytes_read); + printf("before decompress of data\n"); + display_decompr(decompressor, verbose); + decompress_data(decompressor, bytes_read, verbose); + printf("input_eof is %d, bz2_eof is %d\n", decompressor->input_eof, decompressor->bz2_eof); + } + + close(decompressor->fin); + if (decompressor->fout != 1) + close(decompressor->fout); +} + +void write_footer(char *filename) { + int fout = 0; + + if (filename != NULL) { + fout = open(filename, O_WRONLY|O_APPEND); + if (fout == -1) { + fprintf(stderr, "failed to open output file %s to write footer\n", filename); + exit(1); + } + } + else { + fout = 1; + } + write(fout, "\n", 13); + if (fout != 1) + close(fout); +} + +int main(int argc, char **argv) { + char *infiles = NULL; + char *outfile = NULL; + int verbose = 0; + int multistream = 0; + int bufsize = 4096; + int res = 0; + + int skiphdr = 0; + int skipftr = 1; + + int optc; + int optindex = 0; + + char *oldfilename = NULL; + char *filename = NULL; + + struct option optvalues[] = { + {"help", 0, 0, 'h'}, + {"infiles", 1, 0, 'i'}, + {"outfile", 1, 0, 'o'}, + {"bufsize", 1, 0, 'b'}, + {"multistream", 0, 0, 'm'}, + {"verbose", 0, 0, 'v'}, + {NULL, 0, NULL, 0} + }; + + while (1) { + optc=getopt_long_only(argc,argv,"i:o:b:mhv", optvalues, &optindex); + if (optc == 'h') + usage(NULL); + else if (optc == 'v') + verbose++; + else if (optc == 'i') + infiles = optarg; + else if (optc == 'o') + outfile = optarg; + else if (optc == 'b') + bufsize = strtol(optarg, NULL, 10); + else if (optc == 'm') + multistream = 1; + else if (optc == -1) break; + else usage("Unknown option or other error\n"); + } + + if (infiles == NULL) { + usage("Missing infiles argument."); + } + + while ((filename = strsep(&infiles, ","))) { + res = decompress(filename, outfile, bufsize, multistream, skiphdr, skipftr, verbose); + if (res) { + exit(res); + } + /* we wrote the first header, we hope, assuming the first file had a header. + don't write any more. */ + skiphdr = 1; + } + write_footer(outfile); + exit(0); +}