diff --git a/xmldumps-backup/mwbzutils/Makefile b/xmldumps-backup/mwbzutils/Makefile index d8947b8..aa215c6 100644 --- a/xmldumps-backup/mwbzutils/Makefile +++ b/xmldumps-backup/mwbzutils/Makefile @@ -1,170 +1,170 @@ # ------------------------------------------------------------------ # This Makefile builds binaries which rely on three source files # from libbzip2 version 1.0.6. (See bz2libfuncs.c, bzlib.h and # bzlib_private.h; the first is slightly modified while the # second is unchanged from the library version.) # # The copyright for those two files is as follows: # # bzip2/libbzip2 version 1.0.6 of 6 September 2010 # Copyright (C) 1996-2010 Julian Seward # # Those files are released under the terms of the license contained # in the file LICENSE_BZ. # # All other files are released under the GPL, copyright (C) Ariel T. Glenn # 2010-2013: see the file COPYING for details. # ------------------------------------------------------------------ VERSION = "0.0.8" CC ?= gcc BIGFILES = -D_FILE_OFFSET_BITS=64 CPPFLAGS += $(BIGFILES) -DVERSION=\"$(VERSION)\" CFLAGS ?= -Wall -Werror -O2 build: checkforbz2footer dumpbz2filefromoffset \ dumplastbz2block findpageidinbz2xml \ recompressxml writeuptopageid compressedmanpages \ getlastidinbz2xml NAME_CHECKFORBZ2FOOTER = "Check if bzip2 file ends with bz2 magic footer" NAME_DUMPBZ2FILEFROMOFFSET = "Write MediaWiki XML pages from bzip2 file starting from offset" NAME_DUMPLASTBZ2BLOCK = "Find last bz2 block in bzip2 file and dump contents" NAME_FINDPAGEIDINBZ2XML = "Display offset of bz2 block for given page id in bzip2 MediaWiki XML file" NAME_FINDLASTPAGEIDINBZ2XML = "Display last page id bzip2 MediaWiki XML file" NAME_RECOMPRESSXML = "Bz2 compress MediaWiki XML input in batches of pages" NAME_WRITEUPTOPAGEID = "Write range of page content from MediaWiki XML input" PREFIX ?= "/usr/local" BINDIR = $(DESTDIR)$(PREFIX)/bin/ MANDIR = $(DESTDIR)$(PREFIX)/share/man/man1/ DOCDIR = $(DESTDIR)$(PREFIX)/share/doc/mwbzutils/ GZIP = /bin/gzip HELP2MAN = /usr/bin/help2man SHELL = /bin/sh DISTNAME = mwbzutils-$(VERSION) LIBS = -lbz2 OBJSBZ = bzlibfuncs.o OBJS = mwbzlib.o $(OBJSBZ) checkforbz2footer: $(OBJSBZ) mwbzlib.o checkforbz2footer.o $(CC) $(LDFLAGS) -o checkforbz2footer checkforbz2footer.o $(OBJS) $(LIBS) dumpbz2filefromoffset: $(OBJSBZ) mwbzlib.o dumpbz2filefromoffset.o $(CC) $(LDFLAGS) -o dumpbz2filefromoffset dumpbz2filefromoffset.o $(OBJS) $(LIBS) dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o $(CC) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o $(OBJS) $(LIBS) findpageidinbz2xml: $(OBJSBZ) mwbzlib.o httptiny.o findpageidinbz2xml.o $(CC) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o httptiny.o $(OBJS) $(LIBS) -lz getlastidinbz2xml: $(OBJSBZ) mwbzlib.o getlastidinbz2xml.o $(CC) $(LDFLAGS) -o getlastidinbz2xml getlastidinbz2xml.o $(OBJS) $(LIBS) -recompressxml: $(OBJSBZ) recompressxml.o - $(CC) $(LDFLAGS) -o recompressxml recompressxml.o $(LIBS) +recompressxml: $(OBJSBZ) iohandlers.o recompressxml.o + $(CC) $(LDFLAGS) -o recompressxml iohandlers.o recompressxml.o $(LIBS) -lz writeuptopageid: $(OBJSBZ) iohandlers.o writeuptopageid.o $(CC) $(LDFLAGS) -o writeuptopageid iohandlers.o writeuptopageid.o $(LIBS) -lz %.o: %.c $(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@ compressedmanpages: docs/dumplastbz2block.1.gz docs/findpageidinbz2xml.1.gz \ docs/checkforbz2footer.1.gz docs/dumpbz2filefromoffset.1.gz \ docs/recompressxml.1.gz docs/writeuptopageid.1.gz docs/%.1.gz: docs/%.1 cat $< | $(GZIP) > $@ # this target should only be made when updating the source if the version # or the usage mssages change manpages: dumplastbz2block.1 findpageidinbz2xml.1 \ checkforbz2footer.1 dumpbz2filefromoffset.1 \ recompressxml.1 writeuptopageid.1 \ getlastidinbz2xml.1 echo "Don't forget to commit your manpage changes to the repo" checkforbz2footer.1 : checkforbz2footer $(HELP2MAN) --section 1 --no-info --name $(NAME_CHECKFORBZ2FOOTER) \ --no-discard-stderr ./checkforbz2footer > docs/checkforbz2footer.1 dumpbz2filefromoffset.1 : dumpbz2filefromoffset $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPBZ2FILEFROMOFFSET) \ --no-discard-stderr ./dumpbz2filefromoffset > docs/dumpbz2filefromoffset.1 dumplastbz2block.1 : dumplastbz2block $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPLASTBZ2BLOCK) \ --no-discard-stderr ./dumplastbz2block > docs/dumplastbz2block.1 findpageidinbz2xml.1 : findpageidinbz2xml $(HELP2MAN) --section 1 --no-info --name $(NAME_FINDPAGEIDINBZ2XML) \ --no-discard-stderr ./findpageidinbz2xml > docs/findpageidinbz2xml.1 getlastidinbz2xml.1 : getlastidinbz2xml $(HELP2MAN) --section 1 --no-info --name $(NAME_GETLASTIDINBZ2XML) \ --no-discard-stderr ./getlastidinbz2xml > docs/getlastidinbz2xml.1 recompressxml.1 : recompressxml $(HELP2MAN) --section 1 --no-info --name $(NAME_RECOMPRESSXML) \ --no-discard-stderr ./recompressxml > docs/recompressxml.1 writeuptopageid.1 : writeuptopageid $(HELP2MAN) --section 1 --no-info --name $(NAME_WRITEUPTOPAGEID) \ --no-discard-stderr ./writeuptopageid > docs/writeuptopageid.1 install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset \ recompressxml writeuptopageid compressedmanpages getlastidinbz2xml install --directory $(BINDIR) install --mode=755 checkforbz2footer $(BINDIR) install --mode=755 dumplastbz2block $(BINDIR) install --mode=755 dumpbz2filefromoffset $(BINDIR) install --mode=755 findpageidinbz2xml $(BINDIR) install --mode=755 getlastidinbz2xml $(BINDIR) install --mode=755 recompressxml $(BINDIR) install --mode=755 writeuptopageid $(BINDIR) install --directory $(MANDIR) install --mode=644 docs/*.1.gz $(MANDIR) install --directory $(DOCDIR) install --mode=644 README $(DOCDIR) install --mode=644 LICENSE_BZ $(DOCDIR) install --mode=644 COPYING $(DOCDIR) uninstall: rm -f $(BINDIR)dumplastbz2block rm -f $(BINDIR)findpageidinbz2xml rm -f $(BINDIR)getlastidinbz2xml rm -f $(BINDIR)checkforbz2footer rm -f $(BINDIR)dumpbz2filefromoffset rm -f $(BINDIR)recompressxml rm -f $(BINDIR)writeuptopageid rm -f $(MANDIR)*.1.gz rm -f $(DOCDIR)README rm -f $(DOCDIR)LICENSE_BZ rm -f $(DOCDIR)COPYING clean: rm -f *.o *.a dumplastbz2block findpageidinbz2xml \ getlastidinbz2xml \ checkforbz2footer dumpbz2filefromoffset \ recompressxml writeuptopageid docs/*.1.gz distclean: clean rm -f $(DISTNAME) rm -f *.tar.gz reallyclean: distclean rm -f docs/*.1 dist: rm -f $(DISTNAME) ln -s -f . $(DISTNAME) tar cvf $(DISTNAME).tar \ $(DISTNAME)/*.c \ $(DISTNAME)/*.h \ $(DISTNAME)/Makefile \ $(DISTNAME)/LICENSE_BZ \ $(DISTNAME)/COPYING \ $(DISTNAME)/README \ $(DISTNAME)/CHANGES \ $(DISTNAME)/docs/*1 gzip -v $(DISTNAME).tar diff --git a/xmldumps-backup/mwbzutils/iohandlers.c b/xmldumps-backup/mwbzutils/iohandlers.c index 6c7f558..41da014 100644 --- a/xmldumps-backup/mwbzutils/iohandlers.c +++ b/xmldumps-backup/mwbzutils/iohandlers.c @@ -1,337 +1,502 @@ #include #include #include #include #include #include #include #include #include #include "iohandlers.h" void free_bz2buf(bz2buffer_t *b) { if (b) free(b); return; } bz2buffer_t *init_bz2buf() { bz2buffer_t *buf; buf = (bz2buffer_t *) malloc(sizeof(bz2buffer_t)); if (!buf) { fprintf(stderr,"failed to get memory for bz2 input buffer\n"); return(NULL); } buf->nextin = buf->nextout = buf->bytes_avail = 0; return(buf); } InputHandler *inputhandler_init(char *path) { InputHandler *ih = NULL; ih = (InputHandler *)malloc(sizeof(InputHandler)); if (ih == NULL) { fprintf(stderr, "failed to allocate input handler\n"); exit(1); } ih->path = path; ih->fin = NULL; ih->bz_buffer = init_bz2buf(); ih->bzstream = NULL; ih->bzerror = 0; ih->bz_verbosity = 0; /* no extra messages */ ih->bz_small = 0; /* don't try to save memory */ ih->bz_unused = NULL; ih->bz_nUnused = 0; ih->gzstream = NULL; ih->gz_bufsize = 65536; if (path == NULL) { ih->fin = stdin; ih->open = NULL; ih->fgets = txt_fgets_i; ih->close = NULL; + ih->eof = txt_eof_i; return(ih); } if (strlen(path) > 4 && !strcmp(path + strlen(path) - 4, ".bz2")) { ih->open = bz2_open_i; ih->fgets = bz2_fgets_i; ih->close = bz2_close_i; + ih->eof = bz2_eof_i; } else if (strlen(path) > 3 && !strcmp(path + strlen(path) - 3, ".gz")) { ih->open = gz_open_i; ih->fgets = gz_fgets_i; ih->close = gz_close_i; + ih->eof = gz_eof_i; } else { ih->open = txt_open_i; ih->fgets = txt_fgets_i; ih->close = txt_close_i; + ih->eof = txt_eof_i; } return(ih); } int isfull(bz2buffer_t *buf) { if (buf->nextin == sizeof(buf->buf)) return(1); else return(0); } -void fill_buffer(bz2buffer_t *buf, BZFILE *fd) { +int isempty(bz2buffer_t *buf) { + if (buf->bytes_avail == 0) return(1); + return(0); +} + +int fill_buffer(bz2buffer_t *buf, BZFILE *fd) { int result; - if (isfull(buf)) return; + if (isfull(buf)) return(1); result = BZ2_bzread(fd, buf->buf + buf->nextin, sizeof(buf->buf) - buf->nextin); - if (result) { + if (result > 0) { buf->nextin += result; buf->bytes_avail += result; } - return; + return result; } int has_newline(bz2buffer_t *buf) { int ind = 0; while (ind < buf->bytes_avail) { if (buf->buf[buf->nextout+ind] == '\n') return(ind+1); ind++; } return(-1); } /* returns: pointer to the output buffer if any data was read and copied NULL otherwise this function will read one line of output from file and copy it into out, at most out_size -1 bytes are copied, a '\0' will be placed at the end, if no input is copied the holder will contain the empty string */ char *bz2gets(BZFILE *fd, bz2buffer_t *buf, char *out, int out_size) { int newline_ind = -1; int out_ind = 0; int out_space_remaining = out_size -1; + int result = 0; out[0]='\0'; - if (!buf->bytes_avail) fill_buffer(buf, fd); if (!buf->bytes_avail) { - return(0); + result = fill_buffer(buf, fd); + if (result < 0) + return(NULL); + } + if (!buf->bytes_avail) { + return(NULL); } while (((newline_ind = has_newline(buf)) == -1) && (out_space_remaining > buf->bytes_avail)) { strncpy(out+out_ind, buf->buf + buf->nextout, buf->bytes_avail); out_ind += buf->bytes_avail; out[out_ind] = '\0'; out_space_remaining -= buf->bytes_avail; buf->nextout = buf->nextin = buf->bytes_avail = 0; - fill_buffer(buf, fd); + result = fill_buffer(buf, fd); + if (result < 0) + return(NULL); if (!buf->bytes_avail) { out[out_ind] = '\0'; if (out_ind) return(out); else return(NULL); } } if (out_space_remaining) { if (newline_ind >=0 && newline_ind < out_space_remaining) { strncpy(out+out_ind, buf->buf + buf->nextout, newline_ind); out_ind += newline_ind; out[out_ind] = '\0'; buf->nextout += newline_ind; buf->bytes_avail -= (newline_ind); } else { strncpy(out+out_ind, buf->buf + buf->nextout, out_space_remaining); out_ind+= out_space_remaining; out[out_ind] = '\0'; buf->nextout += out_space_remaining; buf->bytes_avail -= out_space_remaining; } /* if the buffer is empty set things up correctly for that case */ if (buf->nextout == sizeof(buf->buf) && !buf->bytes_avail) { buf->nextout = 0; buf->nextin = 0; } } out[out_ind] = '\0'; if (!out_ind) return(NULL); else return(out); } int bz2_open_i(InputHandler *ih) { if (ih->path != NULL) { ih->fin = fopen(ih->path, "rb"); - /* fixme check if successfull */ + if (!ih->fin) { + fprintf(stderr, "failed to open input file for read\n"); + exit(-1); + } } ih->bzstream = BZ2_bzReadOpen(&(ih->bzerror), ih->fin, ih->bz_verbosity, ih->bz_small, ih->bz_unused, ih->bz_nUnused); if (ih->bzerror != BZ_OK) { fprintf(stderr, "error %d trying to open %s for decompression\n", ih->bzerror, ih->path); exit(-1); } return(1); } char *bz2_fgets_i(InputHandler *ih, char *buffer, int bytecount) { char *ret = NULL; ret = bz2gets(ih->bzstream, ih->bz_buffer, buffer, bytecount); if (ret == NULL && ih->bzerror != BZ_OK) { fprintf(stderr, "error %d trying to read from %s\n", ih->bzerror, ih->path); return(NULL); } return(ret); } int bz2_close_i(InputHandler *ih) { BZ2_bzReadClose(&(ih->bzerror), ih->bzstream); if (ih->fin != stdout) fclose(ih->fin); return(0); } +int bz2_eof_i(InputHandler *ih) { + if (feof(ih->fin) && isempty(ih->bz_buffer)) + return(1); + return(0); +} + int gz_open_i(InputHandler *ih) { if (ih->path != NULL) { ih->gzstream = gzopen(ih->path, "rb"); gzbuffer(ih->gzstream, ih->gz_bufsize); } + if (!ih->gzstream) { + fprintf(stderr, "error trying to open %s for decompression\n", + ih->path); + exit(-1); + } return(0); } char *gz_fgets_i(InputHandler *ih, char *buffer, int bytecount) { return(gzgets(ih->gzstream, buffer, bytecount)); } int gz_close_i(InputHandler *ih) { if (ih->path != NULL) gzclose(ih->gzstream); return(0); } +int gz_eof_i(InputHandler *ih) { + if (gzeof(ih->gzstream)) + return(1); + return(0); +} + int txt_open_i(InputHandler *ih) { - if (ih->path != NULL) + if (ih->path != NULL) { ih->fin = fopen(ih->path, "r"); + if (!ih->fin) { + fprintf(stderr, "failed to open %s for decompression\n", ih->path); + exit(-1); + } + } return(0); } char *txt_fgets_i(InputHandler *ih, char *buffer, int bytecount) { return(fgets(buffer, bytecount, ih->fin)); } int txt_close_i(InputHandler *ih) { if (ih->path != NULL) fclose(ih->fin); return(0); } +int txt_eof_i(InputHandler *ih) { + if (feof(ih->fin)) + return(1); + return(0); +} + OutputHandler *outputhandler_init(char *path) { OutputHandler *oh = NULL; + char *dotPosition = NULL; + oh = (OutputHandler *)malloc(sizeof(OutputHandler)); if (oh == NULL) { fprintf(stderr, "failed to allocate output handler\n"); exit(1); } + oh->open = NULL; + oh->path = path; oh->fout = NULL; oh->bzstream = NULL; oh->bzerror = 0; oh->bz_blocksize = 9; /* 900k */ oh->bz_verbosity = 0; /* no extra messages */ oh->bz_workfactor = 0; /* use the default */ oh->gzstream = NULL; oh->gz_bufsize = 65536; + oh->bytes_in_low = 0; + oh->bytes_in_hi = 0; + oh->bytes_out_low = 0; + oh->bytes_out_hi = 0; + + oh->bytes_in_low_cumul = 0; + oh->bytes_in_hi_cumul = 0; + oh->bytes_out_low_cumul = 0; + oh->bytes_out_hi_cumul = 0; + + oh->offset_gz = 0; + oh->offset_txt = 0; + + oh->closed = 1; + if (path == NULL) { oh->fout = stdout; oh->open = NULL; oh->write = txt_write_o; oh->close = NULL; return(oh); } if (strlen(path) > 4 && !strcmp(path + strlen(path) - 4, ".bz2")) { oh->open = bz2_open_o; oh->write = bz2_write_o; oh->close = bz2_close_o; } else if (strlen(path) > 3 && !strcmp(path + strlen(path) - 3, ".gz")) { oh->open = gz_open_o; oh->write = gz_write_o; oh->close = gz_close_o; } else { - /* - only for DEBUG perf testing + dotPosition = strrchr(path, '.'); + if (dotPosition != NULL) { + *dotPosition = '\0'; + if (strlen(path) > 4 && !strcmp(path+(strlen(path)-4),".bz2")) { + /* filename ends in .bz2.something */ + oh->open = bz2_open_o; + oh->write = bz2_write_o; + oh->close = bz2_close_o; + } + else if (strlen(path) > 3 && !strcmp(path+(strlen(path)-3),".gz")) { + /* filename ends in .gz.something */ + oh->open = gz_open_o; + oh->write = gz_write_o; + oh->close = gz_close_o; + } + *dotPosition = '.'; + } + } - oh->open = gz_open_o; - oh->write = gz_write_o; - oh->close = gz_close_o; - */ + if (oh->open == NULL) { + /* not set in stanzas above */ oh->open = txt_open_o; oh->write = txt_write_o; oh->close = txt_close_o; } return(oh); } int bz2_open_o(OutputHandler *oh) { oh->fout = fopen(oh->path, "w"); oh->bzstream = BZ2_bzWriteOpen(&(oh->bzerror), oh->fout, oh->bz_blocksize, oh->bz_verbosity, oh->bz_workfactor); if (oh->bzerror != BZ_OK) { fprintf(stderr, "error %d trying to open %s for compression\n", oh->bzerror, oh->path); exit(-1); } + oh->closed = 0; return(1); } +int bz2_open_a(OutputHandler *oh) { + if (oh->path != NULL) + oh->fout = fopen(oh->path, "a"); + + oh->bzstream = BZ2_bzWriteOpen(&(oh->bzerror), oh->fout, oh->bz_blocksize, + oh->bz_verbosity, oh->bz_workfactor); + if (oh->bzerror != BZ_OK) { + fprintf(stderr, "error %d trying to open %s for compression\n", + oh->bzerror, oh->path); + exit(-1); + } + oh->closed = 0; + return(1); +} + +void outputhandler_appendmode(OutputHandler *oh) { + char *dotPosition = NULL; + + oh->open = NULL; + + if (strlen(oh->path) > 4 && !strcmp(oh->path + strlen(oh->path) - 4, ".bz2")) + oh->open = bz2_open_a; + else if (strlen(oh->path) > 3 && !strcmp(oh->path + strlen(oh->path) - 3, ".gz")) + oh->open = gz_open_a; + else { + dotPosition = strrchr(oh->path, '.'); + if (dotPosition != NULL) { + *dotPosition = '\0'; + if (strlen(oh->path) > 4 && !strcmp(oh->path+(strlen(oh->path)-4),".bz2")) { + /* filename ends in .bz2.something */ + oh->open = bz2_open_a; + } + else if (strlen(oh->path) > 3 && !strcmp(oh->path+(strlen(oh->path)-3),".gz")) { + /* filename ends in .gz.something */ + oh->open = gz_open_a; + } + *dotPosition = '.'; + } + } + + if (oh->open == NULL) { + /* not set in stanzas above */ + oh->open = txt_open_a; + } +} + int bz2_write_o(OutputHandler *oh, char *buffer, int bytecount) { BZ2_bzWrite(&(oh->bzerror), oh->bzstream, buffer, bytecount); if (oh->bzerror != BZ_OK) { fprintf(stderr, "error %d trying to write to %s\n", oh->bzerror, oh->path); return(0); } return(bytecount); } int bz2_close_o(OutputHandler *oh) { - unsigned int bytes_in; - unsigned int bytes_out; - - BZ2_bzWriteClose(&(oh->bzerror), oh->bzstream, 0, &bytes_in, &bytes_out); - fclose(oh->fout); + BZ2_bzWriteClose64(&(oh->bzerror), oh->bzstream, 0, + &(oh->bytes_in_low), &(oh->bytes_in_hi), + &(oh->bytes_out_low), &(oh->bytes_out_hi)); + oh->bytes_in_low_cumul += oh->bytes_in_low; + oh->bytes_in_hi_cumul += oh->bytes_in_hi; + oh->bytes_out_low_cumul += oh->bytes_out_low; + oh->bytes_out_hi_cumul += oh->bytes_out_hi; + + if (oh->fout != stdout) + fclose(oh->fout); + oh->closed = 1; return(0); } int gz_open_o(OutputHandler *oh) { oh->gzstream = gzopen(oh->path, "w"); gzbuffer(oh->gzstream, oh->gz_bufsize); + oh->closed = 0; + return(0); +} + +int gz_open_a(OutputHandler *oh) { + oh->gzstream = gzopen(oh->path, "a"); + gzbuffer(oh->gzstream, oh->gz_bufsize); + oh->closed = 0; return(0); } int gz_write_o(OutputHandler *oh, char *buffer, int bytecount) { return(gzwrite(oh->gzstream, buffer, bytecount)); } int gz_close_o(OutputHandler *oh) { + gzflush(oh->gzstream, Z_FINISH); + oh->offset_gz = gzoffset(oh->gzstream); gzclose(oh->gzstream); + oh->closed = 1; return(0); } int txt_open_o(OutputHandler *oh) { oh->fout = fopen(oh->path, "w"); + oh->closed = 0; + return(0); +} + +int txt_open_a(OutputHandler *oh) { + oh->fout = fopen(oh->path, "a"); + oh->closed = 0; return(0); } int txt_write_o(OutputHandler *oh, char *buffer, int bytecount) { - return(fwrite(buffer, 1, bytecount, oh->fout)); + size_t results; + + results = fwrite(buffer, 1, bytecount, oh->fout); + oh->offset_txt += results; + return(0); } int txt_close_o(OutputHandler *oh) { fclose(oh->fout); + oh->closed = 1; return(0); } + +off_t outputhandler_get_offset(OutputHandler *oh) { + if (oh->gzstream) + return oh->offset_gz; + else if (oh->bzstream) + return ((off_t)oh->bytes_out_low_cumul | ((off_t)oh->bytes_out_hi_cumul << 32)); + else + return oh->offset_txt; +} diff --git a/xmldumps-backup/mwbzutils/iohandlers.h b/xmldumps-backup/mwbzutils/iohandlers.h index 889e442..c1f72e2 100644 --- a/xmldumps-backup/mwbzutils/iohandlers.h +++ b/xmldumps-backup/mwbzutils/iohandlers.h @@ -1,95 +1,124 @@ #ifndef _IOHANDLERS_H #define _IOHANDLERS_H #include #include #include #include #include #include #include #include #include typedef struct { char buf[65536]; int nextin; /* pointer to next byte available for reading stuff in from file */ int nextout; /* pointer to next byte available for consumption by caller */ int bytes_avail; /* number of bytes avail for consumption */ } bz2buffer_t; typedef struct { char *path; FILE *fin; BZFILE *bzstream; gzFile gzstream; int gz_bufsize; int bzerror; int bz_verbosity; int bz_small; char *bz_unused; int bz_nUnused; bz2buffer_t *bz_buffer; int (*open)(); char *(*fgets)(); int (*close)(); + int (*eof)(); } InputHandler; int bz2_open_i(InputHandler *ih); char *bz2_fgets_i(InputHandler *ih, char *buffer, int bytecount); int bz2_close_i(InputHandler *ih); +int bz2_eof_i(InputHandler *ih); int gz_open_i(InputHandler *ih); char *gz_fgets_i(InputHandler *ih, char *buffer, int bytecount); int gz_close_i(InputHandler *ih); +int gz_eof_i(InputHandler *ih); int txt_open_i(InputHandler *ih); char *txt_fgets_i(InputHandler *ih, char *buffer, int bytecount); int txt_close_i(InputHandler *ih); +int txt_eof_i(InputHandler *ih); void free_bz2buf(bz2buffer_t *b); bz2buffer_t *init_bz2buf(); InputHandler *inputhandler_init(char *path); int isfull(bz2buffer_t *buf); -void fill_buffer(bz2buffer_t *buf, BZFILE *fd); +int isempty(bz2buffer_t *buf); +int fill_buffer(bz2buffer_t *buf, BZFILE *fd); int has_newline(bz2buffer_t *buf); char *bz2gets(BZFILE *fd, bz2buffer_t *buf, char *out, int out_size); typedef struct { FILE *fin; char *path; FILE *fout; BZFILE *bzstream; gzFile gzstream; int gz_bufsize; int bzerror; int bz_blocksize; int bz_verbosity; int bz_workfactor; + /* offset trackers for bz2 files */ + unsigned int bytes_in_low; + unsigned int bytes_in_hi; + unsigned int bytes_out_low; + unsigned int bytes_out_hi; + + unsigned int bytes_in_low_cumul; + unsigned int bytes_in_hi_cumul; + unsigned int bytes_out_low_cumul; + unsigned int bytes_out_hi_cumul; + + /* offset trackers for gz files */ + z_off_t offset_gz; + + /* offset trackers for plain text files */ + size_t offset_txt; + + int closed; + int (*open)(); int (*write)(); int (*close)(); } OutputHandler; int bz2_open_o(OutputHandler *oh); +int bz2_open_a(OutputHandler *oh); int bz2_write_o(OutputHandler *oh, char *buffer, int bytecount); int bz2_close_o(OutputHandler *oh); int gz_open_o(OutputHandler *oh); +int gz_open_a(OutputHandler *oh); int gz_write_o(OutputHandler *oh, char *buffer, int bytecount); int gz_close_o(OutputHandler *oh); int txt_open_o(OutputHandler *oh); +int txt_open_a(OutputHandler *oh); int txt_write_o(OutputHandler *oh, char *buffer, int bytecount); int txt_close_o(OutputHandler *oh); OutputHandler *outputhandler_init(char *path); +void outputhandler_appendmode(OutputHandler *oh); +off_t outputhandler_get_offset(OutputHandler *oh); #endif diff --git a/xmldumps-backup/mwbzutils/recompressxml.c b/xmldumps-backup/mwbzutils/recompressxml.c index 2bd0844..4501e77 100644 --- a/xmldumps-backup/mwbzutils/recompressxml.c +++ b/xmldumps-backup/mwbzutils/recompressxml.c @@ -1,409 +1,366 @@ #include #include #include #include #include #include #include #include #include #include #include #include #include +#include "iohandlers.h" #include "bzlib.h" char inBuf[4096]; char outBuf[8192]; char inBuf_indx[4096]; char outBuf_indx[8192]; char *pageOpenTag = "\n"; char *pageTitleExpr = "(.+)\n"; regmatch_t *matchPageTitleExpr; regex_t compiledMatchPageTitleExpr; char *idExpr = "([0-9]+)\n"; regmatch_t *matchIdExpr; regex_t compiledMatchIdExpr; -bz_stream strm_indx; - void usage(char *message) { char * help = "Usage: recompressxml --pagesperstream n [--buildindex filename] [--verbose]\n" " or: recompressxml [--version|--help]\n\n" "Reads a stream of XML pages from stdin and writes to stdout the bz2 compressed\n" "data, one bz2 stream (header, blocks, footer) per specified number of pages.\n\n" "Options:\n\n" " -p, --pagesperstream: Compress this number of pages in each complete\n" " bz2stream before opening a new stream. The siteinfo\n" " header is written to a separate stream at the beginning\n" " of all output, and the closing mediawiki tag is written\n" " into a separate stream at the end.\n" " -b, --buildindex: Generate a file containing an index of pages ids and titles\n" " per stream. Each line contains: offset-to-stream:pageid:pagetitle\n" " If filename ends in '.bz2' or '.bz2' plus a file extension .[a-z]*,\n" -" the file will be written in bz2 format.\n" +" the file will be written in bz2 format; if it ends in '.gz' or \n" +" '.gz' plus a file extension .[a-z]*, it wll be written in gz format.\n" +" -i --inpath: If not specified, input stream will be read from stdin. Otherwise,\n" +" it will be read from the specified file; if the file ends in gz\n" +" of .bz2 it will be decompressed on the fly.\n" +" -o --outpath: If not specified, output will be written to stdout, otherwise to\n" +" the file specified. If the filename ends in .bz2 or .gz, it will\n" +" use the appropriate compression. IF NOT it will be written\n" +" uncompressed, which probably defeats the point of this program.\n" " -v, --verbose: Write lots of debugging output to stderr. This option can be used\n" " multiple times to increase verbosity.\n" " -h, --help Show this help message\n" " -V, --version Display the version of this program and exit\n\n" "Report bugs in recompressxml to .\n\n" "See also checkforbz2footer(1), dumpbz2filefromoffset(1), dumplastbz2block(1),\n" "findpageidinbz2xml(1), writeuptopageid(1)\n\n"; if (message) { fprintf(stderr,"%s\n\n",message); } fprintf(stderr,"%s",help); exit(-1); } void show_version(char *version_string) { char * copyright = "Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n" "This program is free software: you can redistribute it and/or modify it\n" "under the terms of the GNU General Public License as published by the\n" "Free Software Foundation, either version 2 of the License, or (at your\n" "option) any later version.\n\n" "This program is distributed in the hope that it will be useful, but\n" "WITHOUT ANY WARRANTY; without even the implied warranty of \n" "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" "Public License for more details.\n\n" "You should have received a copy of the GNU General Public License along\n" "with this program. If not, see \n\n" "Written by Ariel T. Glenn.\n"; fprintf(stderr,"recompressxml %s\n", version_string); fprintf(stderr,"%s",copyright); exit(-1); } -void setupIndexBz2Stream() { - int bz_verbosity = 0; - int bz_workFactor = 0; - int bz_blockSize100k = 9; - - strm_indx.bzalloc = NULL; - strm_indx.bzfree = NULL; - strm_indx.opaque = NULL; - - /* init bzip compression stuff */ - BZ2_bzCompressInit(&(strm_indx), bz_blockSize100k, bz_verbosity, bz_workFactor); -} - void setupRegexps() { matchPageTitleExpr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); regcomp(&compiledMatchPageTitleExpr, pageTitleExpr, REG_EXTENDED); matchIdExpr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); regcomp(&compiledMatchIdExpr, idExpr, REG_EXTENDED); return; } int startsPage(char *buf) { while (*buf == ' ') buf++; if (!strcmp(buf,pageOpenTag)) return 1; else return 0; } char *hasPageTitle(char *buf) { static char pageTitle[513]; int length = 0; pageTitle[0]='\0'; while (*buf == ' ') buf++; if (regexec(&compiledMatchPageTitleExpr, buf, 2, matchPageTitleExpr, 0 ) == 0) { if (matchPageTitleExpr[1].rm_so >=0) { length = matchPageTitleExpr[1].rm_eo - matchPageTitleExpr[1].rm_so; if (length > 512) { fprintf(stderr,"Page title length > 512 bytes... really? Bailing.\n"); exit(1); } strncpy(pageTitle,buf+matchPageTitleExpr[1].rm_so, length); pageTitle[length] = '\0'; } } return(pageTitle); } int hasId(char *buf) { int id = 0; while (*buf == ' ') buf++; if (regexec(&compiledMatchIdExpr, buf, 2, matchIdExpr, 0 ) == 0) { if (matchIdExpr[1].rm_so >=0) { id = atoi(buf+matchIdExpr[1].rm_so); } } return(id); } int endsXmlBlock(char *buf, int header) { char *pageCloseTag = "\n"; char *mediawikiCloseTag = "\n"; char *siteinfoCloseTag = "\n"; while (*buf == ' ') buf++; /* if we are trying to process the header, check for that only */ if (header) { if (!strcmp(buf,siteinfoCloseTag)) return 1; else return 0; } /* normal check for end of page, end of content */ if (!strcmp(buf,pageCloseTag) || !strcmp(buf,mediawikiCloseTag)) return 1; else return 0; } -off_t endBz2Stream(bz_stream *strm, char *outBuf, int bufSize, FILE *fd) { - int result; - off_t offset; - - do { - strm->avail_in = 0; - strm->next_out = outBuf; - strm->avail_out = 8192; - result = BZ2_bzCompress ( strm, BZ_FINISH ); - fwrite(outBuf,bufSize-strm->avail_out,1,fd); - } while (result != BZ_STREAM_END); - offset = (off_t)strm->total_out_lo32 | ((off_t)strm->total_out_hi32 << 32); - BZ2_bzCompressEnd(strm); - return(offset); -} - -void writeCompressedXmlBlock(int header, int count, off_t *fileOffset, FILE *indexfd, int indexcompressed, int verbose) +void writeCompressedXmlBlock(int header, int count, off_t *fileOffset, InputHandler *ihandler, + OutputHandler *ohandler, OutputHandler *index_ohandler,int verbose) { - bz_stream strm; - int bz_verbosity = 0; - int bz_workFactor = 0; - int bz_blockSize100k = 9; int wroteSomething = 0; int blocksDone = 0; - strm.bzalloc = NULL; - strm.bzfree = NULL; - strm.opaque = NULL; - char *pageTitle = NULL; int pageId = 0; enum States{WantPage,WantPageTitle,WantPageId}; int state = WantPage; - /* init bzip compression stuff */ - BZ2_bzCompressInit(&strm, bz_blockSize100k, bz_verbosity, bz_workFactor); + /* if we're past the first block, we append the rest */ + if (!header && ohandler->path != NULL) + outputhandler_appendmode(ohandler); + + if (ohandler->closed && ohandler->open != NULL) + ohandler->open(ohandler); + if (verbose > 1) + fprintf(stderr,"opened the output file if needed\n"); - while (fgets(inBuf, sizeof(inBuf), stdin) != NULL) { + while (ihandler->fgets(ihandler, inBuf, sizeof(inBuf)-1) != NULL) { if (verbose > 1) { fprintf(stderr,"input buffer is: "); fprintf(stderr,"%s",inBuf); } wroteSomething = 1; - /* add the buffer content to stuff to be compressed */ - strm.next_in = inBuf; - strm.avail_in = strlen(inBuf); - strm.next_out = outBuf; - strm.avail_out = 8192; - - /* we are to build an index. */ - if (indexfd) { + if (index_ohandler) { if (verbose > 2) { fprintf(stderr,"doing index check\n"); } if (state == WantPage) { if (verbose > 2) { fprintf(stderr,"checking for page tag\n"); } if (startsPage(inBuf)) { state = WantPageTitle; } } else if (state == WantPageTitle) { if (verbose > 1) { fprintf(stderr,"checking for page title tag\n"); } pageTitle = hasPageTitle(inBuf); if (pageTitle[0]) { state = WantPageId; } } else if (state == WantPageId) { if (verbose > 1) { fprintf(stderr,"checking for page id tag\n"); } pageId = hasId(inBuf); if (pageId) { state = WantPage; - if (indexcompressed) { - if (verbose) { - fprintf(stderr,"writing line to compressed index file\n"); - } - sprintf(inBuf_indx,"%"PRId64":%d:%s\n",*fileOffset,pageId,pageTitle); - strm_indx.next_in = inBuf_indx; - strm_indx.avail_in = strlen(inBuf_indx); - do { - if (verbose > 2) { - fprintf(stderr,"bytes left to read for index compression: %d\n",strm_indx.avail_in); - } - strm_indx.next_out = outBuf_indx; - strm_indx.avail_out = 8192; - BZ2_bzCompress ( &strm_indx, BZ_RUN ); - fwrite(outBuf_indx,sizeof(outBuf_indx)-strm_indx.avail_out,1,indexfd); - } while (strm_indx.avail_in >0); - } - else { - if (verbose) { - fprintf(stderr,"writing line to index file\n"); - } - fprintf(indexfd,"%"PRId64":%d:%s\n",*fileOffset,pageId,pageTitle); + if (verbose) { + fprintf(stderr,"writing line to index file\n"); } + sprintf(outBuf_indx,"%"PRId64":%d:%s\n",*fileOffset,pageId,pageTitle); + index_ohandler->write(index_ohandler,outBuf_indx,strlen(outBuf_indx)); pageId = 0; pageTitle = NULL; } } } - do { - if (verbose > 2) { - fprintf(stderr,"bytes left to read for text compression: %d\n",strm.avail_in); - } - strm.next_out = outBuf; - strm.avail_out = 8192; - BZ2_bzCompress ( &strm, BZ_RUN ); - fwrite(outBuf,sizeof(outBuf)-strm.avail_out,1,stdout); - } while (strm.avail_in > 0); - if (verbose > 1) fprintf(stderr,"avail_out is now: %d\n", strm.avail_out); - + if (inBuf[0]) + ohandler->write(ohandler, inBuf, strlen(inBuf)); if (endsXmlBlock(inBuf, header)) { /* special case: doing the siteinfo stuff at the beginning */ + inBuf[0] = '\0'; if (verbose) { - fprintf(stderr,"end of header found\n"); + fprintf(stderr,"end of header, page, or mw found\n"); } if (header) { - *fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout); + *fileOffset = outputhandler_get_offset(ohandler); return; } - blocksDone++; if (blocksDone % count == 0) { if (verbose) fprintf(stderr, "end of xml block found\n"); - /* close down bzip stream, we are done with this block */ - *fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout); + /* close down stream, we are done with this block */ + if (ohandler->close) + ohandler->close(ohandler); + *fileOffset = outputhandler_get_offset(ohandler); return; } } + inBuf[0] = '\0'; } if (verbose) fprintf(stderr,"eof reached\n"); if (wroteSomething) { - /* close down bzip stream, we are done with this block */ - *fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout); + /* close down stream, we are done with this block */ + if (ohandler->close) + ohandler->close(ohandler); + *fileOffset = outputhandler_get_offset(ohandler); + return; } + /* done with all input so close up shop */ + if (ohandler->close) + ohandler->close(ohandler); return; } int main(int argc, char **argv) { int optindex=0; int optc; off_t offset; struct option optvalues[] = { {"buildindex", 1, 0, 'b'}, + {"inpath", 1, 0, 'i'}, + {"outpath", 1, 0, 'o'}, {"help", 0, 0, 'h'}, {"pagesperstream", 1, 0, 'p'}, {"verbose", 0, 0, 'v'}, {"version", 0, 0, 'V'}, {NULL, 0, NULL, 0} }; int count = 0; char *indexFilename = NULL; + char *inpath = NULL; int verbose = 0; FILE *indexfd = NULL; - int indexcompressed = 0; - char *dotPosition = NULL; + char *outpath = NULL; + InputHandler *ihandler = NULL; + OutputHandler *ohandler = NULL; + OutputHandler *index_ohandler = NULL; while (1) { - optc=getopt_long_only(argc,argv,"p:b:v", optvalues, &optindex); + optc=getopt_long_only(argc,argv,"p:b:i:o:v", optvalues, &optindex); if (optc=='b') { indexFilename = optarg; } + else if (optc=='i') { + inpath = optarg; + } + else if (optc=='o') { + outpath = optarg; + } else if (optc=='h') usage(NULL); else if (optc=='p') { if (!(isdigit(optarg[0]))) usage(NULL); count=atoi(optarg); } else if (optc=='v') verbose++; else if (optc=='V') show_version(VERSION); else if (optc==-1) break; else usage("unknown option or other error\n"); } if (count <= 0) { usage("bad or no argument given for count.\n"); } if (indexFilename) { if (verbose) { fprintf(stderr,"setting up index file creation.\n"); } indexfd = fopen(indexFilename, "w"); if (! indexfd) { usage("failed to open index file for write.\n"); } - if (!strcmp(indexFilename+(strlen(indexFilename)-4),".bz2")) { - /* filename ends in .bz2 */ - indexcompressed++; - } - else { - dotPosition = strrchr(indexFilename, '.'); - if (dotPosition != NULL) { - *dotPosition = '\0'; - if (!strcmp(indexFilename+(strlen(indexFilename)-4),".bz2")) { - /* filename ends in .bz2.something */ - indexcompressed++; - } - *dotPosition = '.'; - } - } - if (indexcompressed) { - if (verbose) { - fprintf(stderr,"index file will be bz2 compressed.\n"); - } - setupIndexBz2Stream(); - } + index_ohandler = outputhandler_init(indexFilename); + if (index_ohandler->open != NULL) + index_ohandler->open(index_ohandler); + } + + ihandler = inputhandler_init(inpath); + if (ihandler->open != NULL) { + ihandler->open(ihandler); } + ohandler = outputhandler_init(outpath); + setupRegexps(); offset = (off_t)0; /* deal with the XML header */ - writeCompressedXmlBlock(1,count,&offset,indexfd,indexcompressed,verbose); + writeCompressedXmlBlock(1,count,&offset,ihandler,ohandler,index_ohandler,verbose); - while (!feof(stdin)) { - writeCompressedXmlBlock(0,count,&offset,indexfd,indexcompressed,verbose); + if (verbose) { + if (ihandler->eof(ihandler)) + fprintf(stderr, "EOF reached for input file\n"); + } + while (!ihandler->eof(ihandler)) { + writeCompressedXmlBlock(0,count,&offset,ihandler,ohandler,index_ohandler,verbose); + if (verbose) { + if (ihandler->eof(ihandler)) + fprintf(stderr, "EOF reached for input file\n"); + } } if (indexFilename) { - if (indexcompressed) { - if (verbose) { - fprintf(stderr,"closing bz2 index file stream.\n"); - } - endBz2Stream(&strm_indx, outBuf_indx, sizeof(outBuf_indx), indexfd); - } if (verbose) { fprintf(stderr,"closing index file.\n"); } - fclose(indexfd); + if (index_ohandler->close != NULL) + index_ohandler->close(index_ohandler); } + if (ihandler->close != NULL) + ihandler->close(ihandler); exit(0); }