diff --git a/xmldumps-backup/mwbzutils/Makefile b/xmldumps-backup/mwbzutils/Makefile index 90dd3e8..261282c 100644 --- a/xmldumps-backup/mwbzutils/Makefile +++ b/xmldumps-backup/mwbzutils/Makefile @@ -1,170 +1,180 @@ # ------------------------------------------------------------------ # This Makefile builds binaries which rely on three source files # from libbzip2 version 1.0.6. (See bz2libfuncs.c, bzlib.h and # bzlib_private.h; the first is slightly modified while the # second is unchanged from the library version.) # # The copyright for those two files is as follows: # # bzip2/libbzip2 version 1.0.6 of 6 September 2010 # Copyright (C) 1996-2010 Julian Seward # # Those files are released under the terms of the license contained # in the file LICENSE_BZ. # # All other files are released under the GPL, copyright (C) Ariel T. Glenn # 2010-2019: see the file COPYING for details. # ------------------------------------------------------------------ VERSION = "0.0.9" CC ?= gcc BIGFILES = -D_FILE_OFFSET_BITS=64 CPPFLAGS += $(BIGFILES) -DVERSION=\"$(VERSION)\" CFLAGS ?= -Wall -Werror -O2 build: checkforbz2footer dumpbz2filefromoffset \ dumplastbz2block findpageidinbz2xml \ recompressxml writeuptopageid compressedmanpages \ - getlastidinbz2xml + getlastidinbz2xml showcrcs NAME_CHECKFORBZ2FOOTER = "Check if bzip2 file ends with bz2 magic footer" NAME_DUMPBZ2FILEFROMOFFSET = "Write MediaWiki XML pages from bzip2 file starting from offset" NAME_DUMPLASTBZ2BLOCK = "Find last bz2 block in bzip2 file and dump contents" NAME_FINDPAGEIDINBZ2XML = "Display offset of bz2 block for given page id in bzip2 MediaWiki XML file" -NAME_FINDLASTPAGEIDINBZ2XML = "Display last page id bzip2 MediaWiki XML file" +NAME_GETLASTIDINBZ2XML = "Display last page id bzip2 MediaWiki XML file" NAME_RECOMPRESSXML = "Bz2 compress MediaWiki XML input in batches of pages" +NAME_SHOWCRCS = "Show crcs and offsets of blocks in bz2-compressed file" NAME_WRITEUPTOPAGEID = "Write range of page content from MediaWiki XML input" PREFIX ?= "/usr/local" BINDIR = $(DESTDIR)$(PREFIX)/bin/ 
MANDIR = $(DESTDIR)$(PREFIX)/share/man/man1/ DOCDIR = $(DESTDIR)$(PREFIX)/share/doc/mwbzutils/ GZIP = /bin/gzip HELP2MAN = /usr/bin/help2man SHELL = /bin/sh DISTNAME = mwbzutils-$(VERSION) LIBS = -lbz2 OBJSBZ = bzlibfuncs.o OBJS = mwbzlib.o $(OBJSBZ) checkforbz2footer: $(OBJSBZ) mwbzlib.o checkforbz2footer.o $(CC) $(LDFLAGS) -o checkforbz2footer checkforbz2footer.o $(OBJS) $(LIBS) dumpbz2filefromoffset: $(OBJSBZ) mwbzlib.o dumpbz2filefromoffset.o $(CC) $(LDFLAGS) -o dumpbz2filefromoffset dumpbz2filefromoffset.o $(OBJS) $(LIBS) dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o $(CC) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o $(OBJS) $(LIBS) findpageidinbz2xml: $(OBJSBZ) mwbzlib.o httptiny.o findpageidinbz2xml.o $(CC) $(LDFLAGS) -o findpageidinbz2xml findpageidinbz2xml.o httptiny.o $(OBJS) $(LIBS) -lz getlastidinbz2xml: $(OBJSBZ) mwbzlib.o getlastidinbz2xml.o $(CC) $(LDFLAGS) -o getlastidinbz2xml getlastidinbz2xml.o $(OBJS) $(LIBS) recompressxml: $(OBJSBZ) iohandlers.o recompressxml.o $(CC) $(LDFLAGS) -o recompressxml iohandlers.o recompressxml.o $(LIBS) -lz +showcrcs: $(OBJSBZ) mwbzlib.o showcrcs.o + $(CC) $(LDFLAGS) -o showcrcs showcrcs.o $(OBJS) $(LIBS) + writeuptopageid: $(OBJSBZ) iohandlers.o writeuptopageid.o $(CC) $(LDFLAGS) -o writeuptopageid iohandlers.o writeuptopageid.o $(LIBS) -lz %.o: %.c $(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@ compressedmanpages: docs/dumplastbz2block.1.gz docs/findpageidinbz2xml.1.gz \ docs/checkforbz2footer.1.gz docs/dumpbz2filefromoffset.1.gz \ - docs/recompressxml.1.gz docs/writeuptopageid.1.gz + docs/recompressxml.1.gz docs/showcrcs.1.gz docs/writeuptopageid.1.gz docs/%.1.gz: docs/%.1 cat $< | $(GZIP) > $@ # this target should only be made when updating the source if the version # or the usage mssages change manpages: dumplastbz2block.1 findpageidinbz2xml.1 \ checkforbz2footer.1 dumpbz2filefromoffset.1 \ recompressxml.1 writeuptopageid.1 \ - getlastidinbz2xml.1 + getlastidinbz2xml.1 showcrcs.1 echo "Don't forget to 
commit your manpage changes to the repo" checkforbz2footer.1 : checkforbz2footer - $(HELP2MAN) --section 1 --no-info --name $(NAME_CHECKFORBZ2FOOTER) \ + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_CHECKFORBZ2FOOTER) \ --no-discard-stderr ./checkforbz2footer > docs/checkforbz2footer.1 dumpbz2filefromoffset.1 : dumpbz2filefromoffset - $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPBZ2FILEFROMOFFSET) \ + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPBZ2FILEFROMOFFSET) \ --no-discard-stderr ./dumpbz2filefromoffset > docs/dumpbz2filefromoffset.1 dumplastbz2block.1 : dumplastbz2block - $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPLASTBZ2BLOCK) \ + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPLASTBZ2BLOCK) \ --no-discard-stderr ./dumplastbz2block > docs/dumplastbz2block.1 findpageidinbz2xml.1 : findpageidinbz2xml - $(HELP2MAN) --section 1 --no-info --name $(NAME_FINDPAGEIDINBZ2XML) \ + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_FINDPAGEIDINBZ2XML) \ --no-discard-stderr ./findpageidinbz2xml > docs/findpageidinbz2xml.1 getlastidinbz2xml.1 : getlastidinbz2xml - $(HELP2MAN) --section 1 --no-info --name $(NAME_GETLASTIDINBZ2XML) \ + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_GETLASTIDINBZ2XML) \ --no-discard-stderr ./getlastidinbz2xml > docs/getlastidinbz2xml.1 recompressxml.1 : recompressxml - $(HELP2MAN) --section 1 --no-info --name $(NAME_RECOMPRESSXML) \ + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_RECOMPRESSXML) \ --no-discard-stderr ./recompressxml > docs/recompressxml.1 +showcrcs.1 : showcrcs + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_SHOWCRCS) \ + --no-discard-stderr ./showcrcs > docs/showcrcs.1 writeuptopageid.1 : writeuptopageid - $(HELP2MAN) --section 1 --no-info --name $(NAME_WRITEUPTOPAGEID) \ + LC_TIME=C $(HELP2MAN) --section 1 --no-info --name $(NAME_WRITEUPTOPAGEID) \ --no-discard-stderr ./writeuptopageid > docs/writeuptopageid.1 install: 
dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset \ - recompressxml writeuptopageid compressedmanpages getlastidinbz2xml + recompressxml writeuptopageid compressedmanpages getlastidinbz2xml showcrcs install --directory $(BINDIR) install --mode=755 checkforbz2footer $(BINDIR) install --mode=755 dumplastbz2block $(BINDIR) install --mode=755 dumpbz2filefromoffset $(BINDIR) install --mode=755 findpageidinbz2xml $(BINDIR) install --mode=755 getlastidinbz2xml $(BINDIR) install --mode=755 recompressxml $(BINDIR) + install --mode=755 showcrcs $(BINDIR) install --mode=755 writeuptopageid $(BINDIR) install --directory $(MANDIR) install --mode=644 docs/*.1.gz $(MANDIR) install --directory $(DOCDIR) install --mode=644 README $(DOCDIR) install --mode=644 LICENSE_BZ $(DOCDIR) install --mode=644 COPYING $(DOCDIR) uninstall: rm -f $(BINDIR)dumplastbz2block rm -f $(BINDIR)findpageidinbz2xml rm -f $(BINDIR)getlastidinbz2xml rm -f $(BINDIR)checkforbz2footer rm -f $(BINDIR)dumpbz2filefromoffset rm -f $(BINDIR)recompressxml + rm -f $(BINDIR)showcrcs rm -f $(BINDIR)writeuptopageid rm -f $(MANDIR)*.1.gz rm -f $(DOCDIR)README rm -f $(DOCDIR)LICENSE_BZ rm -f $(DOCDIR)COPYING clean: rm -f *.o *.a dumplastbz2block findpageidinbz2xml \ getlastidinbz2xml \ checkforbz2footer dumpbz2filefromoffset \ - recompressxml writeuptopageid docs/*.1.gz + recompressxml showcrcs writeuptopageid \ + docs/*.1.gz distclean: clean rm -f $(DISTNAME) rm -f *.tar.gz reallyclean: distclean rm -f docs/*.1 dist: rm -f $(DISTNAME) ln -s -f . $(DISTNAME) tar cvf $(DISTNAME).tar \ $(DISTNAME)/*.c \ $(DISTNAME)/*.h \ $(DISTNAME)/Makefile \ $(DISTNAME)/LICENSE_BZ \ $(DISTNAME)/COPYING \ $(DISTNAME)/README \ $(DISTNAME)/CHANGES \ $(DISTNAME)/docs/*1 gzip -v $(DISTNAME).tar diff --git a/xmldumps-backup/mwbzutils/README b/xmldumps-backup/mwbzutils/README index a377a96..84265f6 100644 --- a/xmldumps-backup/mwbzutils/README +++ b/xmldumps-backup/mwbzutils/README @@ -1,85 +1,89 @@ What is this? 
It is a tiny suite of utilities that hapless WMF employees use to massage the XML dump files so that we can produce them on a more regular basis. More specifically, they allow us to do various things with bz2 files quickly instead of requiring a serial read/decompress of the file. Some of these files range from 2 to 30 GB in size, so serial access is too slow. The files bz2libfuncs.c, bzlib.h and bzlib_private.h are taken from bzip2/libbzip2 version 1.0.6 of 6 September 2010 (Copyright (C) 1996-2010 Julian Seward ) and as such their copyright license is in the file LICENSE_BZ; all other files in the package are released under the GPL, see the file COPYING for details. Scripts: check_bz2_pagerange.py - Checks that the first and last page of a bz2 content checkpoint filename match the contents in the file, i.e. the first page id in the name is the first page id contained in the file, and the same for the last page id. This uses the MediaWiki api as well as the getlastidinbz2xml utility for which see below. Utilities: checkforbz2footer - Tests to see if the bz2 file specified on the command line has a bz2 footer (if it does it is likely to be intact). Exits with 0 if found, 1 otherwise. dumpbz2filefromoffset - Uncompresses the file from the first bz2 block found after the specified offset, and dumps the results to stdout. This will first look for and dump the header, up to and including the tag; then it will find the first tag in the first bz2 block after the specified output and dump the contents from that point on. dumplastbz2block - Finds the last bz2 block marker in a file and dumps whatever can be decompressed after that point; the header of the file must be intact in order for any output to be produced. This will produce output for truncated files as well, as long as there is "enough" data after the bz2 block marker. Exits with 0 if decompression of some data can be done, 1 if decompression fails, and -1 on error. 
findpageidinbz2xml - Given a bzipped and possibly truncated file, and a page id, hunt for the page id in the file; this assumes that the bz2 header is intact and that page ids are steadily increasing throughout the file. It writes the offset of the relevant block (from beginning of file) and the first pageid found in that block, to stdout. Format of output: position:xxxxx pageid:nnn It exits with 0 on success, -1 on error. getlastidinbz2xml - Given a bzipped xml content file and a page or rev id and the type (either 'page' or 'rev'), return the last such id in the xml file. recompressxml - Reads an xml stream of pages and writes multiple bz2 compressed streams, concatenated, to stdout, with the specified number of pages per stream. The mediawiki site info header is in its own bz2 stream. Each stream can be extracted as a separate file by an appropriate tool, checking for the byte-aligned string "BZh91AY&SY" and a following tag (after uncompressing the first chunk of data after that string). Alternatively, a tool can seek to the location of one of the streams in order to find a particular page. An index of file-offset:page-id:page-title lines is written to a specified file if desired; the index file will be bz2 compressed if the filename given ends with .bz2. +showcrcs - Given a bzip2 compressed file, extracts the byte offsets and crcs + for each block and for the entire file, and displays them; it can + also try to compute the file crc from the block crcs and display that. + Library routines: mwbzlib.c - various utility functions (bitmasks, shifting and comparing bytes, setting up bz2 files for decompression, etc) External library routines: bz2libfuncs.c - the BZ2_bzDecompress() routine, modified so that it does not do a check of the cumulative CRC (since we read from an arbitrary point in most of these files, we won't have a cumulative CRC that makes any sense). 
It's a one line fix but it requires unRLE_obuf_to_output_FAST() which is marked static in the original library, so that's in here too. diff --git a/xmldumps-backup/mwbzutils/dumplastbz2block.c b/xmldumps-backup/mwbzutils/dumplastbz2block.c index 03b14a3..396c4bb 100644 --- a/xmldumps-backup/mwbzutils/dumplastbz2block.c +++ b/xmldumps-backup/mwbzutils/dumplastbz2block.c @@ -1,149 +1,149 @@ #include #include #include #include #include #include #include #include #include #include #include "mwbzutils.h" void usage(char *message) { char * help = "Usage: dumplastbz2block [--version|--help]\n" " or: dumplastbz2block \n\n" "Find the last bz2 block marker in a file and dump whatever can be\n" "decompressed after that point. The header of the file must be intact\n" "in order for any output to be produced.\n" "This will produce output for truncated files as well, as long as there\n" "is 'enough' data after the block marker.\n" "Exits with 0 if some decompressed data was written, 1 if no data could\n" "be uncompressed and -1 on error.\n\n" "Options:\n\n" "Flags:\n\n" " -h, --help Show this help message\n" " -v, --version Display the version of this program and exit\n\n" "Arguments:\n\n" " Name of the file to process\n\n" "Report bugs in dumplastbz2block to .\n\n" "See also checkforbz2footer(1), dumpbz2filefromoffset(1), findpageidinbz2xml(1),\n" "recompressxml(1), writeuptopageid(1)\n\n"; if (message) { fprintf(stderr,"%s\n\n",message); } fprintf(stderr,"%s",help); exit(-1); } void show_version(char *version_string) { char * copyright = "Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. 
All rights reserved.\n\n" "This program is free software: you can redistribute it and/or modify it\n" "under the terms of the GNU General Public License as published by the\n" "Free Software Foundation, either version 2 of the License, or (at your\n" "option) any later version.\n\n" "This program is distributed in the hope that it will be useful, but\n" "WITHOUT ANY WARRANTY; without even the implied warranty of \n" "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" "Public License for more details.\n\n" "You should have received a copy of the GNU General Public License along\n" "with this program. If not, see \n\n" "Written by Ariel T. Glenn.\n"; fprintf(stderr,"dumplastbz2block %s\n", version_string); fprintf(stderr,"%s",copyright); exit(-1); } int main(int argc, char **argv) { bz_info_t bfile; int fin; int result; buf_info_t *b; int length = 5000; /* output buffer size */ int optc; int optindex=0; struct option optvalues[] = { {"help", 0, 0, 'h'}, {"version", 0, 0, 'v'}, {NULL, 0, NULL, 0} }; if (argc != 2) { usage("Missing option or argument."); exit(-1); } while (1) { optc=getopt_long_only(argc,argv,"hv", optvalues, &optindex); if (optc=='h') usage(NULL); else if (optc=='v') show_version(VERSION); else if (optc==-1) break; else usage("Unknown option or other error\n"); } if (optind >= argc) { usage("Missing filename argument."); } fin = open (argv[optind], O_RDONLY); if (fin < 0) { fprintf(stderr,"failed to open file %s for read\n", argv[optind]); exit(-1); } bfile.file_size = get_file_size(fin); bfile.footer = init_footer(); bfile.marker = NULL; result = check_file_for_footer(fin, &bfile); if (result == -1) { bfile.position = bfile.file_size; } else { bfile.position = bfile.file_size - (off_t)11; /* size of footer, perhaps with 1 byte extra */ } bfile.position -=(off_t)6; /* size of marker */ bfile.initialized = 0; b = init_buffer(length); bfile.bytes_read = 0; - if (find_first_bz2_block_from_offset(&bfile, fin, bfile.position, 
BACKWARD) <= (off_t)0) { + if (find_first_bz2_block_from_offset(&bfile, fin, bfile.position, BACKWARD, (off_t)0, 1) <= (off_t)0) { fprintf(stderr,"failed to find block in bz2file\n"); exit(-1); } while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof) && (! bfile.position == (off_t)0)) { if (bfile.bytes_read) { fwrite(b->next_to_read,b->bytes_avail,1,stdout); b->next_to_read = b->end; b->bytes_avail = 0; b->next_to_fill = b->buffer; /* empty */ bfile.strm.next_out = (char *)b->next_to_fill; bfile.strm.avail_out = b->end - b->next_to_fill; } else { /* should never happen */ fprintf(stderr,"there was a block but now it's gone, giving up\n"); exit(-1); } } if (b->bytes_avail) { fwrite(b->next_to_read,b->bytes_avail,1,stdout); b->next_to_read = b->end; b->bytes_avail = 0; b->next_to_fill = b->buffer; /* empty */ bfile.strm.next_out = (char *)b->next_to_fill; bfile.strm.avail_out = b->end - b->next_to_fill; } close(fin); exit(0); } diff --git a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c index 05d0aa1..eb3ee93 100644 --- a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c +++ b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c @@ -1,652 +1,652 @@ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mwbzutils.h" void usage(char *message) { char * help = "Usage: findpageidinbz2xml --filename file --pageid id [--stubfile] [--useapi] [--verbose]\n" " [--help] [--version]\n\n" "Show the offset of the bz2 block in the specified MediaWiki XML dump file\n" "containing the given page id. 
This assumes that the bz2 header of the file\n" "is intact and that page ids are steadily increasing throughout the file.\n\n" "If the page id is found, a line in the following format will be written to stdout:\n" " position:xxxxx pageid:nnn\n\n" "where 'xxxxx' is the offset of the block from the beginning of the file, and\n" "'nnn' is the id of the first page encountered in that block.\n\n" "Note:\n" "This program may use the MediaWiki api to find page ids from revision ids\n" "if 'useapi' is specified.\n" "It may use a stub file to find page ids from rev ids if 'stubfile' is specified.\n" "It will only do one of the above if it has been reading from the file for some\n" "large number of iterations without findind a page tag (some pages have > 500K\n" "revisions and a heck of a lot of text).\n" "If both 'useapi' and 'stubfile' are specified, the api will be used as it is faster.\n\n" "Exits with 0 in success, -1 on error.\n\n" "Options:\n\n" " -f, --filename name of file to search\n" " -p, --pageid page_id of page for which to search\n" " -s, --stubfile name of MediaWiki XML stub file to fall back on (see 'Note' above)\n" " -a, --useapi fall back to the api if stuck (see 'Note' above)\n" " -V, --verbose show search process; specify multiple times for more output\n" " -h, --help Show this help message\n" " -V, --version Display the version of this program and exit\n\n" "Report bugs in findpageidinbz2xml to .\n\n" "See also dumpbz2filefromoffset(1), dumplastbz2block(1), findpageidinbz2xml(1),\n" "recompressxml(1), writeuptopageid(1)\n\n"; if (message) { fprintf(stderr,"%s\n\n",message); } fprintf(stderr,"%s",help); exit(-1); } void show_version(char *version_string) { char * copyright = "Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. 
All rights reserved.\n\n" "This program is free software: you can redistribute it and/or modify it\n" "under the terms of the GNU General Public License as published by the\n" "Free Software Foundation, either version 2 of the License, or (at your\n" "option) any later version.\n\n" "This program is distributed in the hope that it will be useful, but\n" "WITHOUT ANY WARRANTY; without even the implied warranty of \n" "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" "Public License for more details.\n\n" "You should have received a copy of the GNU General Public License along\n" "with this program. If not, see \n\n" "Written by Ariel T. Glenn.\n"; fprintf(stderr,"findpageidinbz2xml %s\n", version_string); fprintf(stderr,"%s",copyright); exit(-1); } /* find the first bz2 block marker in the file, from its current position, then set up for decompression from that point returns: 0 on success -1 if no marker or other error */ int init_and_read_first_buffer_bz2_file(bz_info_t *bfile, int fin) { int res; bfile->bufin_size = BUFINSIZE; bfile->marker = init_marker(); bfile->bytes_read = 0; bfile->bytes_written = 0; bfile->eof = 0; bfile->file_size = get_file_size(fin); bfile->initialized++; res = find_next_bz2_block_marker(fin, bfile, FORWARD); if (res ==1) { init_decompress(bfile); decompress_header(fin, bfile); setup_first_buffer_to_decompress(fin, bfile); return(0); } else { fprintf(stderr,"Failed to find the next block marker\n"); return(-1); } } extern char * geturl(char *hostname, int port, char *url); char *get_hostname_from_xml_header(int fin) { regmatch_t *match_base_expr; regex_t compiled_base_expr; /* http://el.wiktionary.org/wiki/... 
*/ /* http://trouble.localdomain/wiki/ */ char *base_expr = "http://([^/]+)/"; int length=5000; /* output buffer size */ buf_info_t *b; bz_info_t bfile; int hostname_length = 0; off_t old_position; static char hostname[256]; bfile.initialized = 0; bfile.marker = NULL; regcomp(&compiled_base_expr, base_expr, REG_EXTENDED); match_base_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*2); b = init_buffer(length); bfile.bytes_read = 0; bfile.position = (off_t)0; old_position = lseek(fin,(off_t)0,SEEK_CUR); lseek(fin,(off_t)0,SEEK_SET); while ((get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD)>=0) && (! bfile.eof)) { /* so someday the header might grow enough that isn't in the first 1000 characters but we'll ignore that for now */ if (bfile.bytes_read && b->bytes_avail > 1000) { /* get project name and language name from the file header format: Βικιλεξικό http://el.wiktionary.org/wiki/... */ if (regexec(&compiled_base_expr, (char *)b->next_to_read, 2, match_base_expr, 0 ) == 0) { if (match_base_expr[1].rm_so >=0) { hostname_length = match_base_expr[1].rm_eo - match_base_expr[1].rm_so; - if (hostname_length > sizeof(hostname)) { + if (hostname_length >= sizeof(hostname)) { fprintf(stderr,"Very long hostname, giving up\n"); break; } else { memcpy(hostname,(char *)b->next_to_read + match_base_expr[1].rm_so, hostname_length); hostname[hostname_length] = '\0'; b->next_to_read = b->end; b->bytes_avail = 0; b->next_to_fill = b->buffer; /* empty */ bfile.strm.next_out = (char *)b->next_to_fill; bfile.strm.avail_out = b->end - b->next_to_fill; BZ2_bzDecompressEnd ( &(bfile.strm) ); lseek(fin,old_position,SEEK_SET); free_buffer(b); return(hostname); } } } else { break; } } } BZ2_bzDecompressEnd ( &(bfile.strm) ); lseek(fin,old_position,SEEK_SET); free_buffer(b); return(NULL); } int has_xml_tag(char *line, char *tag) { return(! 
strncmp(line,tag,strlen(tag))); } /* assumes the open tag, close tag and avlaue are all on the same line */ long int get_xml_elt_value(char *line, char *tag) { return(atol(line+strlen(tag))); } /* returns pageid, or -1 on error. this requires the name of a stub file which contains all page ids and revisions ids in our standard xml format. It scans through the entire file looking for the page id which corresponds to the revision id. This can take up to 5 minutes for the larger stub history files; clearly we don't want to do this unless we have no other option. we need this in the case where the page text is huge (eg en wp pageid 5137507 which has a cumulative text length across all revisions of > 163 GB. This can take over two hours to uncompress and scan through looking for the next page id, so we cheat */ long int get_page_id_from_rev_id_via_stub(long int rev_id, char *stubfile) { gzFile gz; int page_id = -1; char buf[8192]; char *bufp; enum States{WantPage,WantPageID,WantRevOrPage,WantRevID}; int state; long int temp_rev_id; gz = gzopen(stubfile,"r"); state = WantPage; while ((bufp = gzgets(gz,buf,8191)) != NULL) { while (*bufp == ' ') bufp++; if (state == WantPage) { if (has_xml_tag(bufp,"")) { state = WantPageID; } } else if (state == WantPageID) { if (has_xml_tag(bufp,"")) { page_id = get_xml_elt_value(bufp,""); state = WantRevOrPage; } } else if (state == WantRevOrPage) { if (has_xml_tag(bufp,"")) { state = WantRevID; } else if (has_xml_tag(bufp,"")) { state = WantPageID; } } else if (state == WantRevID) { if (has_xml_tag(bufp,"")) { temp_rev_id = get_xml_elt_value(bufp,""); if (temp_rev_id == rev_id) { return(page_id); } /* this permits multiple revs in the page */ state = WantRevOrPage; } } } return(-1); } /* returns pageid, or -1 on error. 
this requires network access, it does an api call to the appropriate server for the appropriate project we need this in the case where the page text is huge (eg en wp pageid 5137507 which has a cumulative text length across all revisions of > 163 GB. This can take over two hours to uncompress and scan through looking for the next page id, so we cheat */ int get_page_id_from_rev_id_via_api(long int rev_id, int fin) { /* char hostname[80]; */ char *hostname; char url[80]; char *buffer; long int page_id = -1; char *api_call = "/w/api.php?action=query&format=xml&revids="; regmatch_t *match_page_id_expr; regex_t compiled_page_id_expr; char *page_id_expr = " sizeof(hostname)-2) { fprintf(stderr,"language code plus project name is huuuge string, giving up\n"); return(-1); } sprintf(hostname,"%s.%s.org",lang,project); */ sprintf(url,"%s%ld",api_call,rev_id); buffer = geturl(hostname, 80, url); if (buffer == NULL) { return(-1); } else { /* dig the page id out of the buffer format: */ match_page_id_expr = (regmatch_t *)malloc(sizeof(regmatch_t)*3); regcomp(&compiled_page_id_expr, page_id_expr, REG_EXTENDED); if (regexec(&compiled_page_id_expr, buffer, 3, match_page_id_expr, 0 ) == 0) { if (match_page_id_expr[2].rm_so >=0) { page_id = atol(buffer + match_page_id_expr[2].rm_so); } } return(page_id); } } /* get the first page id after position in file if a pageid is found, the structure pinfo will be updated accordingly use_api nonzero means that we will fallback to ask the api about a page that contains a given rev_id, in case we wind up with a huge page which has piles of revisions and we aren't seeing a page tag in a reasonable period of time. 
returns: 1 if a pageid found, 0 if no pageid found, -1 on error */ int get_first_page_id_after_offset(int fin, off_t position, id_info_t *pinfo, int use_api, int use_stub, char *stubfilename, int verbose) { regmatch_t *match_page, *match_page_id, *match_rev, *match_rev_id; regex_t compiled_page, compiled_page_id, compiled_rev, compiled_rev_id; int length=5000; /* output buffer size */ char *page = ""; char *page_id = "\n[ ]+[^<]+\n([ ]+[0-9]+\n)?[ ]+([0-9]+)\n"; char *rev = ""; char *rev_id_expr = "\n[ ]+([0-9]+)\n"; buf_info_t *b; bz_info_t bfile; long int rev_id=0; long int page_id_found=0; int buffer_count = 0; bfile.initialized = 0; bfile.marker = NULL; regcomp(&compiled_page, page, REG_EXTENDED); regcomp(&compiled_page_id, page_id, REG_EXTENDED); regcomp(&compiled_rev, rev, REG_EXTENDED); regcomp(&compiled_rev_id, rev_id_expr, REG_EXTENDED); match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1); match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*3); match_rev = (regmatch_t *)malloc(sizeof(regmatch_t)*1); match_rev_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2); b = init_buffer(length); pinfo->bits_shifted = -1; pinfo->position = (off_t)-1; pinfo->id = -1; bfile.bytes_read = 0; - if (find_first_bz2_block_from_offset(&bfile, fin, position, FORWARD) <= (off_t)0) { + if (find_first_bz2_block_from_offset(&bfile, fin, position, FORWARD, (off_t)0, 1) <= (off_t)0) { if (verbose) fprintf(stderr,"failed to find block in bz2file after offset %"PRId64" (1)\n", position); return(-1); } if (verbose) fprintf(stderr,"found first block in bz2file after offset %"PRId64"\n", position); while (!get_buffer_of_uncompressed_data(b, fin, &bfile, FORWARD) && (! 
bfile.eof)) { buffer_count++; if (verbose >=2) fprintf(stderr,"buffers read: %d\n", buffer_count); if (bfile.bytes_written) { while (regexec(&compiled_page_id, (char *)b->next_to_read, 3, match_page_id, 0 ) == 0) { if (match_page_id[2].rm_so >=0) { if (verbose){ fwrite(b->next_to_read+match_page_id[2].rm_so, sizeof(unsigned char), match_page_id[2].rm_eo - match_page_id[2].rm_so, stderr); fwrite("\n",1,1,stderr); } pinfo->id = atoi((char *)(b->next_to_read+match_page_id[2].rm_so)); pinfo->position = bfile.block_start; pinfo->bits_shifted = bfile.bits_shifted; return(1); /* write up to and including page id tag to stdout */ /* fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout); b->next_to_read = b->next_to_read+match_page_id[0].rm_eo; b->bytes_avail -= match_page_id[0].rm_eo; */ } else { /* should never happen */ fprintf(stderr,"regex gone bad...\n"); exit(-1); } } if (use_api || use_stub) { if (!rev_id) { if (regexec(&compiled_rev_id, (char *)b->next_to_read, 2, match_rev_id, 0 ) == 0) { if (match_rev_id[1].rm_so >=0) { rev_id = atoi((char *)(b->next_to_read+match_rev_id[1].rm_so)); } } } /* this needs to be called if we don't find a page by X tries, or Y buffers read, and we need to retrieve a page id from a revision id in the text instead where does this obscure figure come from? assume we get at least 2-1 compression ratio, text revs are at most 10mb plus a little, then if we read this many buffers we should have at least one rev id in there. 20 million / 5000 or whatever it is, is 4000 buffers full of crap hopefully that doesn't take forever. 
*/ if (buffer_count>(20000000/BUFINSIZE) && rev_id) { if (verbose) fprintf(stderr, "passed retries cutoff for using api\n"); if (use_api) { page_id_found = get_page_id_from_rev_id_via_api(rev_id, fin); } else { /* use_stub */ page_id_found = get_page_id_from_rev_id_via_stub(rev_id, stubfilename); } pinfo->id = page_id_found +1; /* want the page after this offset, not the one we're in */ pinfo->position = bfile.block_start; pinfo->bits_shifted = bfile.bits_shifted; return(1); } } /* FIXME this is probably wrong */ if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) { /* write everything up to but not including the page tag to stdout */ /* fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout); */ move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so); bfile.strm.next_out = (char *)b->next_to_fill; bfile.strm.avail_out = b->end - b->next_to_fill; } else if ((use_api || use_stub) && (regexec(&compiled_rev, (char *)b->next_to_read, 1, match_rev, 0 ) == 0)) { /* write everything up to but not including the rev tag to stdout */ /* fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout); */ move_bytes_to_buffer_start(b, b->next_to_read + match_rev[0].rm_so, b->bytes_avail - match_rev[0].rm_so); bfile.strm.next_out = (char *)b->next_to_fill; bfile.strm.avail_out = b->end - b->next_to_fill; } else { /* could have the first part of the page or the rev tag... 
so copy up enough bytes to cover that case */ if (b->bytes_avail> 10) { /* write everything that didn't match, but leave 10 bytes, to stdout */ /* fwrite(b->next_to_read,b->bytes_avail - 10,1,stdout); */ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 10, 10); bfile.strm.next_out = (char *)b->next_to_fill; bfile.strm.avail_out = b->end - b->next_to_fill; } else { if (buffer_is_empty(b)) { bfile.strm.next_out = (char *)b->buffer; bfile.strm.avail_out = bfile.bufout_size; b->next_to_fill = b->buffer; /* empty */ } else { /* there were only 10 or less bytes so just save em don't write em to stdout */ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail); bfile.strm.next_out = (char *)b->next_to_fill; bfile.strm.avail_out = b->end - b->next_to_fill; } } } } } /* if (b->bytes_avail) { fwrite(b->next_to_read,b->bytes_avail,1,stdout); } */ return(0); } /* search for pageid in a bz2 file, given start and end offsets to search for we guess by the most boring method possible (shrink the interval according to the value found on the last guess, try midpoint of the new interval) multiple calls of this will get the job done. interval has left end = right end if search is complete. this function may return the previous guess and simply shrink the interval. note that a "match" means either that the pageid we find is smaller than the one the caller wants, or is equal. why? because then we can use the output for prefetch for xml dumps and be sure a specific page range is covered :-P return value from guess, or -1 on error. 
*/ int do_iteration(iter_info_t *iinfo, int fin, id_info_t *pinfo, int use_api, int use_stub, char *stubfilename, int verbose) { int res; off_t new_position; off_t interval; /* last_position is somewhere in the interval, perhaps at an end last_value is the value we had at that position */ interval = (iinfo->right_end - iinfo->left_end)/(off_t)2; if (interval == (off_t)0) { interval = (off_t)1; } if (verbose) fprintf(stderr,"interval size is %"PRId64", left end %"PRId64", right end %"PRId64", last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); /* if we're this close, we'll check this value and be done with it */ if (iinfo->right_end -iinfo->left_end < (off_t)2) { new_position = iinfo->left_end; if (verbose >= 2) fprintf(stderr," choosing new position (1) %"PRId64"\n",new_position); iinfo->right_end = iinfo->left_end; } else { if (iinfo->last_value < iinfo->value_wanted) { if (verbose >= 2) fprintf(stderr,"resetting left end\n"); iinfo->left_end = iinfo->last_position; new_position = iinfo->last_position + interval; if (verbose >= 2) fprintf(stderr," choosing new position (2) %"PRId64"\n",new_position); } /* iinfo->last_value > iinfo->value_wanted */ else { if (verbose >=2) fprintf(stderr,"resetting right end\n"); iinfo->right_end = iinfo->last_position; new_position = iinfo->last_position - interval; if (new_position < 0) new_position = 0; if (verbose >= 2) fprintf(stderr," choosing new position (3) %"PRId64"\n",new_position); } } res = get_first_page_id_after_offset(fin, new_position, pinfo, use_api, use_stub, stubfilename, verbose); if (res >0) { /* caller wants the new value */ iinfo->last_value = pinfo->id; iinfo->last_position = new_position; return(pinfo->id); } else { /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or there's just no block here. 
set the right end, keep the last value and position and let the caller retry with the new interval */ if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */ iinfo->right_end = new_position; return(iinfo->last_value); } /* in theory we were moving towards beginning of file, should not have issues, so bail here */ else { if (verbose) fprintf(stderr,"something very broken, giving up\n"); return(-1); } } } int main(int argc, char **argv) { int fin, res, page_id=0; off_t file_size; id_info_t pinfo; iter_info_t iinfo; char *filename = NULL; int optindex=0; int use_api = 0; int use_stub = 0; int verbose = 0; int optc; char *stubfile=NULL; struct option optvalues[] = { {"filename", 1, 0, 'f'}, {"help", 0, 0, 'h'}, {"pageid", 1, 0, 'p'}, {"useapi", 0, 0, 'a'}, {"stubfile", 1, 0, 's'}, {"verbose", 0, 0, 'v'}, {"version", 0, 0, 'V'}, {NULL, 0, NULL, 0} }; while (1) { optc=getopt_long_only(argc,argv,"f:hp:as:vV", optvalues, &optindex); if (optc=='f') { filename=optarg; } else if (optc=='p') { if (!(isdigit(optarg[0]))) usage(NULL); page_id=atoi(optarg); } else if (optc=='a') use_api=1; else if (optc=='s') { use_stub=1; stubfile = optarg; } else if (optc=='h') usage(NULL); else if (optc=='v') verbose++; else if (optc=='V') show_version(VERSION); else if (optc==-1) break; else usage("Unknown option or other error\n"); } if (! filename || ! 
page_id) { usage(NULL); } if (page_id <1) { usage("Please specify a page_id >= 1.\n"); } fin = open (filename, O_RDONLY); if (fin < 0) { fprintf(stderr,"Failed to open file %s for read\n", filename); exit(1); } file_size = get_file_size(fin); pinfo.bits_shifted = -1; pinfo.position = (off_t)-1; pinfo.id = -1; iinfo.left_end = (off_t)0; iinfo.right_end = file_size; iinfo.value_wanted = page_id; res = get_first_page_id_after_offset(fin, (off_t)0, &pinfo, use_api, use_stub, stubfile, verbose); if (res > 0) { iinfo.last_value = pinfo.id; iinfo.last_position = (off_t)0; } else { fprintf(stderr,"Failed to find any page from start of file, exiting\n"); exit(1); } if (pinfo.id == page_id) { if (verbose) fprintf(stderr,"found the page id right away, no iterations needed.\n"); fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.id); exit(0); } if (pinfo.id > page_id) { fprintf(stderr,"Page requested is less than first page id in file\n"); exit(-1); } while (1) { res = do_iteration(&iinfo, fin, &pinfo, use_api, use_stub, stubfile, verbose); if (res < 0) { fprintf(stderr,"Error encountered during search\n"); exit(-1); } else if (iinfo.left_end == iinfo.right_end) { if ( pinfo.id <= page_id) { fprintf(stdout,"position:%"PRId64" page_id:%d\n",pinfo.position, pinfo.id); exit(0); } else { fprintf(stderr,"File does not contain requested page id\n"); exit(-1); } } } exit(0); } diff --git a/xmldumps-backup/mwbzutils/getlastidinbz2xml.c b/xmldumps-backup/mwbzutils/getlastidinbz2xml.c index 6b26bb5..72ad3bb 100644 --- a/xmldumps-backup/mwbzutils/getlastidinbz2xml.c +++ b/xmldumps-backup/mwbzutils/getlastidinbz2xml.c @@ -1,304 +1,304 @@ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mwbzutils.h" void usage(char *message) { char * help = "Usage: getlastidinbz2xml --filename file --type type [--verbose]\n" " [--help] [--version]\n\n" "Show the last page or rev id in the specified MediaWiki 
XML dump file.\n" "This assumes that the last bz2 block(s) of the file are intact.\n" "Exits with 0 in success, -1 on error.\n\n" "Options:\n\n" " -f, --filename name of file to search\n" " -t, --type type of id to find: 'page' or 'rev'\n" " -v, --verbose show search process; specify multiple times for more output\n" " -h, --help Show this help message\n" " -V, --version Display the version of this program and exit\n\n" "Report bugs in getlastidinbz2xml to .\n\n" "See also dumpbz2filefromoffset(1), dumplastbz2block(1), findpageidinbz2xml(1),\n" "recompressxml(1), writeuptopageid(1)\n\n"; if (message) { fprintf(stderr,"%s\n\n",message); } fprintf(stderr,"%s",help); exit(-1); } void show_version(char *version_string) { char * copyright = "Copyright (C) 2017 Ariel T. Glenn. All rights reserved.\n\n" "This program is free software: you can redistribute it and/or modify it\n" "under the terms of the GNU General Public License as published by the\n" "Free Software Foundation, either version 2 of the License, or (at your\n" "option) any later version.\n\n" "This program is distributed in the hope that it will be useful, but\n" "WITHOUT ANY WARRANTY; without even the implied warranty of \n" "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" "Public License for more details.\n\n" "You should have received a copy of the GNU General Public License along\n" "with this program. If not, see \n\n" "Written by Ariel T. 
Glenn.\n"; fprintf(stderr,"getlastidinbz2xml %s\n", version_string); fprintf(stderr,"%s",copyright); exit(-1); } /* if any id of the specified type is found, appropriate updates will be made to id_info no updates are made to the buffer about consumed data, the caller is responsible */ void find_last_id_in_buffer(buf_info_t *buffer, id_info_t *id_info, bz_info_t *bfile, char *type, int verbose) { regmatch_t *match_id; regex_t compiled_id; char *page_id_pattern = "\n[ ]+[^<]+\n([ ]+[0-9]+\n)?[ ]+([0-9]+)\n"; char *rev_id_pattern = "\n[ ]+([0-9]+)\n"; char *match_from ; int index; if (buffer_is_empty(buffer)) return; match_id = (regmatch_t *)malloc(sizeof(regmatch_t)*3); match_from = (char *)buffer->next_to_read; if (! strcmp(type, "rev")) { index = 1; regcomp(&compiled_id, rev_id_pattern, REG_EXTENDED); } else if (! strcmp(type, "page")) { index = 2; regcomp(&compiled_id, page_id_pattern, REG_EXTENDED); } else { fprintf(stderr, "unknown type of tag to find, %s, giving up\n", type); exit(-1); } while (regexec(&compiled_id, match_from, 3, match_id, 0) == 0) { /* found one, yay */ if (match_id[index].rm_so >=0) { id_info->id = atoi((char *)(match_from +match_id[index].rm_so)); id_info->position = bfile->block_start; id_info->bits_shifted = bfile->bits_shifted; /* get ready to search rest of buffer */ match_from += match_id[0].rm_eo; } else { /* should never happen */ fprintf(stderr,"regex gone bad...\n"); exit(-1); } } free(match_id); regfree(&compiled_id); return; } void init_id_info(id_info_t *id_info) { id_info->bits_shifted = -1; id_info->position = (off_t)-1; id_info->id = -1; return; } /* get the last page or rev id after position in file expect position to be the start of a bz2 block if an id tag is found, the structure id_info will be updated accordingly returns: 1 if an id tag found, 0 if no id tag found, -1 on error */ int get_last_id_after_offset(int fin, id_info_t *id_info, bz_info_t *bfile, off_t upto, char *type, int verbose) { int length=5000; /* output 
buffer size */ buf_info_t *b; const int KEEP = 310; b = init_buffer(length); init_id_info(id_info); /* try to fill the buffer, unless of course we hit eof */ /* could be a case where they read no bytes, more bytes are avail in buffer, we hit eof. what then? */ /* while ((res = get_buffer_of_uncompressed_data(b, fin, bfile, FORWARD) >=0) && (! bfile->eof)) { */ /* while (!get_buffer_of_uncompressed_data(b, fin, bfile, FORWARD) && (! bfile->eof)) { */ while (get_buffer_of_uncompressed_data(b, fin, bfile, FORWARD) >= 0 && (! bfile -> eof)) { find_last_id_in_buffer(b, id_info, bfile, type, verbose); /* did we hit eof? then th-th-that's all folks */ if (bfile->eof) break; /* We keep reading more buffers because we want the _last_ page/rev id, not the first one */ else if (buffer_is_empty(b)) { /* entire buffer is now available for next read */ bfile->strm.next_out = (char *)b->buffer; bfile->strm.avail_out = bfile->bufout_size; b->next_to_fill = b->buffer; } else if (b->bytes_avail> KEEP) { /* dump contents of buffer except last KEEP chars, move those to front so we can keep reading. We keep that much in case somewhere near the end was a page/rev tag or a page/rev id tag that got cut off in the middle. */ move_bytes_to_buffer_start(b, b->end - KEEP, KEEP); bfile->strm.next_out = (char *)b->next_to_fill; bfile->strm.avail_out = b->end - b->next_to_fill; } else { /* move available bytes (don't have KEEP) up to front */ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail); bfile->strm.next_out = (char *)b->next_to_fill; bfile->strm.avail_out = b->end - b->next_to_fill; } if (bfile->position > upto) { /* we're done */ break; } } if (bfile->eof || bfile->position > upto) { /* see what's left in the buffer after eof. 
maybe we got something good */ find_last_id_in_buffer(b, id_info, bfile, type, verbose); BZ2_bzDecompressEnd(&(bfile->strm)); free_buffer(b); free(b); if (id_info->id == -1) return 0; /* not found */ else if (id_info->id > 0) return 1; /* found */ else return(-1); /* error */ } else { /* we have an error from get_buffer_of_uncompressed_data */ BZ2_bzDecompressEnd(&(bfile->strm)); free_buffer(b); free(b); fprintf(stderr,"freed buffer\n"); return(-1); /* error */ } } int giveup(int fin) { fprintf(stderr,"Failed to find any id tags in file, exiting\n"); close(fin); exit(1); } int main(int argc, char **argv) { int fin, res, id=0; off_t block_end, block_start, upto; id_info_t id_info; char *filename = NULL; char *type = NULL; int optindex=0; bz_info_t bfile; int verbose = 0; int optc; int result; struct option optvalues[] = { {"filename", 1, 0, 'f'}, {"type", 1, 0, 't'}, {"verbose", 0, 0, 'v'}, {"version", 0, 0, 'V'}, {NULL, 0, NULL, 0} }; while (1) { optc = getopt_long_only(argc,argv,"f:hvV", optvalues, &optindex); if (optc=='f') { filename=optarg; } else if (optc == 't') { type = optarg; } else if (optc == 'h') usage(NULL); else if (optc == 'v') verbose++; else if (optc == 'V') show_version(VERSION); else if (optc == -1) break; else usage("Unknown option or other error\n"); } if (! 
filename) { usage(NULL); } fin = open (filename, O_RDONLY); if (fin < 0) { fprintf(stderr,"Failed to open file %s for read\n", filename); exit(1); } bfile.file_size = get_file_size(fin); bfile.footer = init_footer(); bfile.marker = init_marker(); result = check_file_for_footer(fin, &bfile); if (result == -1) { bfile.position = bfile.file_size; } else { bfile.position = bfile.file_size - (off_t)11; /* size of footer, perhaps with 1 byte extra */ } bfile.position -=(off_t)6; /* size of marker */ bfile.initialized = 0; bfile.bytes_read = 0; /* start at end of file */ block_end = bfile.position; upto = block_end; block_start = (off_t)-1; id = 0; while (!id) { bfile.initialized = 0; init_decompress(&bfile); - block_start = find_first_bz2_block_from_offset(&bfile, fin, block_end, BACKWARD); + block_start = find_first_bz2_block_from_offset(&bfile, fin, block_end, BACKWARD, bfile.file_size, 1); if (block_start <= (off_t) 0) giveup(fin); BZ2_bzDecompressEnd (&(bfile.strm)); res = get_last_id_after_offset(fin, &id_info, &bfile, upto, type, verbose); if (res > 0) { id = id_info.id; } else { upto = block_end; block_end = block_start - (off_t) 1; if (block_end <= (off_t) 0) giveup(fin); } BZ2_bzDecompressEnd (&(bfile.strm)); } if (!id) giveup(fin); fprintf(stdout, "%s_id:%d\n", type, id); close(fin); exit(0); } diff --git a/xmldumps-backup/mwbzutils/mwbzlib.c b/xmldumps-backup/mwbzutils/mwbzlib.c index 76c6ce0..65fce7e 100644 --- a/xmldumps-backup/mwbzutils/mwbzlib.c +++ b/xmldumps-backup/mwbzutils/mwbzlib.c @@ -1,705 +1,741 @@ #include #include #include #include #include #include #include #include #include #include #include #include "bzlib.h" #include "mwbzutils.h" - /* return n ones either at left or right end */ int bit_mask(int numbits, int end) { if (end == MASKRIGHT) { return((1<> (8-numbits) ) ); } } } void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) { int i; for (i=buflen-1; i>=0; i--) { /* right 1 */ buffer[i] = (unsigned char) ((int) (buffer[i]) 
>> numbits); /* grab rightmost from prev byte */ if (i > 0) { buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT))); } } } unsigned char ** init_marker() { unsigned char **marker = malloc(8*sizeof(unsigned char *)); int i; /* set up block marker plus its various right-shifted incarnations */ for (i = 0; i< 8; i++) { marker[i] = malloc(sizeof(unsigned char)*7); } marker[0][0]= (unsigned char) 0x31; marker[0][1]= (unsigned char) 0x41; marker[0][2]= (unsigned char) 0x59; marker[0][3]= (unsigned char) 0x26; marker[0][4]= (unsigned char) 0x53; marker[0][5]= (unsigned char) 0x59; marker[0][6]= (unsigned char) 0x00; for (i = 1; i< 8; i++) { memcpy((char *)(marker[i]), (char *)(marker[i-1]),7); shift_bytes_right(marker[i],7,1); } return(marker); } /* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1, both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2 matches and 0 otherwise. 
*/ int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) { int i; if (bitsrightshifted == 0) { for (i = 0; i< numbytes; i++) { if (buff1[i] != buff2[i]) { return(1); } } return(0); } else { for (i = 1; i< numbytes-2; i++) { if (buff1[i] != buff2[i]) { return(1); } } /* do leftmost byte */ if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) { return(1); } /* do rightmost byte */ if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) { return(1); } return(0); } } void dump_bfile_info(bz_info_t *bfile) { fprintf(stderr, "bfile->bufin_size: %d\n", bfile->bufin_size); fprintf(stderr, "bfile->buffout_size: %d\n", bfile->bufout_size); fprintf(stderr, "bfile->initialized: %d\n", bfile->initialized); fprintf(stderr, "bfile->block_start: %"PRId64"\n", bfile->block_start); fprintf(stderr, "bfile->something: %d\n", bfile->bits_shifted); fprintf(stderr, "bfile->position: %"PRId64"\n", bfile->position); fprintf(stderr, "bfile->bytes_written: %d\n", bfile->bytes_written); fprintf(stderr, "bfile->eof: %d\n", bfile->eof); fprintf(stderr, "bfile->file_size: %"PRId64"\n", bfile->file_size); } /* return -1 if no match return number of bits rightshifted otherwise */ int check_buffer_for_bz2_block_marker(bz_info_t *bfile) { int result, i; /* dump_bfile_info(bfile); */ - result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0); + result = bytes_compare(bfile->marker[0],bfile->marker_buffer_ptr+1,6,0); if (!result) { return(0); } for (i=1; i<8; i++) { - result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i); + result = bytes_compare(bfile->marker[i],bfile->marker_buffer_ptr,7,i); if (!result) { return(i); } } return(-1); } /* return: 1 if found, 0 if not, -1 on error */ int find_next_bz2_block_marker(int fin, bz_info_t *bfile, int direction) { off_t seekresult; - int res; + /* int res; */ + 
ssize_t bytes_read = 0; + ssize_t bytes_avail = 0; + ssize_t bytes_used = 0; bfile->bits_shifted = -1; - res = read(fin, bfile->marker_buffer, 7); - if (res == -1) { + bytes_read = read(fin, bfile->marker_buffer, sizeof(bfile->marker_buffer)); + if (bytes_read == -1) { fprintf(stderr,"read of file failed\n"); return(-1); } + if (bytes_read < 7) + return(-1); + bytes_avail = bytes_read; + bytes_used = 0; + bfile->marker_buffer_ptr = bfile->marker_buffer; /* must be after 4 byte file header, and we add a leftmost byte to the buffer of data read in case some bits have been shifted into it */ while (bfile->position <= bfile->file_size - 6 && bfile->position >= 0 && bfile->bits_shifted < 0) { bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile); if (bfile->bits_shifted < 0) { if (direction == FORWARD) { bfile->position++; + bytes_used++; + bytes_avail--; + if (bytes_avail < 7) { + /* copy the leftovers to the front of the buffer, fill in as much of the remainder as we can, + reset the pointer to the beginning of the buffer for new checks */ + memmove(bfile->marker_buffer, bfile->marker_buffer + sizeof(bfile->marker_buffer) - bytes_avail, bytes_avail); + bytes_read = read(fin, bfile->marker_buffer + bytes_avail, sizeof(bfile->marker_buffer) - bytes_avail); + bytes_used = 0; + if (bytes_read < 1) + return(-1); + bytes_avail += bytes_read; + if (bytes_avail < 7) + return(-1); + bfile->marker_buffer_ptr = bfile->marker_buffer; + } + else { + bfile->marker_buffer_ptr += 1; + } } else { bfile->position--; - } - seekresult = lseek(fin, bfile->position, SEEK_SET); - if (seekresult == (off_t)-1) { - fprintf(stderr,"lseek of file to %"PRId64" failed (2)\n",bfile->position); - return(-1); - } - res = read(fin, bfile->marker_buffer, 7); - if (res < 7) { - return(-1); + /* we are not clever about searching for block markers walking backwards through + the file. if we need to be, code can be written then. 
*/ + seekresult = lseek(fin, bfile->position, SEEK_SET); + if (seekresult == (off_t)-1) { + fprintf(stderr,"lseek of file to %"PRId64" failed (2)\n",bfile->position); + return(-1); + } } } else { bfile->block_start = bfile->position; return(1); } } return(0); } /* initializes the bz2 strm structure, calls the BZ2 decompression library initializer returns: BZ_OK on success various BZ_ errors on failure (see bzlib.h) */ int init_decompress(bz_info_t *bfile) { int bz_verbosity = 0; int bz_small = 0; int ret; bfile->strm.bzalloc = NULL; bfile->strm.bzfree = NULL; bfile->strm.opaque = NULL; ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small ); if (ret != BZ_OK) { fprintf(stderr,"uncompress failed, err %d\n", ret); return(ret); } return(ret); } /* FIXME do this right. whatever. */ off_t get_file_size(int fin) { off_t seekresult; seekresult = lseek(fin, (off_t)0, SEEK_END); if (seekresult == (off_t)-1) { fprintf(stderr,"lseek of file to 0 failed (6)\n"); } return(seekresult); } /* reads the first 4 bytes from a bz2 file (should be "BZh" followed by the block size indicator, typically "9") and passes them into the BZ2 decompression library. This must be done before decompression of any block of the file is attempted. 
returns: BZ_OK if successful, various BZ_ errors or -1 on failure (see bzlib.h) */ int decompress_header(int fin, bz_info_t *bfile) { int res; off_t seekresult; - seekresult = lseek(fin,(off_t)0,SEEK_SET); - if (seekresult == (off_t)-1) { - fprintf(stderr,"lseek of file to 0 failed (3)\n"); - return(-1); - } - bfile->bytes_read = read(fin, bfile->header_buffer, 4); - if (bfile->bytes_read < 4) { - fprintf(stderr,"failed to read 4 bytes of header\n"); - return(-1); + if (!(bfile->header_read)) { + seekresult = lseek(fin,(off_t)0,SEEK_SET); + if (seekresult == (off_t)-1) { + fprintf(stderr,"lseek of file to 0 failed (3)\n"); + return(-1); + } + bfile->bytes_read = read(fin, bfile->header_buffer, 4); + if (bfile->bytes_read < 4) { + fprintf(stderr,"failed to read 4 bytes of header\n"); + return(-1); + } + bfile->header_read = 1; } bfile->strm.next_in = (char *)bfile->header_buffer; bfile->strm.avail_in = 4; res = BZ2_bzDecompress_mine ( &(bfile->strm) ); if (BZ_OK != res && BZ_STREAM_END != res) { fprintf(stderr,"Corrupt bzip2 header\n"); return(-1); } return(res); } /* seek to appropriate offset as specified in bfile, read compressed data into buffer indicated by bfile, update the bfile structure accordingly, save the overflow byte (bit-shifted data = suck) this is for the *first* buffer of data in a stream, for subsequent buffers use fill_buffer_to_decompress() this will set bfile->eof on eof. no other indicator will be provided. 
returns: 0 on success -1 on error */ int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) { off_t seekresult; if (bfile->bits_shifted == 0) { seekresult = lseek(fin,bfile->position+(off_t)1,SEEK_SET); if (seekresult == -1) { fprintf(stderr,"lseek of file to %"PRId64" failed (4)\n",bfile->position+(off_t)1); return(-1); } } else { seekresult = lseek(fin,bfile->position,SEEK_SET); if (seekresult == -1) { fprintf(stderr,"lseek of file to %"PRId64" failed (5)\n",bfile->position); return(-1); } } bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size); if (bfile->bytes_read > 0) { bfile->overflow = bfile->bufin[bfile->bytes_read-1]; shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted); bfile->strm.next_in = (char *)(bfile->bufin); bfile->strm.avail_in = bfile->bytes_read-1; } if (bfile->bytes_read <=0) { bfile->eof++; } return(0); } /* set up the marker, seek to right place, get first buffer of compressed data for processing bfile->position must be set to desired offset first by caller. 
returns: -1 if no marker or other error, position of next read if ok */
/* NOTE(review): as written, the success path returns 0, not a file position;
   the description above does not match the code. */
int init_bz2_file(bz_info_t *bfile, int fin, int direction) {
  off_t seekresult;

  bfile->bufin_size = BUFINSIZE;
  /* only build the marker table if none has been set yet */
  if (bfile->marker == NULL) bfile->marker = init_marker();
  /* reset per-file bookkeeping before searching for a block */
  bfile->bytes_read = 0;
  bfile->bytes_written = 0;
  bfile->eof = 0;
  /* clearing this makes decompress_header() read the 4 header bytes from the file again */
+ bfile->header_read = 0;
  bfile->initialized++;
  bfile->file_size = get_file_size(fin);
  if (bfile->position > bfile->file_size) {
    fprintf(stderr,"asked for position past end of file\n");
    return(-1);
  }
  seekresult = lseek(fin, bfile->position, SEEK_SET);
  if (seekresult == (off_t)-1) {
    fprintf(stderr,"lseek of file to %"PRId64" failed (9)\n",bfile->position);
    return(-1);
  }
  /* the return value is ignored here; success is detected below through
     bfile->bits_shifted, which the search leaves at -1 when no marker matched */
  find_next_bz2_block_marker(fin, bfile, direction);
  if (bfile->bits_shifted >= 0) {
    /* NOTE(review): the results of these three calls are not checked, so a
       failure in any of them is still reported to the caller as success (0) */
    init_decompress(bfile);
    decompress_header(fin, bfile);
    setup_first_buffer_to_decompress(fin, bfile);
    return(0);
  }
  /* no block marker found from the requested position */
  return(-1);
}

/* read compressed data into buffer indicated by bfile, from current position of file, stuffing the overflow byte in first. update the bfile structure accordingly save the new overflow byte (bit-shifted data = suck) this function is for decompression of buffers *after the first one*. for the first one use setup_first_buffer_to_decompress() this will set bfile->eof on eof. no other indicator will be provided.
returns: 0 on success hmm, it really does not do anything about errors :-D */ int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) { if (bfile->strm.avail_in == 0) { bfile->strm.next_in = (char *)(bfile->bufin); bfile->bufin[0] = bfile->overflow; bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1); if (bfile->bytes_read > 0) { bfile->position+=bfile->bytes_read; bfile->overflow = bfile->bufin[bfile->bytes_read]; shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted); bfile->strm.avail_in = bfile->bytes_read; } else { /* bfile->bytes_read <= 0 */ bfile->strm.avail_in = 1; /* the overflow byte */ bfile->eof++; } } return(0); } /* size of buffer is bytes usable. there will be a null byte at the end what we do with the buffer: - read from front of buffer to end, - fill from point where prev read did not fill buffer, or from where move of data at end of buffer to beginning left room, - mark a string of bytes (starting from what's available to read) as "read" */ buf_info_t *init_buffer(int size) { buf_info_t *b; b = (buf_info_t *)malloc(sizeof(buf_info_t)); b->buffer = malloc(sizeof(unsigned char)*(size+1)); b->buffer[size]='\0'; b->end = b->buffer + size; b->next_to_read = b->end; /* nothing available */ b->bytes_avail = 0; /* bytes to read, nothing available */ b->next_to_fill = b->buffer; /* empty */ b->next_to_fill[0] = '\0'; return(b); } /* free pieces of buf_info_t */ void free_buffer(buf_info_t *b) { if (b) { if (b->buffer) { free(b->buffer); } } return; } /* check if buffer (used for decompressed data output) is empty, returns 1 if so and 0 if not */ int buffer_is_empty(buf_info_t *b) { if (b->bytes_avail == 0) { return(1); } else { return(0); } } /* check if buffer (used for decompressed data output) is full, returns 1 if so and 0 if not I'm not liking this function so well, fixme */ int buffer_is_full(buf_info_t *b) { if (b->next_to_fill == b->end) { return(1); } else { return(0); } } /* get the next buffer of 
uncompressed stuff */ int get_and_decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size, int direction) { int ret; bfile->bufout = bufferout; bfile->bufout_size = bufout_size; bfile->bytes_written = 0; if (! bfile->initialized) { if (init_bz2_file(bfile, fin, direction) == -1) { fprintf(stderr,"failed to initialize bz2file\n"); return(-1); }; bfile->strm.next_out = (char *)bfile->bufout; bfile->strm.avail_out = bfile->bufout_size; } ret = BZ_OK; while (BZ_OK == ret && bfile->bytes_written == 0) { ret = BZ2_bzDecompress_mine ( &(bfile->strm) ); if (BZ_OK == ret || BZ_STREAM_END == ret) { bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout; } else { fprintf(stderr,"error from BZ decompress %d (1)\n",ret); return(-1); } fill_buffer_to_decompress(fin, bfile, ret); } if (ret == BZ_STREAM_END) { bfile->eof++; } return(0); } /* fill output buffer in b with uncompressed data from bfile if this is the first call to the function for this file, the file header will be read, and the first buffer of uncompressed data will be prepared. bfile->position should be set to the offset (from the beginning of file) from which to find the first bz2 block. returns: on success, number of bytes read (may be 0) -1 on error */ int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile, int direction) { int res; if (buffer_is_full(b)) { return(0); } if (buffer_is_empty(b)) { b->next_to_fill = b->buffer; } res = get_and_decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill, direction); if (res <0 ) { return(res); } if (bfile->bytes_written < 0) { fprintf(stderr,"read of file failed\n"); return(-1); } else { /* really?? 
FIXME check this */ if (buffer_is_empty(b)) { b->next_to_read = b->next_to_fill; /* where we just read */ } b->bytes_avail += bfile->bytes_written; b->next_to_fill += bfile->bytes_written; b->next_to_fill[0] = '\0'; return(0); } } void dumpbuf_info_t(buf_info_t *b) { fprintf(stderr, "\n"); fprintf(stderr, "b->buffer: %ld\n", (long int) b->buffer); fprintf(stderr, "b->end: %ld\n", (long int) b->end); fprintf(stderr, "b->next_to_read: %ld\n", (long int) b->next_to_read); fprintf(stderr, "b->next_to_fill: %ld\n", (long int) b->next_to_fill); fprintf(stderr, "b->bytes_avail: %ld\n", (long int) b->bytes_avail); } /* copy text from end of buffer to the beginning, that we want to keep around for further processing (i.e. further regex matches) returns number of bytes copied */ int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes) { int i, tocopy; if (fromwhere >= b->end) { return(0); } else { tocopy = b->end - fromwhere; if (maxbytes && (tocopy > maxbytes)) { tocopy = maxbytes; } for (i = 0; i < tocopy; i++) { b->buffer[i] = fromwhere[i]; } b->next_to_fill = b->buffer + tocopy; b->next_to_fill[0] = '\0'; b->next_to_read = b->buffer; b->bytes_avail = tocopy; return(tocopy); } } unsigned char ** init_footer() { unsigned char **footer = malloc(8*sizeof(unsigned char *)); int i; /* set up footer plus its various right-shifted incarnations */ /* dude why couldn't you have 0 padded each bzip2 block? seriously ... 
*/ for (i = 0; i< 8; i++) { footer[i] = malloc(sizeof(unsigned char)*7); } footer[0][0]= (unsigned char) 0x17; footer[0][1]= (unsigned char) 0x72; footer[0][2]= (unsigned char) 0x45; footer[0][3]= (unsigned char) 0x38; footer[0][4]= (unsigned char) 0x50; footer[0][5]= (unsigned char) 0x90; footer[0][6]= (unsigned char) 0x00; for (i = 1; i< 8; i++) { memcpy((char *)(footer[i]), (char *)(footer[i-1]),7); shift_bytes_right(footer[i],7,1); } return(footer); } int read_footer(unsigned char *buffer, int fin) { off_t seekresult; int res; seekresult = lseek(fin, (off_t)-11, SEEK_END); if (seekresult == (off_t)-1) { fprintf(stderr,"lseek of file failed\n"); return(-1); } res = read(fin, buffer, 11); if (res == -1) { fprintf(stderr,"read of file failed\n"); return(-1); } return(0); } /* return number of bits rightshifted otherwise -1 if no match */ int check_file_for_footer(int fin, bz_info_t *bfile) { unsigned char buffer[11]; int result, i; read_footer(buffer,fin); result = bytes_compare(bfile->footer[0],buffer+1,6,0); if (!result) { return(0); } for (i=1; i<8; i++) { result = bytes_compare(bfile->footer[i],buffer,7,i); if (!result) { return(i); } } return(-1); } void clear_buffer(unsigned char *buf, int length) { int i; for (i=0; iposition will contain the current position of the file (? will it?) + bfile->position will contain the current position of the file and the fle + cursor will be set via lseek to the start of the found block, if do_seek is nonzero bfile->bits_shifted will contain the number of bits that the block is rightshifted bfile->block_start will contain the offset from start of file to the block (this value will always be positive, the value given in the argument "direction" determines whether the block starts before or after the initial file position). 
returns: position of next byte in file to be read, on success 0 if no marker -1 on error */ -off_t find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, off_t position, int direction) { +off_t find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, off_t position, + int direction, off_t filesize, int do_seek) { /* filesize: known size of the file, or 0 to have it looked up with get_file_size(); do_seek: when nonzero, seek fin to the start of the block found before returning, when zero just return bfile->block_start without seeking */ off_t seekresult; int res; unsigned char buffout[5000]; bfile->bufin_size = BUFINSIZE; if (bfile->marker == NULL) bfile->marker = init_marker(); bfile->position = position; bfile->block_start = (off_t)-1; bfile->bytes_read = 0; bfile->bytes_written = 0; bfile->eof = 0; bfile->bits_shifted = -1; bfile->bufout = buffout; /* NOTE(review): buffout is a local array, so bfile->bufout must not be used after this function returns */ bfile->bufout_size = 5000; - bfile->file_size = get_file_size(fin); + if (filesize) /* caller supplied a nonzero size: use it instead of asking again */ + bfile->file_size = filesize; + else + bfile->file_size = get_file_size(fin); while (bfile->bits_shifted < 0) { if (bfile->position > bfile->file_size) { /* search position is past the end of the file: no marker */ return(0); } seekresult = lseek(fin, bfile->position, SEEK_SET); if (seekresult == (off_t)-1) { fprintf(stderr,"lseek of file to %"PRId64" failed (7)\n",bfile->position); return(-1); } - res = find_next_bz2_block_marker(fin, bfile,direction); + res = find_next_bz2_block_marker(fin, bfile, direction); if (res == 1) { init_decompress(bfile); decompress_header(fin, bfile); res = setup_first_buffer_to_decompress(fin, bfile); if (res == -1) { fprintf(stderr,"couldn't get first buffer of data to uncompress\n"); return(-1); } bfile->strm.next_out = (char *)bfile->bufout; bfile->strm.avail_out = bfile->bufout_size; res = BZ2_bzDecompress_mine ( &(bfile->strm) ); /* this means we (probably) have a genuine marker */ if (BZ_OK == res || BZ_STREAM_END == res) { res = BZ2_bzDecompressEnd ( &(bfile->strm) ); bfile->bytes_read = 0; bfile->bytes_written = 0; bfile->eof = 0; - /* leave the file at the right position */ - seekresult = lseek(fin, bfile->block_start, SEEK_SET); - if (seekresult == (off_t)-1) { - fprintf(stderr,"lseek of file to %"PRId64" failed (8)\n",bfile->position); - return(-1); + if (do_seek) { + 
/* leave the file at the right position */ + seekresult = lseek(fin, bfile->block_start, SEEK_SET); + if (seekresult == (off_t)-1) { + fprintf(stderr,"lseek of file to %"PRId64" failed (8)\n",bfile->position); /* NOTE(review): the failed seek targeted bfile->block_start, but the message prints bfile->position */ + return(-1); + } + bfile->position = seekresult; + return(bfile->position); } - bfile->position = seekresult; - return(bfile->position); + else + return(bfile->block_start); /* no seek requested: report the block start without repositioning fin */ } /* right bytes, but there by chance, skip and try again */ else { bfile->position+=(off_t)6; bfile->bits_shifted = -1; bfile->block_start = (off_t)-1; } } else { return(0); } } return(-1); } diff --git a/xmldumps-backup/mwbzutils/mwbzutils.h b/xmldumps-backup/mwbzutils/mwbzutils.h index 0f96058..702dc90 100644 --- a/xmldumps-backup/mwbzutils/mwbzutils.h +++ b/xmldumps-backup/mwbzutils/mwbzutils.h @@ -1,141 +1,143 @@ #ifndef _MWBZUTILS_H #define _MWBZUTILS_H #include "bzlib_private.h" int BZ_API(BZ2_bzDecompress_mine) ( bz_stream *strm ); typedef struct { int id; /* first id in the block */ int bits_shifted; /* block is right shifted this many bits */ off_t position; /* position in file of block */ } id_info_t; #define BUFINSIZE 5000 /* keeps all information about a bzipped file plus input/output buffers for decompression */ typedef struct { unsigned char bufin[BUFINSIZE]; /* compressed data read from file */ unsigned char *bufout; /* uncompressed data, must be allocated by caller */ - unsigned char marker_buffer[7]; /* data to test for bz2 block marker */ + unsigned char marker_buffer[512]; /* data to test for bz2 block marker */ + unsigned char *marker_buffer_ptr; /* pointer into the above buffer for next string of bytes to check */ unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */ - + int header_read; /* set if the bz2 header for this file has been read into the header buffer */ int bufin_size; /* size of input buffer for compressed data */ int bufout_size; /* size of output buffer for decompressed data, may vary at each call */ int initialized; /* whether 
bz2file has been initialized (header processed, seek to some bz2 block in the file and input buffer filled) */ off_t block_start; /* position of bz2 block in file from which we started to read (we read a sequence of bz2 blocks from a given position, this is the offset to the first one) */ - + unsigned char block_info[12]; /* block marker and crc bytes, possibly bit-shifted, for the current block */ bz_stream strm; /* stream structure for libbz2 */ unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte read around so we can grab the lower end bits off the end for sticking in front of the next pile of compressed bytes we read */ int bits_shifted; /* number of bits that the compressed data has been right shifted in the file (if the number is 0, the block marker and subsequent data is byte-aligned) */ unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for locating the marker in a stream of compressed data */ unsigned char **footer; /* bzip2 end of stream footer, plus bit-shifted versions of it for locating the footer in a stream of compressed data */ off_t position; /* current offset into file from start of file */ int bytes_read; /* number of bytes of compressed data read from file (per read) */ int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */ int eof; /* nonzero if eof reached */ off_t file_size; /* length of file, so we don't search past it for blocks */ } bz_info_t; /* values passed as the 'end' argument of bit_mask() */ #define MASKLEFT 0 #define MASKRIGHT 1 /* this output buffer is used to collect decompressed output. this is not a circular buffer; when it is full the user is responsible for emptying it completely or partially and moving to the beginning any unused bytes. 
*/ typedef struct { unsigned char *buffer; /* output storage, allocated by the caller */ unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */ unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */ int bytes_avail; /* number of bytes available for reading */ unsigned char *end; /* points to byte after end of buffer */ } buf_info_t; /* used for each iteration of narrowing down the location in a bzip2-compressed file of a desired pageid, by finding the first compressed block after a guessed position and checking the first pageid (if any) contained in it. */ typedef struct { off_t left_end; /* left end of interval to search (bytes from start of file) */ off_t right_end; /* right end of interval to search */ int value_wanted; /* pageid desired */ int last_value; /* pageid we found in last iteration */ off_t last_position; /* position in file for last iteration */ } iter_info_t; int bit_mask(int numbits, int end); void shift_bytes_left(unsigned char *buffer, int buflen, int numbits); void shift_bytes_right(unsigned char *buffer, int buflen, int numbits); unsigned char ** init_marker(); int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted); int check_buffer_for_bz2_block_marker(bz_info_t *bfile); /* values passed as the 'direction' argument of the functions declared below */ #define FORWARD 1 #define BACKWARD 2 int find_next_bz2_block_marker(int fin, bz_info_t *bfile, int direction); int init_decompress(bz_info_t *bfile); int decompress_header(int fin, bz_info_t *bfile); int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile); int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret); buf_info_t *init_buffer(int size); void free_buffer(buf_info_t *b); int buffer_is_empty(buf_info_t *b); int buffer_is_full(buf_info_t *b); off_t get_file_size(int fin); int init_bz2_file(bz_info_t *bfile, int fin, int direction); int get_and_decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size, 
int direction); int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile, int direction); void dump_buf_info(buf_info_t *b); int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes); unsigned char ** init_footer(); int read_footer(unsigned char *buffer, int fin); int check_file_for_footer(int fin, bz_info_t *bfile); void clear_buffer(unsigned char *buf, int length); -off_t find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, off_t position, int direction); +off_t find_first_bz2_block_from_offset(bz_info_t *bfile, int fin, off_t position, + int direction, off_t filesize, int do_seek); #endif diff --git a/xmldumps-backup/mwbzutils/showcrcs.c b/xmldumps-backup/mwbzutils/showcrcs.c new file mode 100644 index 0000000..c51fe53 --- /dev/null +++ b/xmldumps-backup/mwbzutils/showcrcs.c @@ -0,0 +1,261 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mwbzutils.h" + +/* stolen from lbzip2: folds block crc c into the cumulative crc cc as (cc << 1) xor (cc >> 31) xor c xor -1; main() masks the result to 32 bits after each use */ +#define combine_crc(cc,c) (((cc) << 1) ^ ((cc) >> 31) ^ (c) ^ -1) + +void usage(char *message) { /* print message (if non-NULL) followed by the usage text to stderr, then exit(-1); does not return */ + char * help = +"Usage: showcrcs --filename file\n" +" [--verbose] [--help] [--version]\n\n" +"Show the offsets of all bz2 blocks in file, in order, along with their crcs.\n" +"Blocks are detected by checking for start of block markers and doing partial\n" +"decompression to be sure that the marker is not just part of some compressed\n" +"data.\n\n" +"Options:\n\n" +" -f, --filename name of file to search\n" +" -v, --verbose Show processing messages\n" +" -h, --help Show this help message\n" +" -V, --version Display the version of this program and exit\n\n" +"Report bugs in showcrcs to .\n\n" +"See also dumpbz2filefromoffset(1), dumplastbz2block(1), findpageidinbz2xml(1),\n" + "recompressxml(1), writeuptopageid(1)\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} + +void show_version(char 
*version_string) { /* print the program name, version_string and the copyright notice to stderr, then exit(-1); does not return */ + char * copyright = +"Copyright (C) 2019 Ariel T. Glenn. All rights reserved.\n\n" +"This program is free software: you can redistribute it and/or modify it\n" +"under the terms of the GNU General Public License as published by the\n" +"Free Software Foundation, either version 2 of the License, or (at your\n" +"option) any later version.\n\n" +"This program is distributed in the hope that it will be useful, but\n" +"WITHOUT ANY WARRANTY; without even the implied warranty of \n" +"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" +"Public License for more details.\n\n" +"You should have received a copy of the GNU General Public License along\n" +"with this program. If not, see \n\n" + "Written by Ariel T. Glenn.\n"; + fprintf(stderr,"showcrcs %s\n", version_string); + fprintf(stderr,"%s",copyright); + exit(-1); +} + +/* + fill in the bz_info_t fields used when scanning a file for blocks: + input buffer size, block marker table, read/written byte counts, + eof flag, file size (from get_file_size) and header_read flag; + also increments bfile->initialized, so the caller must have set + that field beforehand (main() sets it to 0) + returns: nothing + */ +void init_bz2_info(bz_info_t *bfile, int fin) { + bfile->bufin_size = BUFINSIZE; + bfile->marker = init_marker(); + bfile->bytes_read = 0; + bfile->bytes_written = 0; + bfile->eof = 0; + bfile->file_size = get_file_size(fin); + bfile->header_read = 0; + + bfile->initialized++; +} + +void show_crc(unsigned char *otherbuffer, int fin, off_t block_start, int bits_shifted, uint64_t *block_crc, int verbose) { /* read the crc bytes that follow the block marker at block_start, undo the bit shift, print the result as CRC:0x... on stdout and store it in *block_crc; otherbuffer is not referenced in the body; exits on seek or read failure */ + uint64_t crc = (uint64_t)0; + unsigned char buffer[5]; + off_t seekres; + int res = 0; + + /* block marker is 6 bytes long, if it's bit-shifted then some bits of the crc + will be in the 6th byte, otherwise only (byte-aligned) in the 7th*/ + if (bits_shifted) + seekres = lseek(fin, block_start + (off_t)6, SEEK_SET); + else + seekres = lseek(fin, block_start + (off_t)7, SEEK_SET); + if (seekres == (off_t)-1) { + fprintf(stderr,"lseek of file failed\n"); + exit(1); + } + /* we need the next 4 bytes for the crc, 5 if we 
have bit-shifting so just get 5 */ + res = read(fin, buffer, 5); + if (res == -1) { + fprintf(stderr,"read of file failed\n"); + exit(-1); + } /* NOTE(review): only res == -1 is checked; after a short read (res < 5) the tail of buffer is unset but still used below */ + if (verbose) + fprintf(stdout, "buffer: %02x %02x %02x %02x %02x\n", buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]); + crc += (uint64_t) buffer[0] & bit_mask(8 - bits_shifted, MASKRIGHT); + crc = crc << 8; + if (verbose > 1) + fprintf(stdout, "crc with buffer[0] and shifted: 0x%lx\n", crc); /* NOTE(review): crc is a uint64_t printed with %lx here and in the prints below; PRIx64 would match the type on every platform */ + crc += (uint64_t) buffer[1]; + crc = crc << 8; + if (verbose > 1) + fprintf(stdout, "crc with buffer[1] and shifted: 0x%lx\n", crc); + crc += (uint64_t) buffer[2]; + crc = crc << 8; + if (verbose > 1) + fprintf(stdout, "crc with buffer[2] and shifted: 0x%lx\n", crc); + crc += (uint64_t) buffer[3]; + if (bits_shifted) { + if (verbose) + fprintf(stdout, "block crc bits shifted by %d\n", bits_shifted); + crc = crc << bits_shifted; + if (verbose > 1) + fprintf(stdout, "crc with buffer[3] and shifted: 0x%lx\n", crc); + crc += (uint64_t) (buffer[4] & bit_mask(bits_shifted, MASKLEFT)) >> (8 - bits_shifted); + } + crc &= 0xffffffff; + fprintf(stdout, "CRC:0x%08lx\n", crc); + *block_crc = crc; +} + +/* + starting the search at the given offset, find the next bz2 block + and display its offset and crc on stdout + returns: + the value returned by find_first_bz2_block_from_offset for the + block found (its crc is stored in *block_crc) + 0 if no block marker was found + exits the program if the search fails with an error + */ +off_t do_next_block(bz_info_t *bfile, int fin, off_t offset, uint64_t *block_crc, off_t filesize, int verbose) { + offset = find_first_bz2_block_from_offset(bfile, fin, offset, FORWARD, filesize, 0); + if (!offset) { + return(0); + } + else if (offset > (off_t)0) { + fprintf(stdout, "offset:%"PRId64" ", offset); + show_crc(bfile->block_info, fin, bfile->block_start, bfile->bits_shifted, block_crc, verbose); + return(offset); + } + else { + fprintf(stderr,"Failed to find the next block marker due to some error\n"); + exit(-1); + } +} + +void show_stream_crc(bz_info_t *bfile, int fin, int verbose) { + /* + find the stream crc from the bzip2 footer at the + end of the file and display it + */ + int bits_shifted = 0; + uint64_t stream_crc = 
(uint64_t)0; + unsigned char buffer[12]; + int ind = 0; + + bfile->footer = init_footer(); + bits_shifted = check_file_for_footer(fin, bfile); + if (bits_shifted == -1) { + fprintf(stderr, "failed to find bz2 footer\n"); + exit(1); + } + read_footer(buffer, fin); /* NOTE(review): the int return value of read_footer is not checked */ + if (verbose) + fprintf(stdout, "buffer: %02x %02x %02x %02x %02x\n", buffer[6], buffer[7], buffer[8], buffer[9], buffer[10]); + + if (bits_shifted) + ind = 6; + else + ind = 7; /* first crc byte in the footer buffer: index 6 when bit-shifted, index 7 when byte-aligned */ + stream_crc += (uint64_t) buffer[ind++] & bit_mask(8 - bits_shifted, MASKRIGHT); + stream_crc = stream_crc << 8; + stream_crc += (uint64_t) buffer[ind++]; + stream_crc = stream_crc << 8; + stream_crc += (uint64_t) buffer[ind++]; + stream_crc = stream_crc << 8; + stream_crc += (uint64_t) buffer[ind++]; + if (bits_shifted) { + if (verbose) + fprintf(stdout, "stream_crc bits shifted by %d\n", bits_shifted); + stream_crc = stream_crc << bits_shifted; + stream_crc += (uint64_t) (buffer[ind++] & bit_mask(bits_shifted, MASKLEFT)) >> (8 - bits_shifted); + } + stream_crc &= 0xffffffff; + fprintf(stdout, "extracted_stream_CRC:0x%lx\n", stream_crc); +} + +int main(int argc, char **argv) { /* parse the options, then walk the bz2 blocks of the file in order, printing each block's offset and crc, the stream crc computed from the block crcs, and the stream crc read from the footer */ + int fin; + char *filename = NULL; + int verbose = 0; + int optindex=0; + int optc; + bz_info_t bfile; + off_t offset = (off_t)0; + off_t filesize = (off_t)0; + uint64_t block_crc = 0u; + uint64_t computed_cumul_crc = 0u; + + struct option optvalues[] = { + {"filename", 1, 0, 'f'}, + {"help", 0, 0, 'h'}, + {"verbose", 0, 0, 'v'}, + {"version", 0, 0, 'V'}, + {NULL, 0, NULL, 0} + }; + + while (1) { + optc=getopt_long_only(argc,argv,"f:hvV", optvalues, &optindex); + if (optc=='f') { + filename=optarg; + } + else if (optc=='h') + usage(NULL); + else if (optc=='v') + verbose++; + else if (optc=='V') + show_version(VERSION); + else if (optc==-1) break; + else usage("Unknown option or other error\n"); + } + + if (! 
filename) { + usage(NULL); + } + + fin = open (filename, O_RDONLY); + if (fin < 0) { + fprintf(stderr,"Failed to open file %s for read\n", filename); + exit(1); + } + + bfile.initialized = 0; + bfile.marker = NULL; + + init_bz2_info(&bfile, fin); + filesize = get_file_size(fin); + + while (1) { + offset = do_next_block(&bfile, fin, offset, &block_crc, filesize, verbose); + if (!offset) /* no further block marker was found */ + break; + offset += (off_t)1; /* resume the next search one byte past the offset just reported */ + if (verbose) { + fprintf(stdout, "1's complement of block crc: 0x%lx\n", block_crc ^ 0xffffffff); + fprintf(stderr, "current cumul crc: 0x%lx, ", computed_cumul_crc); + } + computed_cumul_crc = combine_crc(computed_cumul_crc, (block_crc ^ 0xffffffff)); + computed_cumul_crc &= 0xffffffff; + if (verbose) + fprintf(stderr, " NEW cumul crc: 0x%lx\n", computed_cumul_crc); + } + computed_cumul_crc &= 0xffffffff; + fprintf(stdout, "computed_stream_CRC:0x%lx\n", computed_cumul_crc); + show_stream_crc(&bfile, fin, verbose); + exit(0); +}