diff options
Diffstat (limited to 'extract/Makefile')
-rw-r--r-- | extract/Makefile | 482 |
1 files changed, 482 insertions, 0 deletions
# Provenance: extract/Makefile, added as a new file (mode 100644, blob a39d413a).
#
# Example commands:
#
#   make
#   make test
#       Runs all tests.
#
#   make test-exe
#       Runs exe regression tests. These use $(gs) and $(mutool) to generate
#       intermediate data about pdf content, then uses $(exe) to convert to
#       docx.
#
#   make test-mutool
#       Runs mutool regression tests. This uses $(mutool) to convert directly
#       from pdf to docx. We require that $(mutool) was built with extract=yes.
#
#   make test-gs
#       Runs gs regression tests. This uses $(gs) to convert directly from pdf
#       to docx. We require that $(gs) was built with --with-extract-dir=... We
#       also do a simple test of output-file-per-page.
#
#   make test-buffer test-misc test-src
#       Runs unit tests etc.
#
#   make build=debug-opt ...
#       Set build flags.
#
#   make build=memento msqueeze
#       Run memento squeeze test.


# Build flags.
#
# Note that OpenBSD's clang-8 appears to ignore -Wdeclaration-after-statement.
#
# $(build) selects one of: debug (default), opt, debug-opt, memento. It is
# also embedded in the names of objects and executables so that different
# build flavours do not overwrite each other.
#
build = debug

# These two are deliberately recursively-expanded: they are extended with +=
# both below and later in this file.
flags_link      = -W -Wall -lm
flags_compile   = -W -Wall -Wextra -Wpointer-sign -Wmissing-declarations -Wmissing-prototypes -Wdeclaration-after-statement -Wpointer-arith -Wconversion -Wno-sign-conversion -Werror -MMD -MP

# Simply-expanded so that `uname` is run once at parse time rather than each
# time $(uname) is referenced.
uname := $(shell uname)

ifeq ($(build),)
    $(error Need to specify build=debug|opt|debug-opt|memento)
else ifeq ($(build),debug)
    flags_link      += -g
    flags_compile   += -g
else ifeq ($(build),opt)
    flags_link      += -O2
    flags_compile   += -O2
else ifeq ($(build),debug-opt)
    flags_link      += -g -O2
    flags_compile   += -g -O2
else ifeq ($(build),memento)
    # NOTE(review): '-dl' looks like it may have been intended as '-ldl';
    # left unchanged here - confirm before altering.
    flags_link      += -g -dl
    ifeq ($(uname),OpenBSD)
        flags_link += -L /usr/local/lib -l execinfo
    endif
    flags_compile   += -g -D MEMENTO
else
    $(error unrecognised $$(build)=$(build))
endif


# Locations of mutool and gs. By default we assume these are not available.
#
# If this extract checkout is within a mupdf tree (typically as a git
# submodule) we assume ghostpdl is checked out nearby and both mutool and gs
# binaries are available and built with extract enabled.
#
# Disable this by running: make we_are_mupdf_thirdparty= ...
#
we_are_mupdf_thirdparty = $(findstring /mupdf/thirdparty/extract, $(abspath .))
ifneq ($(we_are_mupdf_thirdparty),)
    $(warning we are mupdf thirdparty)
    mutool := ../../build/debug-extract/mutool
    gs := ../../../ghostpdl/debug-extract-bin/gs
    libbacktrace = ../../../libbacktrace/.libs
endif

# If mutool/gs are specified, they must exist.
#
ifneq ($(mutool),)
ifeq ($(wildcard $(mutool)),)
    $(error mutool does not exist: $(mutool))
endif
$(warning mutool=$(mutool))
endif

ifneq ($(gs),)
ifeq ($(wildcard $(gs)),)
    $(error gs does not exist: $(gs))
endif
$(warning gs=$(gs))
endif


# Default target - run all tests.
#
# Declared phony because 'test' is also the name of the directory that holds
# the test inputs, so make must never treat it as an up-to-date file.
#
.PHONY: test
test: test-buffer test-misc test-src test-exe test-mutool test-gs
	@echo $@: passed

# Define the main test targets.
#
# test/Python2clipped.pdf is same as test/Python2.pdf except it has a modified
# MediaBox that excludes some glyphs.
#
pdfs = test/Python2.pdf test/Python2clipped.pdf test/zlib.3.pdf test/text_graphic_image.pdf
pdfs_generated = $(patsubst test/%, test/generated/%, $(pdfs))

# Generate targets that check all combinations of mu/gs and the various
# rotate/autosplit options of extract-exe.
#
# Start from the intermediate .xml files that each available tool can
# generate, then fan out to one .docx per extract-exe option set, and finally
# to one .diff target per .docx.
#
tests_exe :=
ifneq ($(mutool),)
    tests_exe := $(tests_exe) $(patsubst %, %.intermediate-mu.xml, $(pdfs_generated))
endif
ifneq ($(gs),)
    tests_exe := $(tests_exe) $(patsubst %, %.intermediate-gs.xml, $(pdfs_generated))
endif

tests_exe := \
        $(patsubst %, %.extract.docx, $(tests_exe)) \
        $(patsubst %, %.extract-rotate.docx, $(tests_exe)) \
        $(patsubst %, %.extract-rotate-spacing.docx, $(tests_exe)) \
        $(patsubst %, %.extract-autosplit.docx, $(tests_exe)) \
        $(patsubst %, %.extract-template.docx, $(tests_exe))

tests_exe := $(patsubst %, %.diff, $(tests_exe))

# Default to empty so that these are always defined, even when the
# corresponding tool is not available.
tests_mutool :=
tests_gs :=

ifneq ($(mutool),)
# Targets that test direct conversion with mutool.
#
    tests_mutool := \
            $(patsubst %, %.mutool.docx.diff, $(pdfs_generated)) \
            $(patsubst %, %.mutool-norotate.docx.diff, $(pdfs_generated))

endif
ifneq ($(gs),)
# Targets that test direct conversion with gs.
#
    tests_gs := \
            $(patsubst %, %.gs.docx.diff, $(pdfs_generated)) \
            test_gs_fpp

    # We don't yet do clipping with gs so exclude Python2clipped.pdf.*:
    tests_gs := $(filter-out test/generated/Python2clipped.pdf.%, $(tests_gs))

    #$(warning tests_gs: $(tests_gs))
endif
#$(warning $(pdfs_generated_intermediate_docx_diffs))
#$(warning $(tests))

# None of these three targets creates a file of the same name.
.PHONY: test-exe test-mutool test-gs

test-exe: $(tests_exe)
	@echo $@: passed

# Checks output of mutool conversion from .pdf to .docx. Requires that mutool
# was built with extract as a third-party library.
#
test-mutool: $(tests_mutool)
	@echo $@: passed

# Checks output of gs conversion from .pdf to .docx. Requires that gs was
# built with extract. As of 2021-02-10 this requires, for example
# ghostpdl/extract being a link to an extract checkout and configuring with
# --with-extract-dir=extract.
#
test-gs: $(tests_gs)
	@echo $@: passed

# Check behaviour of gs when writing file-per-page.
#
# For each input we first remove any per-page output left by a previous run
# of that same input, so that the file counts checked at the end reflect this
# run only.
#
.PHONY: test_gs_fpp
test_gs_fpp: $(gs)
	@echo
	@echo == Testing gs file-per-page
	@mkdir -p test/generated
	rm test/generated/Python2.pdf.gs.*.docx || true
	$(gs) -sDEVICE=docxwrite -o test/generated/Python2.pdf.gs.%i.docx test/Python2.pdf
	rm test/generated/zlib.3.pdf.gs.*.docx || true
	$(gs) -sDEVICE=docxwrite -o test/generated/zlib.3.pdf.gs.%i.docx test/zlib.3.pdf
	rm test/generated/text_graphic_image.pdf.gs.*.docx || true
	$(gs) -sDEVICE=docxwrite -o test/generated/text_graphic_image.pdf.gs.%i.docx test/text_graphic_image.pdf
	@echo Checking for correct number of generated files.
	ls -l test/generated/*.pdf.gs.*.docx
	ls test/generated/text_graphic_image.pdf.gs.*.docx | wc -l | grep '^ *1$$'
	ls test/generated/Python2.pdf.gs.*.docx | wc -l | grep '^ *1$$'
	ls test/generated/zlib.3.pdf.gs.*.docx | wc -l | grep '^ *2$$'


# Main executable.
#
exe = src/build/extract-$(build).exe
exe_src = \
        src/alloc.c \
        src/astring.c \
        src/buffer.c \
        src/docx.c \
        src/docx_template.c \
        src/extract-exe.c \
        src/extract.c \
        src/join.c \
        src/mem.c \
        src/outf.c \
        src/xml.c \
        src/zip.c

ifeq ($(build),memento)
    exe_src += src/memento.c
    ifeq ($(uname),Linux)
        flags_compile += -D HAVE_LIBDL
        # NOTE(review): $(libbacktrace) is only assigned in the mupdf-thirdparty
        # block earlier in this file; otherwise it must be supplied by the
        # caller - confirm.
        flags_link += -L $(libbacktrace) -l backtrace -l dl
    endif
endif
exe_obj = $(patsubst src/%.c, src/build/%.c-$(build).o, $(exe_src))
exe_dep = $(exe_obj:.o=.d)

# Convenience alias for building the main executable.
.PHONY: exe
exe: $(exe)
$(exe): $(exe_obj)
	$(CC) $(flags_link) -o $@ $^ -lz -lm

# Command used to run $(exe) in tests; memento builds add environment
# settings in front of the executable.
run_exe = $(exe)
ifeq ($(build),memento)
    ifeq ($(uname),Linux)
        run_exe = LD_LIBRARY_PATH=$(libbacktrace) MEMENTO_ABORT_ON_LEAK=1 MEMENTO_HIDE_MULTIPLE_REALLOCS=1 $(exe)
        #run_exe = LD_LIBRARY_PATH=../libbacktrace/.libs $(exe)
    endif
    ifeq ($(uname),OpenBSD)
        run_exe = MEMENTO_ABORT_ON_LEAK=1 $(exe)
    endif
endif


# Rules that make the various intermediate targets required by $(tests).
#

test/generated/%.pdf.intermediate-mu.xml: test/%.pdf $(mutool)
	@echo
	@echo == Generating intermediate file for $< with mutool.
	@mkdir -p test/generated
	$(mutool) draw -F xmltext -o $@ $<

test/generated/%.pdf.intermediate-gs.xml: test/%.pdf $(gs)
	@echo
	@echo == Generating intermediate file for $< with gs.
	@mkdir -p test/generated
	$(gs) -sDEVICE=txtwrite -dTextFormat=4 -o $@ $<

%.extract.docx: % $(exe)
	@echo
	@echo == Generating docx with extract.exe
	$(run_exe) -r 0 -i $< -o $@

%.extract-rotate.docx: % $(exe) Makefile
	@echo
	@echo == Generating docx with rotation with extract.exe
	$(run_exe) -r 1 -s 0 -i $< -o $@

%.extract-rotate-spacing.docx: % $(exe) Makefile
	@echo
	@echo == Generating docx with rotation with extract.exe
	$(run_exe) -r 1 -s 1 -i $< -o $@

%.extract-autosplit.docx: % $(exe)
	@echo
	@echo == Generating docx with autosplit with extract.exe
	$(run_exe) -r 0 -i $< --autosplit 1 -o $@

%.extract-template.docx: % $(exe)
	@echo
	@echo == Generating docx using src/template.docx with extract.exe
	$(run_exe) -r 0 -i $< -t src/template.docx -o $@

# Compares an unzipped generated .docx against its reference directory. The
# .diff target itself is never created, so the comparison runs every time.
#
test/generated/%.docx.diff: test/generated/%.docx.dir/ test/%.docx.dir.ref/
	@echo
	@echo == Checking $<
	diff -ru $^

# This checks that -t src/template.docx gives identical results.
#
test/generated/%.extract-template.docx.diff: test/generated/%.extract-template.docx.dir/ test/%.extract.docx.dir.ref/
	@echo
	@echo == Checking $<
	diff -ru $^

# Unzips .docx into .docx.dir/ directory. Note that we require a trailing '/'
# in target.
#
%.docx.dir/: %.docx
	@echo
	@echo == Extracting .docx into directory.
	@rm -r $@ 2>/dev/null || true
	unzip -q -d $@ $<

# Uses zip to create .docx file by zipping up a directory. Useful to recreate
# .docx from reference directory test/*.docx.dir.ref.
%.docx: %
	@echo
	@echo == Zipping directory into .docx file.
	@rm -r $@ 2>/dev/null || true
	cd $< && zip -r ../$(notdir $@) .

# Prettifies each .xml file within .docx.dir/ directory.
# Works on a scratch copy ($@-) and renames it into place at the end, so the
# target directory only appears once the prettifier has succeeded.
%.docx.dir.pretty: %.docx.dir/
	@rm -r $@ $@- 2>/dev/null || true
	cp -pr $< $@-
	./src/docx_template_build.py --docx-pretty $@-
	mv $@- $@

# Converts .pdf directly to .docx using mutool.
test/generated/%.pdf.mutool.docx: test/%.pdf $(mutool)
	@echo
	@echo == Converting .pdf directly to .docx using mutool.
	@mkdir -p test/generated
	$(mutool) convert -O mediabox-clip=yes -o $@ $<

test/generated/%.pdf.mutool-norotate.docx: test/%.pdf $(mutool)
	@echo
	@echo == Converting .pdf directly to .docx using mutool.
	@mkdir -p test/generated
	$(mutool) convert -O mediabox-clip=yes,rotation=no -o $@ $<

test/generated/%.pdf.mutool-spacing.docx: test/%.pdf $(mutool)
	@echo
	@echo == Converting .pdf directly to .docx using mutool.
	@mkdir -p test/generated
	$(mutool) convert -O mediabox-clip=yes,spacing=yes -o $@ $<

# Converts .pdf directly to .docx using gs.
test/generated/%.pdf.gs.docx: test/%.pdf $(gs)
	@echo
	@echo == Converting .pdf directly to .docx using gs.
	@mkdir -p test/generated
	$(gs) -sDEVICE=docxwrite -o $@ $<


# Valgrind test
#
# --error-exitcode=1 makes valgrind exit non-zero when it reports errors, so
# that 'passed' is only printed for a clean run.
#
.PHONY: valgrind
valgrind: $(exe) test/generated/Python2.pdf.intermediate-mu.xml
	valgrind --leak-check=full --error-exitcode=1 $(exe) -h -r 1 -s 0 -i test/generated/Python2.pdf.intermediate-mu.xml -o test/generated/valgrind-out.docx
	@echo $@: passed

# Memento tests.
#
# These targets only exist when building with build=memento. None of them
# creates a file of the same name, so they are declared phony.
#
ifeq ($(build),memento)
.PHONY: msqueeze mfailat msqueeze-mutool msqueeze-mutool2
msqueeze: $(exe) test/generated/Python2.pdf.intermediate-mu.xml
	MEMENTO_SQUEEZEAT=1 $(run_exe) --alloc-exp-min 0 -r 1 -s 0 -i test/generated/Python2.pdf.intermediate-mu.xml -o test/generated/msqueeze-out.docx 2>&1 | src/memento.py -q 1 -o msqueeze-raw
	@echo $@: passed
mfailat: $(exe) test/generated/Python2.pdf.intermediate-mu.xml
	MEMENTO_FAILAT=61463 $(run_exe) --alloc-exp-min 0 -r 1 -s 0 -i test/generated/Python2.pdf.intermediate-mu.xml -o test/generated/msqueeze-out.docx
	@echo $@: passed
# Location of a memento build of mutool, relative to this directory.
mutool_memento_extract = ../../build/memento-extract/mutool
msqueeze-mutool:
	MEMENTO_SQUEEZEAT=1 $(mutool_memento_extract) convert -o test/generated/text_graphic_image.pdf.mutool.docx test/text_graphic_image.pdf 2>&1 | src/memento.py -q 1 -o msqueeze-raw
msqueeze-mutool2:
	MEMENTO_SQUEEZEAT=1 $(mutool_memento_extract) convert -o test/generated/Python2.pdf.mutool.docx test/Python2.pdf 2>&1 | src/memento.py -q 1 -o msqueeze-raw
endif


# Temporary rules for generating reference files.
#
#temp_ersdr = \
#        $(patsubst %, %.intermediate-mu.xml.extract-rotate-spacing.docx.dir.ref, $(pdfs)) \
#        $(patsubst %, %.intermediate-gs.xml.extract-rotate-spacing.docx.dir.ref, $(pdfs)) \
#
#temp: $(temp_ersdr)
#test/%.xml.extract-rotate-spacing.docx.dir.ref: test/generated/%.xml.extract-rotate-spacing.docx.dir
#	@echo
#	@echo copying $< to %@
#	rsync -ai $</ $@/


# Buffer unit test.
#
exe_buffer_test = src/build/buffer-test-$(build).exe
exe_buffer_test_src = src/buffer.c src/buffer-test.c src/outf.c src/alloc.c src/mem.c
ifeq ($(build),memento)
    exe_buffer_test_src += src/memento.c
endif
exe_buffer_test_obj = $(patsubst src/%.c, src/build/%.c-$(build).o, $(exe_buffer_test_src))
exe_buffer_test_dep = $(exe_buffer_test_obj:.o=.d)
$(exe_buffer_test): $(exe_buffer_test_obj)
	$(CC) $(flags_link) -o $@ $^

# None of the test-* targets below creates a file of the same name.
.PHONY: test-buffer test-buffer-valgrind test-misc test-src

test-buffer: $(exe_buffer_test)
	@echo
	@echo == Running test-buffer
	mkdir -p test/generated
	./$<
	@echo $@: passed
test-buffer-valgrind: $(exe_buffer_test)
	@echo
	@echo == Running test-buffer with valgrind
	mkdir -p test/generated
	valgrind --leak-check=full ./$<
	@echo $@: passed


# Misc unit test.
#
exe_misc_test = src/build/misc-test-$(build).exe
exe_misc_test_src = \
        src/alloc.c \
        src/astring.c \
        src/buffer.c \
        src/mem.c \
        src/misc-test.c \
        src/outf.c \
        src/xml.c

ifeq ($(build),memento)
    exe_misc_test_src += src/memento.c
endif
exe_misc_test_obj = $(patsubst src/%.c, src/build/%.c-$(build).o, $(exe_misc_test_src))
# Derived from the misc-test objects (not the buffer-test ones) so that the
# .d files for every misc-test object are picked up by the final -include.
exe_misc_test_dep = $(exe_misc_test_obj:.o=.d)
$(exe_misc_test): $(exe_misc_test_obj)
	$(CC) $(flags_link) -o $@ $^
test-misc: $(exe_misc_test)
	@echo
	@echo == Running test-misc
	./$<
	@echo $@: passed

# Source code check.
#
# Each check fails the target if grep finds a match. The strdup check skips
# src/memento.* and, like the other checks, scans include/*.h.
#
test-src:
	@echo
	@echo == Checking for use of ssize_t in source.
	if grep -wn ssize_t src/*.c src/*.h include/*.h; then false; else true; fi
	@echo == Checking for use of strdup in source.
	if grep -wn strdup $(filter-out src/memento.%, $(wildcard src/*.c src/*.h)) include/*.h; then false; else true; fi
	@echo == Checking for use of bzero in source.
	if grep -wn bzero src/*.c src/*.h include/*.h; then false; else true; fi
	@echo Checking for variables defined inside for-loop '(...)'.
	if grep -Ewn 'for *[(] *[a-zA-Z0-9]+ [a-zA-Z0-9]' src/*.c src/*.h; then false; else true; fi
	@echo $@: passed

# Compile rule.
# We always include src/docx_template.c as a prerequisite in case
# code #includes docx_template.h.
#
# Object names embed $(build) so that different build flavours can coexist in
# src/build/.
#
src/build/%.c-$(build).o: src/%.c src/docx_template.c
	@mkdir -p src/build
	$(CC) -c $(flags_compile) -o $@ $<

# Rule for machine-generated source code, src/docx_template.c. Also generates
# src/docx_template.h.
#
# These files are also in git to allow builds if python is not available.
#
# NOTE(review): because of the .ALWAYS prerequisite this recipe runs on every
# build that needs src/docx_template.c, which appears to conflict with the
# statement above about building without python - confirm intent.
#
src/docx_template.c: src/docx_template_build.py .ALWAYS
	@echo
	@echo == Building $@
	./src/docx_template_build.py -i src/template.docx -o src/docx_template
.ALWAYS:
.PHONY: .ALWAYS

# Tell make to preserve all intermediate files.
#
.SECONDARY:


# Rule for tags.
#
tags: .ALWAYS
	ectags -R --extra=+fq --c-kinds=+px .


# Clean rule.
#
clean:
	rm -r src/build test/generated src/template.docx.dir 2>/dev/null || true

# Cleans test/generated except for intermediate files, which are slow to create
# (when using gs).
clean2:
	rm -r test/generated/*.pdf.intermediate-*.xml.* 2>/dev/null || true
	rm -r test/generated/*.pdf.mutool*.docx* 2>/dev/null || true
	rm -r src/build 2>/dev/null || true
.PHONY: clean clean2 tags


# Include dynamic dependencies.
#
# We use $(sort ...) to remove duplicates
#
dep = $(sort $(exe_dep) $(exe_buffer_test_dep) $(exe_misc_test_dep) $(exe_ziptest_dep))

-include $(dep)