"""Combine multi-part PDF files.

Walks the current working directory tree looking for PDF files named
``X.pdf`` that are accompanied by numbered parts ``X_0001.pdf``,
``X_0002.pdf``, ...  After confirmation, each group is merged into
``X.pdf`` (numbered parts first, then the original ``X.pdf``) and the
numbered parts are deleted.
"""
import os
import re
import tempfile

from pyPdf import PdfFileWriter, PdfFileReader

# Matches "<root>_NNNN.pdf" (case-insensitive); group(1) is <root>.
_PART_RE = re.compile(r'^(.+)_\d\d\d\d\.pdf$', re.I)


def Combine(inputs, output):
    """Combine PDF files.

    Inputs: a list of PDF input file names.
    Output: an output stream (opened for binary writing).

    Reads the input files, combines them in order and writes the result to
    the output stream.  Every input file that was opened is closed again,
    even if reading or writing fails.
    """
    outpdf = PdfFileWriter()
    # The input files must stay open until the writing has finished, so they
    # are collected here and closed together in the finally clause.
    infiles = []
    try:
        for inname in inputs:
            infile = open(inname, "rb")
            infiles.append(infile)
            inpdf = PdfFileReader(infile)
            print('From %s, %s pages' % (infile.name, inpdf.getNumPages()))
            for page in inpdf.pages:
                outpdf.addPage(page)
        outpdf.write(output)
    finally:
        for infile in infiles:
            infile.close()


def CombineSafely(inputs, output):
    """Combine PDF files safely.

    Inputs: a list of PDF input file names.
    Output: output file name.

    Reads the input files, combines them in order, writes the result to a
    temporary file, and then renames the temporary file to the output name.
    This allows the output to be the same as one of the inputs.

    The temporary file is created in the output's directory so that the
    final rename never has to cross a filesystem or drive boundary.  If
    combining fails, the temporary file is removed and the existing output
    is left untouched.
    """
    outdir = os.path.dirname(os.path.abspath(output))
    tempout = tempfile.NamedTemporaryFile(delete=False, dir=outdir)
    tempname = tempout.name
    combined = False
    try:
        Combine(inputs, tempout)
        combined = True
    finally:
        tempout.close()
        # Don't leave a partial temporary file behind on failure.
        if not combined and os.path.exists(tempname):
            os.remove(tempname)
    # Windows version of os.rename can't replace an existing file.
    if os.path.exists(output):
        try:
            os.remove(output)
        except OSError:
            print('Failed to delete %s' % output)
    os.rename(tempname, output)


def FindCandidates(filelist):
    """Search file list for combining candidates.

    Filelist: list of file names (not modified).

    Looks for PDF files called X.pdf such that there are also files called
    X_0001.pdf etc.  The result does not depend on the order of filelist.

    Returns a dict X.pdf -> [X_0001.pdf, ...] with the parts in sorted
    order.  Files without numbered parts do not appear in the result.
    """
    names = sorted(filelist)

    # Map root -> base file name for every PDF that is not itself a
    # numbered part.
    bases = {}
    for name in names:
        root, ext = os.path.splitext(name)
        if ext.lower() == '.pdf' and not _PART_RE.match(name):
            bases[root] = name

    result = {}
    for name in names:
        match = _PART_RE.match(name)
        if match and match.group(1) in bases:
            result.setdefault(bases[match.group(1)], []).append(name)
    return result


def FindAllCandidates(dirname):
    """Search directory tree for combining candidates.

    Dirname: directory name.

    Returns a list of (dirname, list of files to combine) tuples.  In each
    file list the numbered parts come first and the base file, which is
    also the combine target, comes last.
    """
    result = []
    for (root, dirs, files) in os.walk(dirname):
        candidates = FindCandidates(files)
        for (base, parts) in sorted(candidates.items()):
            result.append((root, parts + [base]))
    return result


def main():
    """Find combining candidates below the current directory and merge them.

    Lists every group found, asks for confirmation, then combines each
    group into its last file and deletes the other files of the group.
    """
    start = os.getcwd()
    print('Starting in %s' % start)
    candidates = FindAllCandidates(start)
    if not candidates:
        print('Nothing to do')
        return
    for (dirpath, files) in candidates:
        print('In %s:\n %s\n' % (dirpath, '\n '.join(files)))
    ok = raw_input('Continue (y/n)? ')
    if ok in ('y', 'Y'):
        for (dirpath, files) in candidates:
            CombineSafely([os.path.join(dirpath, f) for f in files],
                          os.path.join(dirpath, files[-1]))
            # Delete all of the original files except the output file
            for f in files[:-1]:
                os.remove(os.path.join(dirpath, f))


if __name__ == "__main__":
    main()