• Extract TIFF Image from PDF Stream

    From bellamkondaprakash@gmail.com@21:1/5 to sergey.a...@gmail.com on Thu Jun 7 03:17:23 2018
    On Friday, January 1, 2016 at 4:16:38 PM UTC+5:30, sergey.a...@gmail.com wrote:
    Python implementation:

    import PyPDF2
    import struct

    """
    Links:
    PDF format: http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
    CCITT Group 4: https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-T.6-198811-I!!PDF-E&type=items
    Extract images from pdf: http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
    Extract images coded with CCITTFaxDecode in .net: http://stackoverflow.com/questions/2641770/extracting-image-from-pdf-with-ccittfaxdecode-filter
    TIFF format and tags: http://www.awaresystems.be/imaging/tiff/faq.html
    """


    def tiff_header_for_CCITT(width, height, img_size, CCITT_group=4, photometric=0):
        """Build a minimal little-endian TIFF header for one CCITT-compressed strip.

        The returned bytes are meant to be written immediately before the raw
        CCITT stream: the StripOffsets entry points at the first byte after
        the header.

        Args:
            width: Image width in pixels (ImageWidth, tag 256).
            height: Image height in pixels (ImageLength, tag 257, and
                RowsPerStrip, tag 278 -- the whole image is a single strip).
            img_size: Length in bytes of the compressed data that will follow
                the header (StripByteCounts, tag 279).
            CCITT_group: TIFF Compression value; 3 = CCITT Group 3,
                4 = CCITT Group 4.
            photometric: TIFF PhotometricInterpretation value; 0 = WhiteIsZero
                (the previous hard-coded behaviour), 1 = BlackIsZero.
                NOTE(review): reportedly 1 is needed for PDFs whose
                /DecodeParms set /BlackIs1 to true -- verify on real files.

        Returns:
            bytes: The 110-byte TIFF header plus single IFD.

        Raises:
            struct.error: If a value is negative or does not fit its field.
        """
        # Layout: 8-byte file header, 2-byte entry count, eight 12-byte IFD
        # entries, then the offset of the next IFD.  TIFF fields are unsigned,
        # and the next-IFD offset is a 4-byte LONG; packing it as a 2-byte
        # value would let readers interpret the first two bytes of image data
        # as part of that offset.
        tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
        header_size = struct.calcsize(tiff_header_struct)
        return struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            # SHORT values below are stored left-justified in the 4-byte value
            # slot, which for little-endian equals packing them as a LONG.
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, length
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, CCITT_group,  # Compression, SHORT, 1, 3/4 = CCITT Group 3/4
            262, 3, 1, photometric,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            273, 4, 1, header_size,  # StripOffsets, LONG, 1, length of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
            279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
            0,  # Offset of next IFD: 0 = this is the last IFD
        )

    # Extract every CCITT-fax-encoded image XObject from the PDF and write it
    # to "<image name>.tiff" by prepending a TIFF header to the raw stream.
    pdf_filename = 'scan.pdf'
    # The context manager closes the PDF even if extraction fails part-way.
    with open(pdf_filename, 'rb') as pdf_file:
        cond_scan_reader = PyPDF2.PdfFileReader(pdf_file)
        for i in range(cond_scan_reader.getNumPages()):
            page = cond_scan_reader.getPage(i)
            # Pages without resources or without XObjects carry no images.
            if '/Resources' not in page:
                continue
            resources = page['/Resources']
            if '/XObject' not in resources:
                continue
            xObject = resources['/XObject'].getObject()
            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue

                # /Filter may be a single name or an array of names.  Only a
                # lone CCITTFaxDecode filter leaves the raw stream as pure
                # CCITT data that can be wrapped in a TIFF container.
                filters = image['/Filter'] if '/Filter' in image else []
                if not isinstance(filters, list):
                    filters = [filters]
                if list(filters) != ['/CCITTFaxDecode']:
                    continue

                # /DecodeParms mirrors /Filter: a dictionary, or an array with
                # one dictionary per filter.
                parms = image['/DecodeParms'] if '/DecodeParms' in image else {}
                if isinstance(parms, list):
                    parms = parms[0].getObject() if parms else {}
                if not isinstance(parms, dict):
                    parms = {}

                # The CCITTFaxDecode filter decodes image data that has been
                # encoded using either Group 3 or Group 4 CCITT facsimile (fax)
                # encoding (monochrome, 1 bit per pixel).
                #
                # K < 0 --- Pure two-dimensional encoding (Group 4)
                # K = 0 --- Pure one-dimensional encoding (Group 3, 1-D)
                # K > 0 --- Mixed one- and two-dimensional encoding (Group 3, 2-D)
                #
                # A missing /K is treated as 0, the PDF default.
                k = int(parms['/K']) if '/K' in parms else 0
                CCITT_group = 4 if k < 0 else 3

                width = int(image['/Width'])
                height = int(image['/Height'])
                data = image._data  # sorry, getData() does not work for CCITTFaxDecode
                img_size = len(data)
                tiff_header = tiff_header_for_CCITT(width, height, img_size, CCITT_group)
                # NOTE(review): images on different pages that share a name
                # (e.g. every page using /Im0) overwrite each other here.
                img_name = obj[1:] + '.tiff'
                with open(img_name, 'wb') as img_file:
                    img_file.write(tiff_header + data)
                # To load the image in memory instead of writing a file:
                #   import io
                #   from PIL import Image
                #   im = Image.open(io.BytesIO(tiff_header + data))

    This is not working for me. I don't understand how other people are getting the image data out of the PDF -- could you let me know? My output is:
    '/Contents': {'/Length': '33'},
    '/CropBox': ['0', '0', '614.4', '792'],
    '/MediaBox': ['0', '0', '614.4', '792'],
    '/Parent': {'/Count': '2',
    '/Kids': [{...},
    {'/Contents': {'/Length': '33'},
    '/CropBox': ['0', '0', '614.4', '822'],
    '/MediaBox': ['0', '0', '614.4', '822'],
    '/Parent': {...},
    '/Resources': {'/ProcSet': ['/PDF', '/Text', '/ImageC'],
    '/XObject': {'/Im1': {'/BitsPerComponent': '1',
    '/ColorSpace': '/DeviceGray',
    '/DecodeParms': [{'/BlackIs1': 'false',
    '/Columns': '2560',
    '/K': '-1',
    '/Rows': '3425'}],
    '/Filter': ['/CCITTFaxDecode'],
    '/Height': '3425',
    '/Length': '30572',
    '/Name': '/Im1',
    '/Subtype': '/Image',
    '/Type': '/XObject',
    '/Width': '2560'}}},
    '/Thumb': {'/BitsPerComponent': '1',
    '/ColorSpace': '/DeviceGray',
    '/DecodeParms': [{'/BlackIs1': 'false',
    '/Columns': '79',
    '/K': '-1',
    '/Rows': '106'}],
    '/Filter': ['/CCITTFaxDecode'],
    '/Height': '106',
    '/Length': '463',
    '/Width': '79'},
    '/Type': '/Page'}],
    '/Type': '/Pages'},
    '/Resources': {'/ProcSet': ['/PDF', '/Text', '/ImageC'],
    '/XObject': {'/Im0': {'/BitsPerComponent': '1',
    '/ColorSpace': '/DeviceGray',
    '/DecodeParms': [{'/BlackIs1': 'false',
    '/Columns': '2560',
    '/K': '-1',
    '/Rows': '3300'}],
    '/Filter': ['/CCITTFaxDecode'],
    '/Height': '3300',
    '/Length': '45897',
    '/Name': '/Im0',
    '/Subtype': '/Image',
    '/Type': '/XObject',
    '/Width': '2560'}}},
    '/Thumb': {'/BitsPerComponent': '1',
    '/ColorSpace': '/DeviceGray',
    '/DecodeParms': [{'/BlackIs1': 'false',
    '/Columns': '82',
    '/K': '-1',
    '/Rows': '106'}],
    '/Filter': ['/CCITTFaxDecode'],
    '/Height': '106',
    '/Length': '726',
    '/Width': '82'},
    '/Type': '/Page'}
    How can I get the image data from this?

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From darbydaystaffing2125@gmail.com@21:1/5 to All on Fri Dec 6 06:48:47 2019
    Thank you so much! Works well. For PDFs that have /BlackIs1 set, just change the value packed for tag 262 in the struct from 0 to 1.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)