Pdf2docx: 압축 오류 -2

에 만든 2020년 10월 20일 · 9코멘트 · 출처: dothinking/pdf2docx

compression error -2 오류가 발생했습니다. 누구든지 포인터를 제공 할 수 있다면 좋을 것입니다.

문제가 있는 PDF를 첨부했습니다.
5_KO.pdf

에러 메시지:

Processing Pages: 1/28...mupdf: compression error -2
Traceback (most recent call last):
  File "/Users/erikchan/Downloads/convert.py", line 10, in <module>
    parse(pdf_files[i], docx_files[i])
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/main.py", line 31, in parse
    cv.make_docx(indexes, multi_processing)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/converter.py", line 118, in make_docx
    self._make_docx(page_indexes)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/converter.py", line 192, in _make_docx
    self.initialize(page).parse().make_page(self.doc_docx)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/converter.py", line 172, in initialize
    images, paths = self._paths_extractor.extract_paths(page)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/shape/Path.py", line 61, in extract_paths
    image = largest.to_image(page) if largest.contains_curve else None
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/shape/Path.py", line 140, in to_image
    return ImagesExtractor.clip_page(page, bbox, zoom)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/image/Image.py", line 60, in clip_page
    return cls.to_raw_dict(image, bbox)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/image/Image.py", line 50, in to_raw_dict
    'image': image.getPNGData()
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/fitz/fitz.py", line 5899, in getPNGData
    barray = self._getImageData(1)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/fitz/fitz.py", line 5868, in _getImageData
    return _fitz.Pixmap__getImageData(self, format)
RuntimeError: compression error -2

bug enhancement

출처

echan00

가장 유용한 댓글

python-docx 로 플로팅(떠 있는) 이미지를 넣는 방법은 자주 요청되는 기능인 것 같습니다. 공유를 위해 여기 문서를 참조하세요.

# -*- coding: utf-8 -*-

'''
Implement floating image based on python-docx.

- Text wrapping style: BEHIND TEXT <wp:anchor behindDoc="1">
- Picture position: top-left corner of PAGE `<wp:positionH relativeFrom="page">`.

Create a docx sample (Layout | Positions | More Layout Options) and explore the 
source xml (Open as a zip | word | document.xml) to implement other text wrapping
styles and position modes per `CT_Anchor._anchor_xml()`.
'''

from docx.oxml import parse_xml, register_element_cls
from docx.oxml.ns import nsdecls
from docx.oxml.shape import CT_Picture
from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne

# refer to docx.oxml.shape.CT_Inline
class CT_Anchor(BaseOxmlElement):
    """
    ``<wp:anchor>`` element, container for a floating image.

    Modelled on ``docx.oxml.shape.CT_Inline``; the difference visible here is
    the XML template, which adds ``<wp:positionH>`` / ``<wp:positionV>``
    offsets (relative to the page) and ``behindDoc="1"`` with ``<wp:wrapNone/>``.
    """
    # Required child elements, exposed as attributes of the same name.
    extent = OneAndOnlyOne('wp:extent')
    docPr = OneAndOnlyOne('wp:docPr')
    graphic = OneAndOnlyOne('a:graphic')

    @classmethod
    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
        """
        Return a new ``<wp:anchor>`` element populated with the values passed
        as parameters.

        :param cx: picture width written to ``wp:extent`` (presumably EMU,
            as in python-docx -- confirm against caller).
        :param cy: picture height written to ``wp:extent``.
        :param shape_id: integer id written to ``wp:docPr`` and used in the
            generated name ``'Picture <id>'``.
        :param pic: ``pic:pic`` element inserted under ``a:graphicData``.
        :param pos_x: horizontal offset from the page origin; truncated to int.
        :param pos_y: vertical offset from the page origin; truncated to int.
        """
        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
        # Overwrite the placeholder values baked into the template.
        anchor.extent.cx = cx
        anchor.extent.cy = cy
        anchor.docPr.id = shape_id
        anchor.docPr.name = 'Picture %d' % shape_id
        anchor.graphic.graphicData.uri = (
            'http://schemas.openxmlformats.org/drawingml/2006/picture'
        )
        anchor.graphic.graphicData._insert_pic(pic)
        return anchor

    @classmethod
    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
        """
        Return a new `wp:anchor` element containing the `pic:pic` element
        specified by the argument values.

        :param shape_id: id for the anchor's ``wp:docPr``.
        :param rId: relationship id of the image part, passed to ``CT_Picture.new``.
        :param filename: image filename, passed to ``CT_Picture.new``.
        :param cx: picture width, used for both the picture and the anchor extent.
        :param cy: picture height, used for both the picture and the anchor extent.
        :param pos_x: horizontal offset from the page origin.
        :param pos_y: vertical offset from the page origin.
        """
        pic_id = 0  # Word doesn't seem to use this, but does not omit it
        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
        # `new()` already inserts `pic` under a:graphicData, so it is not
        # inserted a second time here.
        return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)

    @classmethod
    def _anchor_xml(cls, pos_x, pos_y):
        """
        Return the XML text of a template ``<wp:anchor>`` element.

        The offsets are converted with ``int()`` and written into the
        ``<wp:posOffset>`` children of ``<wp:positionH>`` / ``<wp:positionV>``;
        extent, docPr and graphicData carry placeholder values that `new()`
        replaces.
        """
        return (
            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
            '           behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
            '           %s>\n'
            '  <wp:simplePos x="0" y="0"/>\n'
            '  <wp:positionH relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionH>\n'
            '  <wp:positionV relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionV>\n'
            '  <wp:extent cx="914400" cy="914400"/>\n'
            '  <wp:wrapNone/>\n'
            '  <wp:docPr id="666" name="unnamed"/>\n'
            '  <wp:cNvGraphicFramePr>\n'
            '    <a:graphicFrameLocks noChangeAspect="1"/>\n'
            '  </wp:cNvGraphicFramePr>\n'
            '  <a:graphic>\n'
            '    <a:graphicData uri="URI not set"/>\n'
            '  </a:graphic>\n'
            '</wp:anchor>' % ( nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y) )
        )


# refer to docx.parts.story.BaseStoryPart.new_pic_inline
def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
    """Build the `wp:anchor` element for a floating picture.

    The image identified by *image_descriptor* is added to (or looked up in)
    *part*, scaled according to *width* and *height*, and wrapped in an anchor
    positioned at *pos_x*, *pos_y*.
    """
    rel_id, picture = part.get_or_add_image(image_descriptor)
    extent_x, extent_y = picture.scaled_dimensions(width, height)
    next_shape_id = part.next_id
    return CT_Anchor.new_pic_anchor(
        next_shape_id,
        rel_id,
        picture.filename,
        extent_x,
        extent_y,
        pos_x,
        pos_y,
    )


# refer to docx.text.run.add_picture
def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
    """Append a floating picture to paragraph *p*.

    The picture is anchored at the fixed offset (*pos_x*, *pos_y*) measured
    from the top-left point of the page. Nothing is returned.
    """
    target_run = p.add_run()
    floating = new_pic_anchor(
        target_run.part,
        image_path_or_stream,
        width,
        height,
        pos_x,
        pos_y,
    )
    target_run._r.add_drawing(floating)

# Register CT_Anchor as the element class for the `wp:anchor` tag, following the
# registrations in docx.oxml.shape.__init__.py. Presumably this is what lets
# parse_xml() hand back CT_Anchor instances -- confirm against python-docx.
register_element_cls('wp:anchor', CT_Anchor)


if __name__ == '__main__':
    # Demo: put a picture behind the text of a single paragraph and save the file.
    from docx import Document
    from docx.shared import Inches, Pt

    doc = Document()
    paragraph = doc.add_paragraph()

    # Floating image, 5 inches wide, offset (20pt, 30pt) from the page origin.
    add_float_picture(
        paragraph,
        'test.png',
        width=Inches(5.0),
        pos_x=Pt(20),
        pos_y=Pt(30),
    )

    # Filler text in the same paragraph as the picture.
    filler = 'Hello World'*50
    paragraph.add_run(filler)

    doc.save('output.docx')

dothinking 에 2020년 10월 24일

🎉2 👍2 🚀1

모든 9 댓글

이 케이스를 제공해 주셔서 감사합니다.

pdf에는 선, 곡선 및 이들의 조합과 같은 벡터 그래픽, 즉 path 가 많이 존재합니다. 그러나 pdf에서 이러한 경로를 추출할 때의 기술적인 문제로 인해 현재 이 라이브러리는 클리핑 경로를 무시합니다. 그 결과 일부 경로가 잘리지 않은 채 페이지 외부에 놓이게 되어 compression error -2 문제가 발생합니다.

게다가 이 pdf를 변환하는 데 두 가지 문제가 더 있습니다.

경로 색상이 잘못되었습니다. 근본 원인은 현재 Device Color Space (Gray/RGB/CMYK)만 고려되는 반면 이 pdf 샘플은 Indexed CS , DeviceN CS 와 같은 특수 색상 공간을 따를 수 있다는 것입니다.
겹친 이미지가 제거됩니다. 변환된 docx는 python-docx 로 작성되는데, python-docx 는 현재 플로팅 요소를 지원하지 않습니다. 따라서 플로팅 이미지는 타협책으로 제거됩니다.

따라서 불행히도 pdf2docx 은(는) 현재 pdf를 변환할 수 없습니다. 최소한 다음과 같은 노력을 기울여야 합니다.

pdf에서 경로를 추출할 때 클립 경로
더 많은 색 공간 구현
떠 다니는 이미지 소개

dothinking 에 2020년 10월 21일

👍1

명확한 설명에 대해 @dothinking 에게 감사드립니다. 이 라이브러리가 지금보다 더 널리 알려지지 않았다는 것이 놀랍습니다. 현재 버전은 이미 매우 훌륭하며 많은 사람들이 혜택을 받을 수 있다는 것을 알고 있습니다.

귀하가 나열한 문제를 해결하는 데 도움이 될 수 있는 방법을 알려주세요.

echan00 에 2020년 10월 21일

👍1

@echan00 감사합니다.

이 문제에 대한 몇 가지 진행 상황:

[x] 플로팅 이미지가 지원됩니다 .
[ ] 클립 경로 및 색 공간 -> 다른 업스트림 라이브러리 PyMuPDF 에서 경로 추출에 대한 새로운 기능을 게시했습니다. 조사해 보고 이 문제를 해결할 수 있기를 바랍니다.

그 후에는 어떤 테스트나 제안도 환영합니다.

2020-12-31에 대한 댓글: 최신 PyMuPDF 1.18.5는 이 문제를 부분적으로 해결했지만 완벽하지는 않습니다. 특히 클리핑 경로가 그렇습니다.

dothinking 에 2020년 10월 24일

👍1

인라인 이미지는 python-docx 에서 지원되므로 플로팅 이미지를 탐색하는 단계는 다음과 같습니다.

두 개의 docx 파일을 만듭니다. 하나는 인라인 이미지를, 다른 하나는 플로팅 이미지(이 경우 behind text 모드)를 포함합니다.
이 두 파일 간의 소스 xml 차이를 확인하십시오.
관찰된 구조와 인라인 이미지에 대한 코드를 기반으로 플로팅 이미지 구현

xml 구조 결과:

인라인 이미지는 <w:drawing> 아래의 <wp:inline> 노드입니다.
플로팅 이미지는 <w:drawing> 아래의 <wp:anchor> 노드입니다.
인라인 이미지의 모든 하위 노드 외에도 플로팅 이미지에는 고정 위치를 정의하는 <wp:positionH> 및 <wp:positionV> 노드가 포함됩니다.

따라서 아이디어는 <wp:anchor> 노드를 만든 다음 하위 노드를 추가하는 것입니다.

인라인 이미지와 동일한 모든 노드
<wp:positionH> 및 <wp:positionV>

dothinking 에 2020년 10월 24일

python-docx 로 플로팅(떠 있는) 이미지를 넣는 방법은 자주 요청되는 기능인 것 같습니다. 공유를 위해 여기 문서를 참조하세요.

# -*- coding: utf-8 -*-

'''
Implement floating image based on python-docx.

- Text wrapping style: BEHIND TEXT <wp:anchor behindDoc="1">
- Picture position: top-left corner of PAGE `<wp:positionH relativeFrom="page">`.

Create a docx sample (Layout | Positions | More Layout Options) and explore the 
source xml (Open as a zip | word | document.xml) to implement other text wrapping
styles and position modes per `CT_Anchor._anchor_xml()`.
'''

from docx.oxml import parse_xml, register_element_cls
from docx.oxml.ns import nsdecls
from docx.oxml.shape import CT_Picture
from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne

# refer to docx.oxml.shape.CT_Inline
class CT_Anchor(BaseOxmlElement):
    """
    ``<wp:anchor>`` element, container for a floating image.

    Modelled on ``docx.oxml.shape.CT_Inline``; the difference visible here is
    the XML template, which adds ``<wp:positionH>`` / ``<wp:positionV>``
    offsets (relative to the page) and ``behindDoc="1"`` with ``<wp:wrapNone/>``.
    """
    # Required child elements, exposed as attributes of the same name.
    extent = OneAndOnlyOne('wp:extent')
    docPr = OneAndOnlyOne('wp:docPr')
    graphic = OneAndOnlyOne('a:graphic')

    @classmethod
    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
        """
        Return a new ``<wp:anchor>`` element populated with the values passed
        as parameters.

        :param cx: picture width written to ``wp:extent`` (presumably EMU,
            as in python-docx -- confirm against caller).
        :param cy: picture height written to ``wp:extent``.
        :param shape_id: integer id written to ``wp:docPr`` and used in the
            generated name ``'Picture <id>'``.
        :param pic: ``pic:pic`` element inserted under ``a:graphicData``.
        :param pos_x: horizontal offset from the page origin; truncated to int.
        :param pos_y: vertical offset from the page origin; truncated to int.
        """
        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
        # Overwrite the placeholder values baked into the template.
        anchor.extent.cx = cx
        anchor.extent.cy = cy
        anchor.docPr.id = shape_id
        anchor.docPr.name = 'Picture %d' % shape_id
        anchor.graphic.graphicData.uri = (
            'http://schemas.openxmlformats.org/drawingml/2006/picture'
        )
        anchor.graphic.graphicData._insert_pic(pic)
        return anchor

    @classmethod
    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
        """
        Return a new `wp:anchor` element containing the `pic:pic` element
        specified by the argument values.

        :param shape_id: id for the anchor's ``wp:docPr``.
        :param rId: relationship id of the image part, passed to ``CT_Picture.new``.
        :param filename: image filename, passed to ``CT_Picture.new``.
        :param cx: picture width, used for both the picture and the anchor extent.
        :param cy: picture height, used for both the picture and the anchor extent.
        :param pos_x: horizontal offset from the page origin.
        :param pos_y: vertical offset from the page origin.
        """
        pic_id = 0  # Word doesn't seem to use this, but does not omit it
        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
        # `new()` already inserts `pic` under a:graphicData, so it is not
        # inserted a second time here.
        return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)

    @classmethod
    def _anchor_xml(cls, pos_x, pos_y):
        """
        Return the XML text of a template ``<wp:anchor>`` element.

        The offsets are converted with ``int()`` and written into the
        ``<wp:posOffset>`` children of ``<wp:positionH>`` / ``<wp:positionV>``;
        extent, docPr and graphicData carry placeholder values that `new()`
        replaces.
        """
        return (
            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
            '           behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
            '           %s>\n'
            '  <wp:simplePos x="0" y="0"/>\n'
            '  <wp:positionH relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionH>\n'
            '  <wp:positionV relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionV>\n'
            '  <wp:extent cx="914400" cy="914400"/>\n'
            '  <wp:wrapNone/>\n'
            '  <wp:docPr id="666" name="unnamed"/>\n'
            '  <wp:cNvGraphicFramePr>\n'
            '    <a:graphicFrameLocks noChangeAspect="1"/>\n'
            '  </wp:cNvGraphicFramePr>\n'
            '  <a:graphic>\n'
            '    <a:graphicData uri="URI not set"/>\n'
            '  </a:graphic>\n'
            '</wp:anchor>' % ( nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y) )
        )


# refer to docx.parts.story.BaseStoryPart.new_pic_inline
def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
    """Build the `wp:anchor` element for a floating picture.

    The image identified by *image_descriptor* is added to (or looked up in)
    *part*, scaled according to *width* and *height*, and wrapped in an anchor
    positioned at *pos_x*, *pos_y*.
    """
    rel_id, picture = part.get_or_add_image(image_descriptor)
    extent_x, extent_y = picture.scaled_dimensions(width, height)
    next_shape_id = part.next_id
    return CT_Anchor.new_pic_anchor(
        next_shape_id,
        rel_id,
        picture.filename,
        extent_x,
        extent_y,
        pos_x,
        pos_y,
    )


# refer to docx.text.run.add_picture
def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
    """Append a floating picture to paragraph *p*.

    The picture is anchored at the fixed offset (*pos_x*, *pos_y*) measured
    from the top-left point of the page. Nothing is returned.
    """
    target_run = p.add_run()
    floating = new_pic_anchor(
        target_run.part,
        image_path_or_stream,
        width,
        height,
        pos_x,
        pos_y,
    )
    target_run._r.add_drawing(floating)

# Register CT_Anchor as the element class for the `wp:anchor` tag, following the
# registrations in docx.oxml.shape.__init__.py. Presumably this is what lets
# parse_xml() hand back CT_Anchor instances -- confirm against python-docx.
register_element_cls('wp:anchor', CT_Anchor)


if __name__ == '__main__':
    # Demo: put a picture behind the text of a single paragraph and save the file.
    from docx import Document
    from docx.shared import Inches, Pt

    doc = Document()
    paragraph = doc.add_paragraph()

    # Floating image, 5 inches wide, offset (20pt, 30pt) from the page origin.
    add_float_picture(
        paragraph,
        'test.png',
        width=Inches(5.0),
        pos_x=Pt(20),
        pos_y=Pt(30),
    )

    # Filler text in the same paragraph as the picture.
    filler = 'Hello World'*50
    paragraph.add_run(filler)

    doc.save('output.docx')

dothinking 에 2020년 10월 24일

🎉2 👍2 🚀1

좋은 @dothinking , 문제가 정확히 무엇인지 알고 있는 것 같습니다. 다양한 PDF가 있습니다. 준비가 되면 테스트를 도와드릴 수 있습니다.

echan00 에 2020년 10월 26일

@dothinking 귀하의 코드 샘플에 대해

tonysepia 에 2020년 11월 24일

이 프로젝트에 너무 오랫동안 시간을 할애하지 못했습니다. 이제 이 문제를 부분적으로 해결할 수 있는 새 버전 v0.5.0 사용할 수 있습니다.

플로팅 이미지가 지원됩니다.
경로 추출은 업스트림 라이브러리 PyMuPDF 에서 지원되지만 클리핑 경로와 같은 복잡한 모양에는 적합하지 않습니다.

이 최신 버전을 사용하면 샘플 pdf를 성공적으로 변환할 수 있지만 복잡하고 화려한 스타일로 인해 변환된 docx 파일의 품질을 높이려면 여전히 많은 작업이 필요합니다.

dothinking 에 2020년 12월 31일

와우 이것은 훌륭한 업그레이드입니다. @dothinking의 노고에 진심으로 감사드립니다.

echan00 에 2021년 01월 01일

이 페이지가 도움이 되었나요?

0 / 5 - 0 등급

Pdf2docx: 압축 오류 -2

가장 유용한 댓글

모든 9 댓글

관련 문제