Pdf2docx: خطأ ضغط -2

تم إنشاؤها في ٢٠ أكتوبر ٢٠٢٠ · 9 تعليقات · المصدر: dothinking/pdf2docx

حدث خطأ compression error -2 . سيكون من الرائع أن يتمكن أي شخص من تقديم بعض المؤشرات

إرفاق ملف PDF بالمشكلة:
5_EN.pdf

رسالة خطأ:

Processing Pages: 1/28...mupdf: compression error -2
Traceback (most recent call last):
  File "/Users/erikchan/Downloads/convert.py", line 10, in <module>
    parse(pdf_files[i], docx_files[i])
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/main.py", line 31, in parse
    cv.make_docx(indexes, multi_processing)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/converter.py", line 118, in make_docx
    self._make_docx(page_indexes)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/converter.py", line 192, in _make_docx
    self.initialize(page).parse().make_page(self.doc_docx)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/converter.py", line 172, in initialize
    images, paths = self._paths_extractor.extract_paths(page)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/shape/Path.py", line 61, in extract_paths
    image = largest.to_image(page) if largest.contains_curve else None
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/shape/Path.py", line 140, in to_image
    return ImagesExtractor.clip_page(page, bbox, zoom)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/image/Image.py", line 60, in clip_page
    return cls.to_raw_dict(image, bbox)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pdf2docx/image/Image.py", line 50, in to_raw_dict
    'image': image.getPNGData()
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/fitz/fitz.py", line 5899, in getPNGData
    barray = self._getImageData(1)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/fitz/fitz.py", line 5868, in _getImageData
    return _fitz.Pixmap__getImageData(self, format)
RuntimeError: compression error -2

bug enhancement

مصدر

echan00

التعليق الأكثر فائدة

يبدو أن الصورة العائمة مع python-docx هي طلب شائع ، المستند هنا للمشاركة.

# -*- coding: utf-8 -*-

'''
Implement floating image based on python-docx.

- Text wrapping style: BEHIND TEXT <wp:anchor behindDoc="1">
- Picture position: top-left corner of PAGE `<wp:positionH relativeFrom="page">`.

Create a docx sample (Layout | Positions | More Layout Options) and explore the 
source xml (Open as a zip | word | document.xml) to implement other text wrapping
styles and position modes per `CT_Anchor._anchor_xml()`.
'''

from docx.oxml import parse_xml, register_element_cls
from docx.oxml.ns import nsdecls
from docx.oxml.shape import CT_Picture
from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne

# refer to docx.oxml.shape.CT_Inline
class CT_Anchor(BaseOxmlElement):
    """
    ``<wp:anchor>`` element, container for a floating image.

    Modelled on ``docx.oxml.shape.CT_Inline``; in addition to the inline
    picture children, the generated XML carries ``<wp:positionH>`` and
    ``<wp:positionV>`` offsets relative to the page.
    """
    # Required child elements, exposed as attributes by the xmlchemy declarations.
    extent = OneAndOnlyOne('wp:extent')
    docPr = OneAndOnlyOne('wp:docPr')
    graphic = OneAndOnlyOne('a:graphic')

    @classmethod
    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
        """
        Return a new ``<wp:anchor>`` element populated with the values passed
        as parameters.

        *cx* and *cy* are written to ``wp:extent``, *shape_id* to ``wp:docPr``
        (as id and as part of the name ``'Picture <id>'``), *pic* is inserted
        into ``a:graphicData`` and *pos_x* / *pos_y* become the horizontal and
        vertical ``wp:posOffset`` values.
        """
        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
        anchor.extent.cx = cx
        anchor.extent.cy = cy
        anchor.docPr.id = shape_id
        anchor.docPr.name = 'Picture %d' % shape_id
        anchor.graphic.graphicData.uri = (
            'http://schemas.openxmlformats.org/drawingml/2006/picture'
        )
        anchor.graphic.graphicData._insert_pic(pic)
        return anchor

    @classmethod
    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
        """
        Return a new ``wp:anchor`` element containing the ``pic:pic`` element
        specified by the argument values.
        """
        pic_id = 0  # Word doesn't seem to use this, but does not omit it
        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
        # cls.new() already inserts `pic` into a:graphicData, so it is not
        # inserted a second time here.
        return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)

    @classmethod
    def _anchor_xml(cls, pos_x, pos_y):
        """
        Return the XML template string for a ``<wp:anchor>`` element.

        The template uses ``behindDoc="1"`` with ``<wp:wrapNone/>`` and
        positions the image relative to the page. *pos_x* and *pos_y* are
        converted with ``int()`` before being written as ``wp:posOffset``
        (presumably EMU, as the demo passes ``docx.shared.Pt`` values --
        confirm against the OOXML spec). Extent, docPr and graphicData URI are
        placeholders that ``new()`` overwrites.
        """
        return (
            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
            '           behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
            '           %s>\n'
            '  <wp:simplePos x="0" y="0"/>\n'
            '  <wp:positionH relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionH>\n'
            '  <wp:positionV relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionV>\n'
            '  <wp:extent cx="914400" cy="914400"/>\n'
            '  <wp:wrapNone/>\n'
            '  <wp:docPr id="666" name="unnamed"/>\n'
            '  <wp:cNvGraphicFramePr>\n'
            '    <a:graphicFrameLocks noChangeAspect="1"/>\n'
            '  </wp:cNvGraphicFramePr>\n'
            '  <a:graphic>\n'
            '    <a:graphicData uri="URI not set"/>\n'
            '  </a:graphic>\n'
            '</wp:anchor>' % ( nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y) )
        )


# refer to docx.parts.story.BaseStoryPart.new_pic_inline
def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
    """Build and return a ``wp:anchor`` element for a floating picture.

    The image identified by *image_descriptor* is added to (or looked up in)
    *part*, its dimensions are scaled from *width* and *height*, and the
    result is wrapped in an anchor positioned at *pos_x*, *pos_y*.
    """
    rel_id, img = part.get_or_add_image(image_descriptor)
    scaled_cx, scaled_cy = img.scaled_dimensions(width, height)
    return CT_Anchor.new_pic_anchor(
        part.next_id, rel_id, img.filename, scaled_cx, scaled_cy, pos_x, pos_y
    )


# refer to docx.text.run.add_picture
def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
    """Insert a floating picture into paragraph *p*.

    A new run is appended to *p* and the picture is attached to it as a
    drawing, offset by `pos_x` and `pos_y` from the top-left point of the page.
    """
    picture_run = p.add_run()
    picture_anchor = new_pic_anchor(
        picture_run.part, image_path_or_stream, width, height, pos_x, pos_y
    )
    picture_run._r.add_drawing(picture_anchor)

# Register CT_Anchor as the element class for the `wp:anchor` tag, presumably
# so that parse_xml() returns CT_Anchor instances for it (refer to
# docx.oxml.shape.__init__.py).
register_element_cls('wp:anchor', CT_Anchor)


if __name__ == '__main__':
    # Demo: write 'output.docx' containing one paragraph with a floating
    # picture ('test.png') followed by repeated text.
    from docx import Document
    from docx.shared import Inches, Pt

    document = Document()
    p = document.add_paragraph()

    # floating image: 5 inches wide, offset (20pt, 30pt) from the page corner
    picture_width = Inches(5.0)
    offset_x = Pt(20)
    offset_y = Pt(30)
    add_float_picture(p, 'test.png', width=picture_width, pos_x=offset_x, pos_y=offset_y)

    # text in the same paragraph
    filler_text = 'Hello World'*50
    p.add_run(filler_text)

    document.save('output.docx')

dothinking في ٢٤ أكتوبر ٢٠٢٠

🎉2 👍2 🚀1

جميع التعليقات (9)

شكرا لتقديم هذه القضية.

يوجد الكثير من الرسومات المتجهة ، مثل path مثل الخط والمنحنى وتركيبتهما ، في ملف pdf الخاص بك. ومع ذلك ، يتم تجاهل مسار القطع حاليًا بواسطة هذه المكتبة بسبب مشكلة فنية عند استخراج هذه المسارات من ملف pdf. بعض المسارات خارج الصفحة دون أن يتم قصها ، مما ينتج عنه مشكلة compression error -2 هذه.

إلى جانب ذلك ، هناك مشكلتان أخريان لتحويل ملف pdf هذا:

لون المسار غير صحيح. أعتقد أن السبب الأساسي هو أنه يتم حاليًا النظر في Device Color Space (Gray / RGB / CMYK) فقط ، بينما قد تتبع عينة pdf هذه مساحة لونية خاصة مثل Indexed CS ، DeviceN CS .
تتم إزالة الصور المتداخلة. يُستخدم python-docx لكتابة ملف docx المحوَّل ، لكن python-docx لا يدعم العناصر العائمة حاليًا. لذلك ، تتم إزالة الصور العائمة كحل وسط.

لذلك ، لسوء الحظ ، pdf2docx غير قادر على تحويل ملف pdf الخاص بك في الوقت الحالي. يجب على الأقل بذل الجهود التالية:

قص المسارات (clipping path) عند استخراجها من pdf
تنفيذ المزيد من مساحة اللون
إدخال الصور العائمة

dothinking في ٢١ أكتوبر ٢٠٢٠

👍1

شكرا dothinking على الشرح الواضح. أنا مندهش من أن هذه المكتبة ليست أكثر شهرة مما هي عليه. الإصدار الحالي جيد جدًا بالفعل وأعلم أن الكثير من الأشخاص يمكنهم الاستفادة منه.

يُرجى إعلامي كيف يمكنني المساعدة في حل أي من المشكلات التي أدرجتها (سأحتاج إلى بعض الإرشادات.) سواء تم حل الأخطاء أو الاختبار أو غير ذلك.

echan00 في ٢١ أكتوبر ٢٠٢٠

👍1

شكرا جزيلا @echan00.

بعض التقدم في هذه المسألة:

[x] الصورة العائمة مدعومة.
[ ] مسار القطع ومساحة اللون -> الخبر الجيد أن مكتبة المنبع PyMuPDF نشرت ميزة جديدة لاستخراج المسارات. سأبحث في الأمر وآمل أن أتمكن من حل هذه المشكلة.

بعد ذلك ، يتم تقدير أي اختبار أو اقتراحات.

تعليق مُضاف بتاريخ 2020-12-31: الإصدار الأحدث PyMuPDF 1.18.5 يحل هذه المشكلة جزئيًا ، ولكن ليس تمامًا ، خاصة فيما يتعلق بمسار القطع.

dothinking في ٢٤ أكتوبر ٢٠٢٠

👍1

نظرًا لأن الصورة المضمنة مدعومة بـ python-docx ، فإن خطوات استكشاف الصورة العائمة:

قم بإنشاء ملفي docx ، أحدهما يحتوي على صورة مضمنة والآخر صورة عائمة (في هذه الحالة ، الوضع behind text )
تحقق من اختلاف مصدر xml بين هذين الملفين
تنفيذ الصورة العائمة على أساس الهيكل الملحوظ والرمز للصورة المضمنة

نتائج بنية xml:

الصورة المضمنة هي عقدة <wp:inline> تحت <w:drawing>
الصورة العائمة هي عقدة <wp:anchor> تحت <w:drawing>
إلى جانب جميع العقد الفرعية للصورة المضمنة ، تحتوي الصورة العائمة أيضًا على <wp:positionH> و <wp:positionV> لتحديد الموضع الثابت

إذن ، الفكرة هي إنشاء عقدة <wp:anchor> ، ثم إلحاق العقد الفرعية:

جميع العقد متشابهة مع الصورة المضمنة
<wp:positionH> و <wp:positionV>

dothinking في ٢٤ أكتوبر ٢٠٢٠

يبدو أن الصورة العائمة مع python-docx هي طلب شائع ، المستند هنا للمشاركة.

# -*- coding: utf-8 -*-

'''
Implement floating image based on python-docx.

- Text wrapping style: BEHIND TEXT <wp:anchor behindDoc="1">
- Picture position: top-left corner of PAGE `<wp:positionH relativeFrom="page">`.

Create a docx sample (Layout | Positions | More Layout Options) and explore the 
source xml (Open as a zip | word | document.xml) to implement other text wrapping
styles and position modes per `CT_Anchor._anchor_xml()`.
'''

from docx.oxml import parse_xml, register_element_cls
from docx.oxml.ns import nsdecls
from docx.oxml.shape import CT_Picture
from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne

# refer to docx.oxml.shape.CT_Inline
class CT_Anchor(BaseOxmlElement):
    """
    ``<wp:anchor>`` element, container for a floating image.

    Modelled on ``docx.oxml.shape.CT_Inline``; in addition to the inline
    picture children, the generated XML carries ``<wp:positionH>`` and
    ``<wp:positionV>`` offsets relative to the page.
    """
    # Required child elements, exposed as attributes by the xmlchemy declarations.
    extent = OneAndOnlyOne('wp:extent')
    docPr = OneAndOnlyOne('wp:docPr')
    graphic = OneAndOnlyOne('a:graphic')

    @classmethod
    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
        """
        Return a new ``<wp:anchor>`` element populated with the values passed
        as parameters.

        *cx* and *cy* are written to ``wp:extent``, *shape_id* to ``wp:docPr``
        (as id and as part of the name ``'Picture <id>'``), *pic* is inserted
        into ``a:graphicData`` and *pos_x* / *pos_y* become the horizontal and
        vertical ``wp:posOffset`` values.
        """
        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
        anchor.extent.cx = cx
        anchor.extent.cy = cy
        anchor.docPr.id = shape_id
        anchor.docPr.name = 'Picture %d' % shape_id
        anchor.graphic.graphicData.uri = (
            'http://schemas.openxmlformats.org/drawingml/2006/picture'
        )
        anchor.graphic.graphicData._insert_pic(pic)
        return anchor

    @classmethod
    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
        """
        Return a new ``wp:anchor`` element containing the ``pic:pic`` element
        specified by the argument values.
        """
        pic_id = 0  # Word doesn't seem to use this, but does not omit it
        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
        # cls.new() already inserts `pic` into a:graphicData, so it is not
        # inserted a second time here.
        return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)

    @classmethod
    def _anchor_xml(cls, pos_x, pos_y):
        """
        Return the XML template string for a ``<wp:anchor>`` element.

        The template uses ``behindDoc="1"`` with ``<wp:wrapNone/>`` and
        positions the image relative to the page. *pos_x* and *pos_y* are
        converted with ``int()`` before being written as ``wp:posOffset``
        (presumably EMU, as the demo passes ``docx.shared.Pt`` values --
        confirm against the OOXML spec). Extent, docPr and graphicData URI are
        placeholders that ``new()`` overwrites.
        """
        return (
            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
            '           behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
            '           %s>\n'
            '  <wp:simplePos x="0" y="0"/>\n'
            '  <wp:positionH relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionH>\n'
            '  <wp:positionV relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionV>\n'
            '  <wp:extent cx="914400" cy="914400"/>\n'
            '  <wp:wrapNone/>\n'
            '  <wp:docPr id="666" name="unnamed"/>\n'
            '  <wp:cNvGraphicFramePr>\n'
            '    <a:graphicFrameLocks noChangeAspect="1"/>\n'
            '  </wp:cNvGraphicFramePr>\n'
            '  <a:graphic>\n'
            '    <a:graphicData uri="URI not set"/>\n'
            '  </a:graphic>\n'
            '</wp:anchor>' % ( nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y) )
        )


# refer to docx.parts.story.BaseStoryPart.new_pic_inline
def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
    """Build and return a ``wp:anchor`` element for a floating picture.

    The image identified by *image_descriptor* is added to (or looked up in)
    *part*, its dimensions are scaled from *width* and *height*, and the
    result is wrapped in an anchor positioned at *pos_x*, *pos_y*.
    """
    rel_id, img = part.get_or_add_image(image_descriptor)
    scaled_cx, scaled_cy = img.scaled_dimensions(width, height)
    return CT_Anchor.new_pic_anchor(
        part.next_id, rel_id, img.filename, scaled_cx, scaled_cy, pos_x, pos_y
    )


# refer to docx.text.run.add_picture
def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
    """Insert a floating picture into paragraph *p*.

    A new run is appended to *p* and the picture is attached to it as a
    drawing, offset by `pos_x` and `pos_y` from the top-left point of the page.
    """
    picture_run = p.add_run()
    picture_anchor = new_pic_anchor(
        picture_run.part, image_path_or_stream, width, height, pos_x, pos_y
    )
    picture_run._r.add_drawing(picture_anchor)

# Register CT_Anchor as the element class for the `wp:anchor` tag, presumably
# so that parse_xml() returns CT_Anchor instances for it (refer to
# docx.oxml.shape.__init__.py).
register_element_cls('wp:anchor', CT_Anchor)


if __name__ == '__main__':
    # Demo: write 'output.docx' containing one paragraph with a floating
    # picture ('test.png') followed by repeated text.
    from docx import Document
    from docx.shared import Inches, Pt

    document = Document()
    p = document.add_paragraph()

    # floating image: 5 inches wide, offset (20pt, 30pt) from the page corner
    picture_width = Inches(5.0)
    offset_x = Pt(20)
    offset_y = Pt(30)
    add_float_picture(p, 'test.png', width=picture_width, pos_x=offset_x, pos_y=offset_y)

    # text in the same paragraph
    filler_text = 'Hello World'*50
    p.add_run(filler_text)

    document.save('output.docx')

dothinking في ٢٤ أكتوبر ٢٠٢٠

🎉2 👍2 🚀1

لطيفة dothinking ، يبدو أنك تعرف ما هي المشكلات بالضبط. لدي مجموعة متنوعة من ملفات PDF يمكنني المساعدة في اختبارها بمجرد أن تكون جاهزًا

echan00 في ٢٦ أكتوبر ٢٠٢٠

dothinking شكرًا جزيلاً لك على نموذج التعليمات البرمجية الخاص بك! يحل مشكلتي تماما !!!!

tonysepia في ٢٤ نوفمبر ٢٠٢٠

لم أحصل على وقت لهذا المشروع لفترة طويلة. الإصدار الجديد v0.5.0 متاح الآن لحل هذه المشكلة جزئيًا:

الصورة العائمة مدعومة الآن.
يتم دعم استخراج المسار بواسطة مكتبة المنبع PyMuPDF ، ولكن ليس جيدًا للأشكال المعقدة ، مثل مسار القطع.

باستخدام هذا الإصدار الأخير ، يمكن تحويل نموذج pdf بنجاح ، ولكن لا تزال بحاجة إلى الكثير من العمل لتحسين جودة ملف docx المحول ، نظرًا للأسلوب المعقد / الرائع.

dothinking في ٣١ ديسمبر ٢٠٢٠

واو هذه ترقية عظيمة. شكرًا جزيلاً على عملك الشاق dothinking

echan00 في ١ يناير ٢٠٢١

هل كانت هذه الصفحة مفيدة؟

0 / 5 - 0 التقييمات