from docling.document_converter import DocumentConverter

# Convert the source PDF; the exported document is available as result.document.
converter = DocumentConverter()
result = converter.convert("document.pdf")

# Export to Markdown string
markdown = result.document.export_to_markdown()
print(markdown)

# Save to file
result.document.save_as_markdown("output.md")
from docling.document_converter import DocumentConverter

# Convert the source PDF; the exported document is available as result.document.
converter = DocumentConverter()
result = converter.convert("document.pdf")

# Standard Markdown with structure
markdown = result.document.export_to_markdown()
print(markdown)
Example output:
# Document Title

## Section 1

This is a paragraph with **bold** and *italic* text.

### Subsection 1.1

- Bullet point 1
- Bullet point 2

| Header 1 | Header 2 |
|----------|----------|
| Cell 1 | Cell 2 |
from docling_core.types.doc import ImageRefMode

# `result` is the value returned by an earlier DocumentConverter().convert(...) call.

# Embed images as base64 data URLs
html = result.document.export_to_html(
    image_mode=ImageRefMode.EMBEDDED,
)

# Save to file
result.document.save_as_html(
    "output.html",
    image_mode=ImageRefMode.EMBEDDED,
)
Images are embedded directly in HTML, creating a standalone file.
import json

from docling.document_converter import DocumentConverter

# Convert the source PDF; the exported document is available as result.document.
converter = DocumentConverter()
result = converter.convert("document.pdf")

# Export to dict
data = result.document.export_to_dict()

# Pretty-print JSON
print(json.dumps(data, indent=2))

# Save to file (explicit encoding so the output does not depend on the platform default)
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

# Or use helper
result.document.save_as_json("output.json")
from docling_core.types.doc import ImageRefMode

# `result` is the value returned by an earlier DocumentConverter().convert(...) call.

# NOTE(review): confirm that export_to_dict() accepts an `image_mode` keyword in
# the installed docling-core version; if it does not, pass `image_mode` only to
# save_as_json() as shown at the end of this snippet.

# Embed images as base64 in JSON
data = result.document.export_to_dict(
    image_mode=ImageRefMode.EMBEDDED,
)

# Or use placeholders
data = result.document.export_to_dict(
    image_mode=ImageRefMode.PLACEHOLDER,
)

result.document.save_as_json(
    "output.json",
    image_mode=ImageRefMode.EMBEDDED,
)
DocTags is a structured text format designed for NLP pipelines:
from docling.document_converter import DocumentConverter

# Convert the source PDF; the exported document is available as result.document.
converter = DocumentConverter()
result = converter.convert("document.pdf")

# Export to DocTags
doctags = result.document.export_to_doctags()
print(doctags)

# Save to file
result.document.save_as_doctags("output.doctags.txt")
Example output:
<title>Document Title</title>
<section-header>Section 1</section-header>
<paragraph>This is a paragraph with bold and italic text.</paragraph>
<subsection-header>Subsection 1.1</subsection-header>
<list-item>Bullet point 1</list-item>
<list-item>Bullet point 2</list-item>
<table>
  <row>
    <cell>Header 1</cell>
    <cell>Header 2</cell>
  </row>
  <row>
    <cell>Cell 1</cell>
    <cell>Cell 2</cell>
  </row>
</table>
import yaml

from docling.document_converter import DocumentConverter

# Convert the source PDF; the exported document is available as result.document.
converter = DocumentConverter()
result = converter.convert("document.pdf")

# Export to dict, then to YAML
data = result.document.export_to_dict()
yaml_str = yaml.safe_dump(data, default_flow_style=False)
print(yaml_str)

# Save to file (explicit encoding so the output does not depend on the platform default)
with open("output.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(data, f, default_flow_style=False)
Example output:
schema_name: DoclingDocument
version: 1.0.0
name: document.pdf
metadata:
  pages: 10
  format: PDF
pages:
  - page_no: 1
    size:
      width: 612.0
      height: 792.0
body:
  - self_ref: '#/texts/1'
    type: paragraph
    text: This is a paragraph.