Stop Fighting With OCR: Convert Any Document to Markdown, HTML, or JSON Using SmolDocling
If you’ve spent hours testing OCR tools like Tesseract or Textract, only to see them mangle tables, skip headers, or flatten document structure, you’re not alone.
OCR still works great for plain text. But once your documents include multi-column layouts, code blocks, equations, or even structured lists — traditional OCR tools fall short.
What we need is something smarter. Something that doesn’t just read text, but understands the layout, semantics, and structure of a document.
And that’s where SmolDocling and the Docling library come in.
What Are SmolDocling and Docling?
- SmolDocling is a 256M-parameter Visual Language Model that reads document pages as images and outputs semantic markup called DocTags.
- Docling is a Python toolkit that takes these DocTags and converts them to structured formats.
SmolDocling replaces traditional OCR with a smarter, layout-aware approach that understands documents.
What Are DocTags?
Here’s a snippet of what SmolDocling outputs after processing a document page:
<heading level="1">Introduction</heading>
<paragraph>SmolDocling is a compact vision-language model for document understanding.</paragraph>
<table>
<row><cell>Model</cell><cell>Params</cell></row>
<row><cell>SmolDocling</cell><cell>256M</cell></row>
</table>
DocTags aren’t just about layout — they preserve semantics, reading order, and hierarchy. Which means downstream tools like Docling can do more accurate conversions than any post-hoc heuristic-based parser.
Setup Instructions (Mac M1/M2/M3/M4 Recommended)
This tutorial is tested on a MacBook M4 and uses MLX, Apple’s Metal-accelerated ML backend.
It will still work on other platforms, but you may need to modify a few model-loading lines to use PyTorch or Hugging Face directly.
Step 1: Create Python Environment
# Install uv if you haven’t yet
pip install uv
# Create a virtual environment
uv venv smoldocling-env
source smoldocling-env/bin/activate
Step 2: Install Required Packages
uv pip install gradio mlx-vlm docling-core pillow pdf2image requests
brew install poppler
Building the App
This is going to be our setup:
1. Load PDFs or Images (local or via URL)
from PIL import Image
from pathlib import Path
from urllib.parse import urlparse
from pdf2image import convert_from_path, convert_from_bytes
import requests
from io import BytesIO
def load_input_resource(input_path):
    """Load a PDF or image from a local path or an HTTP(S) URL.

    Args:
        input_path: Filesystem path or ``http``/``https`` URL pointing at a
            PDF or an image file.

    Returns:
        A list of PIL images: one per page for a PDF, otherwise a single
        image.

    Raises:
        requests.HTTPError: If the URL responds with an error status.
    """
    # Only http(s) counts as remote. Testing for "any scheme" would also match
    # Windows drive letters such as "C:\\doc.pdf" (scheme "c").
    if urlparse(str(input_path)).scheme.lower() in ("http", "https"):
        response = requests.get(input_path, timeout=10)
        # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        payload = response.content
        # Sniff the magic bytes: URLs often carry no usable file extension.
        if payload[:4] == b"%PDF":
            return list(convert_from_bytes(payload))
        return [Image.open(BytesIO(payload))]

    path = Path(input_path)
    if path.suffix.lower() == ".pdf":
        return list(convert_from_path(str(path)))
    return [Image.open(path)]
2. Load the SmolDocling Model (with MLX backend)
import mlx.core as mx
from mlx_vlm import load
from mlx_vlm.utils import load_config
# Cache for the loaded (model, processor, config) tuple so repeated calls
# (one per processed document) do not reload the weights every time.
_LOADED_MODEL_BUNDLE = None


def load_model():
    """Load the SmolDocling MLX model, its processor and its config.

    The first call loads everything from ``model_path``; later calls return
    the same cached objects.

    Returns:
        A ``(model, processor, config)`` tuple.
    """
    global _LOADED_MODEL_BUNDLE
    if _LOADED_MODEL_BUNDLE is None:
        mx.set_default_device(mx.gpu)
        model_path = "ds4sd/SmolDocling-256M-preview-mlx-bf16"
        model, processor = load(model_path)
        model.eval()
        # Force evaluation of the parameters now rather than on first use.
        mx.eval(model.parameters())
        config = load_config(model_path)
        _LOADED_MODEL_BUNDLE = (model, processor, config)
    return _LOADED_MODEL_BUNDLE
3. Process Documents
def process_document(file_obj, url_input, export_format):
    """Process a document with SmolDocling and return the results.

    Args:
        file_obj: Uploaded file from the UI: a path string, a file-like
            object exposing ``read()``, or an object whose ``name`` attribute
            is a path. ``None`` when nothing was uploaded.
        url_input: Text entered in the URL box; used only when ``file_obj``
            is ``None``.
        export_format: ``"Markdown"``, ``"HTML"`` or ``"JSON"``.

    Returns:
        A ``(result, preview_image, log)`` tuple. On success ``result`` is the
        exported document text, ``preview_image`` is the first page and
        ``log`` holds the raw DocTags for every page. When no input is given
        or no page can be extracted, ``result`` is a message and the other
        two items are ``None``. On any exception ``result`` is an error
        message, ``preview_image`` is ``None`` and ``log`` is the traceback.
    """
    import shutil
    import traceback

    end_tag = "</doctag>"
    temp_dir = None
    try:
        # Determine the input source.
        if file_obj is not None:
            if isinstance(file_obj, str):
                # Already a path on disk.
                input_path = file_obj
            elif hasattr(file_obj, "read"):
                # File-like upload: copy it into a private temp directory.
                # Only the base name is used: joining an absolute ``name``
                # would discard the temp directory and make the write below
                # truncate the original file.
                temp_dir = tempfile.mkdtemp()
                base_name = os.path.basename(str(getattr(file_obj, "name", "")))
                input_path = os.path.join(temp_dir, base_name or "uploaded_file")
                with open(input_path, "wb") as f:
                    f.write(file_obj.read())
            else:
                # Wrapper object that only carries the path in ``name``.
                input_path = file_obj.name
        elif url_input and url_input.strip():
            input_path = url_input.strip()
        else:
            return "Please provide either a file upload or a URL", None, None

        # Get images from the input file.
        images = load_input_resource(input_path)
        if not images:
            return "No images could be extracted from the provided file or URL", None, None
        # PIL opens images lazily; read the pixel data now so the temp copy
        # can be removed safely once processing is done.
        for image in images:
            image.load()

        # Load the model.
        model, processor, config = load_model()

        # Set up the prompt.
        prompt = "Convert this page to docling."
        formatted_prompt = apply_chat_template(processor, config, prompt, num_images=1)

        # Process each page image and collect the generated DocTags.
        all_outputs = []
        log_parts = []
        for page_number, image in enumerate(images, start=1):
            log_parts.append(f"Processing page {page_number}/{len(images)}...\n\n")
            log_parts.append("DocTags:\n\n")
            pieces = []
            tail = ""
            for token in stream_generate(
                model, processor, formatted_prompt, [image], max_tokens=4096, verbose=False
            ):
                pieces.append(token.text)
                # The closing tag may be split across several streamed
                # tokens, so look for it in a sliding window over the end of
                # the output rather than in the current token alone.
                tail = tail[-(len(end_tag) - 1):] + token.text
                if end_tag in tail:
                    break
            output = "".join(pieces)
            all_outputs.append(output)
            log_parts.append(output + "\n\n")
        processing_log = "".join(log_parts)

        # Create the DoclingDocument.
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(all_outputs, images)
        doc = DoclingDocument(name="ProcessedDocument")
        doc.load_from_doctags(doctags_doc)

        # Export based on the selected format.
        if export_format == "Markdown":
            result = doc.export_to_markdown()
        elif export_format == "HTML":
            # Write into a throwaway directory so no temp file is left behind.
            with tempfile.TemporaryDirectory() as html_dir:
                html_path = Path(html_dir) / "document.html"
                doc.save_as_html(html_path, image_mode=ImageRefMode.EMBEDDED)
                result = html_path.read_text(encoding="utf-8")
        elif export_format == "JSON":
            doc_dict = doc.export_to_dict()
            result = json.dumps(doc_dict, indent=4)
        else:
            result = "Invalid export format selected"

        # Return the first image as a preview, plus the processing log.
        return result, images[0], processing_log
    except Exception as e:
        # UI boundary: report the failure in the output panes instead of
        # letting the exception propagate out of the callback.
        error_details = traceback.format_exc()
        return f"Error processing document: {str(e)}\n\nDetails:\n{error_details}", None, error_details
    finally:
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
4. Render output
def render_output(result, export_format):
    """Build visibility/value updates for the three formatted-output views.

    Returns a 3-tuple of updates in the order (markdown view, HTML view,
    JSON view). Only the view matching ``export_format`` is made visible and
    given a value; for an unrecognised format all three stay hidden.
    """
    # Start with every view hidden, then reveal the one that applies.
    views = [gr.update(visible=False) for _ in range(3)]

    if export_format == "Markdown":
        views[0] = gr.update(value=result, visible=True)
    elif export_format == "HTML":
        views[1] = gr.update(value=result, visible=True)
    elif export_format == "JSON":
        # Parse the text into an object so the JSON view receives structured
        # data rather than a plain string.
        try:
            parsed = json.loads(result)
        except Exception as exc:
            parsed = {"error": "Invalid JSON", "detail": str(exc)}
        views[2] = gr.update(value=parsed, visible=True)

    return tuple(views)
def prepare_download(result, export_format):
    """Write the processed output to a temp file for the download buttons.

    Args:
        result: The exported document text.
        export_format: ``"Markdown"``, ``"HTML"`` or ``"JSON"``; any other
            value falls back to a ``.txt`` extension.

    Returns:
        Two updates, one per download button, both pointing at the same
        temporary file.
    """
    extensions = {"Markdown": ".md", "HTML": ".html", "JSON": ".json"}
    ext = extensions.get(export_format, ".txt")

    # delete=False keeps the file on disk after the handle is closed. The
    # ``with`` block closes the handle even if encoding or writing fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
        temp_file.write(result.encode("utf-8"))

    # Return update objects for the download buttons.
    return gr.update(value=temp_file.name), gr.update(value=temp_file.name)
5. Gradio UI for One-Click Use
# Create the Gradio interface
with gr.Blocks(title="SmolDocling Document Processing") as app:
    # Add custom CSS for border styling in the output sections.
    gr.HTML(
        """
<style>
#raw_output_box, #formatted_output_box {
border: 1px solid #ccc;
padding: 10px;
border-radius: 5px;
}
</style>
"""
    )
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload PDF or Image")
            url_input = gr.Textbox(label="Or enter a URL to a PDF or Image")
            export_format = gr.Radio(
                choices=["Markdown", "HTML", "JSON"],
                label="Export Format",
                value="Markdown"
            )
            submit_button = gr.Button("Process Document", variant="primary")
        with gr.Column(scale=2):
            with gr.Tab("Raw Output"):
                with gr.Column(elem_id="raw_output_box"):
                    # Display the raw output in a code block. The language
                    # starts out matching the default export format and is
                    # updated in the click chain below. Comparing the Radio
                    # component itself to a string while the layout is being
                    # built is always False, so it cannot be chosen here.
                    output_text = gr.Code(label="Structured Output", language="markdown", lines=20, max_lines=20)
                    download_raw = gr.DownloadButton("Download Raw Output")
            with gr.Tab("Document Preview"):
                preview_image = gr.Image(label="Document Preview", type="pil")
            with gr.Tab("Log"):
                # Display the log in a code block.
                log_output = gr.Code(label="Processing Log", language="html", lines=20, max_lines=20)
            with gr.Tab("Formatted Output"):
                with gr.Column(elem_id="formatted_output_box"):
                    rendered_markdown = gr.Markdown(visible=False, label="Markdown Render")
                    rendered_html = gr.HTML(visible=False, label="HTML Render")
                    rendered_json = gr.JSON(visible=False, label="JSON Render")
                    download_formatted = gr.DownloadButton("Download Formatted Output")
    # Set up event handlers with chained callbacks:
    submit_button.click(
        process_document,
        inputs=[file_input, url_input, export_format],
        outputs=[output_text, preview_image, log_output]
    ).then(
        # Switch the syntax highlighting of the raw output to the format that
        # was just produced ("Markdown" -> "markdown", "HTML" -> "html", ...).
        lambda selected_format: gr.update(language=selected_format.lower()),
        inputs=[export_format],
        outputs=[output_text]
    ).then(
        render_output,
        inputs=[output_text, export_format],
        outputs=[rendered_markdown, rendered_html, rendered_json]
    ).then(
        prepare_download,
        inputs=[output_text, export_format],
        outputs=[download_raw, download_formatted]
    )

if __name__ == "__main__":
    app.launch()
Running the App
1. Run the app with the following command:
uv run main.py
The app will run at http://127.0.0.1:7860/
2. Use any of the example images here for testing:
You can preview the converted document in the Formatted Output tab.
Also, if you want to check the DocTags that were generated, you will find them in the Log tab.
Complete Code
You can find the complete code at my Github repository:
https://github.com/bibekplus/SmolDocling-Document-Processor
Final Thoughts
No OCR. No layout guessing. Just clean structure, fast processing, and lightweight models that run beautifully on Apple Silicon.
If your workflow involves:
- Academic papers
- Business reports
- Form digitization
- Document-based LLM agents
Give SmolDocling a try. :-)