In this sample we will use the Cloudmersive Document AI API to intelligently extract all fields from an input document and save the results to JSON in the same folder.
Prior to running the script, install the SDK:
pip install cloudmersive-documentai-api-client requests
Then configure the variables INPUT_FILE_PATH, API_BASE_PATH, and API_KEY appropriately.
"""
Cloudmersive Document AI - Advanced Extract All Fields
- Prints results to console
- Saves JSON output alongside the input file (same name, .json)
- Shows status/spinner while waiting
Install:
pip install cloudmersive-documentai-api-client
"""
import base64
import json
import os
import sys
import threading
import time
from pprint import pprint
import cloudmersive_documentai_api_client
from cloudmersive_documentai_api_client.rest import ApiException
# =========================
# USER CONFIG (edit these)
# =========================

# Base URL of the Document AI endpoint.
# Example: "https://api.cloudmersive.com" (default Cloudmersive public cloud)
# Or your Private Cloud endpoint base URL (e.g., "https://your-privatecloud.example.com")
API_BASE_PATH = "https://api.cloudmersive.com"

# Your Cloudmersive API key (replace this placeholder before running)
API_KEY = "YOUR-API-KEY"

# Full local path to the document you want to process.
# The JSON result is written next to it with the same base name and a .json extension.
INPUT_FILE_PATH = r"C:\Users\input.pdf"

# Optional parameters
RECOGNITION_MODE = "Advanced"  # "Advanced" (default/high accuracy) or "Normal"
# NOTE(review): the "Compatability" spelling is kept exactly as found; confirm the
# accepted values against the API reference before "correcting" it.
PREPROCESSING = "Auto"  # "Auto" (default), "Paged", or "Compatability"
def _spinner(stop_event: threading.Event, prefix: str = "Processing"):
    """Animate a one-line console spinner until ``stop_event`` is set.

    Repeatedly rewrites the current console line on ``sys.stdout`` as
    ``"<prefix>... <frame>"``, cycling the frame through the characters
    ``|``, ``/``, ``-`` and backslash. Once the event is set, the line is
    blanked and the cursor is returned to the start of the line.

    Args:
        stop_event: Event checked before each frame; setting it ends the loop.
        prefix: Text shown in front of the spinner frame.
    """
    frames = ("|", "/", "-", "\\")
    # Each rendered line is "<prefix>... <frame>": the prefix, four characters
    # for "... ", and one frame character. Blank at least that many columns
    # (and never fewer than the previous fixed 60) so that a long prefix does
    # not leave residue on the console line.
    clear_width = max(60, len(prefix) + 5)
    tick = 0
    while not stop_event.is_set():
        sys.stdout.write(f"\r{prefix}... {frames[tick % len(frames)]}")
        sys.stdout.flush()
        tick += 1
        # Event.wait acts as an interruptible sleep: it returns as soon as the
        # event is set instead of always waiting out the full 0.15 s tick.
        stop_event.wait(0.15)
    sys.stdout.write("\r" + " " * clear_width + "\r")
    sys.stdout.flush()
def main():
    """Send INPUT_FILE_PATH to Document AI and print and save the JSON result.

    Steps:
      1. Check that the input file exists and derive the output path (same
         location and base name, with a ``.json`` extension).
      2. Configure the API client from API_KEY and API_BASE_PATH.
      3. Run the extraction call on a worker thread while a spinner thread
         animates on the console.
      4. On failure, print the error details and exit with status 1; on
         success, print the JSON and write it to the output path.

    Raises:
        FileNotFoundError: If INPUT_FILE_PATH is not an existing file.
        SystemExit: With code 1 if the extraction call raised an exception.
    """
    if not os.path.isfile(INPUT_FILE_PATH):
        raise FileNotFoundError(f"Input file not found: {INPUT_FILE_PATH}")

    output_json_path = os.path.splitext(INPUT_FILE_PATH)[0] + ".json"

    print(f"Input file: {INPUT_FILE_PATH}")
    print(f"Output JSON: {output_json_path}")
    print(f"API base: {API_BASE_PATH}")
    print(f"Mode: {RECOGNITION_MODE}")
    print(f"Preprocess: {PREPROCESSING}")
    print("Preparing request...")

    configuration = cloudmersive_documentai_api_client.Configuration()
    configuration.api_key["Apikey"] = API_KEY
    configuration.host = API_BASE_PATH
    api_client = cloudmersive_documentai_api_client.ApiClient(configuration)
    api_instance = cloudmersive_documentai_api_client.ExtractApi(api_client)

    stop_event = threading.Event()
    spinner_thread = threading.Thread(
        target=_spinner,
        args=(stop_event, "Waiting for Document AI result"),
        daemon=True,
    )

    # Filled in by the worker thread: exactly one of the two entries is set.
    result_holder = {"resp": None, "err": None}

    def _worker():
        try:
            # IMPORTANT: pass the FILE PATH (string), not an open file handle
            resp = api_instance.extract_all_fields_and_tables(
                recognition_mode=RECOGNITION_MODE,
                preprocessing=PREPROCESSING,
                input_file=INPUT_FILE_PATH,
            )
            result_holder["resp"] = resp
        except Exception as e:
            # Deliberately broad: an exception raised in a thread would not
            # reach the main thread, so capture it and report it there.
            result_holder["err"] = e

    worker_thread = threading.Thread(target=_worker, daemon=True)

    print("Calling Document AI (Extract All Fields and Tables)...")
    spinner_thread.start()
    try:
        worker_thread.start()
        worker_thread.join()
    finally:
        # Always stop the spinner, even if the wait is interrupted (for
        # example by Ctrl+C), so it does not keep overwriting the console
        # line while the error is being reported.
        stop_event.set()
        spinner_thread.join()

    if result_holder["err"] is not None:
        err = result_holder["err"]
        if isinstance(err, ApiException):
            print("\nAPI Exception:")
            print(f"Status: {err.status}")
            print(f"Reason: {err.reason}")
            print(f"Body:\n{err.body}")
        else:
            print("\nError:")
            print(repr(err))
        sys.exit(1)

    resp = result_holder["resp"]
    # Convert the SDK response object into plain data before handing it to json.
    resp_json_obj = api_client.sanitize_for_serialization(resp)

    print("\n=== Document AI Result (JSON) ===")
    print(json.dumps(resp_json_obj, indent=2, ensure_ascii=False))

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(resp_json_obj, f, indent=2, ensure_ascii=False)
    print(f"\nSaved JSON to: {output_json_path}")


# Run the sample only when this file is executed as a script.
if __name__ == "__main__":
    main()