%pip install deep-translator transformers sentencepiece sacremoses anthropic -q

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/42.3 kB ? eta -:--:--
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.3/42.3 kB 966.2 kB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 897.5/897.5 kB 15.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 455.2/455.2 kB 14.6 MB/s eta 0:00:00

# Language pair used by every cell below, as ISO 639-1 codes
# (e.g., "de", "fr", "es", "zh", "ar", "ja"). Change these to match your
# own source and target languages.
source_lang = "de"
target_lang = "en"

# Passage translated throughout the notebook. The .strip() removes the
# leading and trailing newline that the triple-quoted layout introduces.
sample_text = """
Noch bei der höchstvollendeten Reproduktion fällt eines aus: das Hier
und Jetzt des Kunstwerks – sein einmaliges Dasein an dem Orte, an dem
es sich befindet. An diesem einmaligen Dasein aber und an nichts sonst
vollzog sich die Geschichte, der es im Laufe seines Bestehens
unterworfen gewesen ist.
""".strip()

# Echo the configuration and the passage so the cell output records
# exactly what is being translated.
print(f"Source language: {source_lang}")
print(f"Target language: {target_lang}")
print(f"\nText ({len(sample_text)} characters):\n")
print(sample_text)

Source language: de
Target language: en

Text (297 characters):

Noch bei der höchstvollendeten Reproduktion fällt eines aus: das Hier
und Jetzt des Kunstwerks – sein einmaliges Dasein an dem Orte, an dem
es sich befindet. An diesem einmaligen Dasein aber und an nichts sonst
vollzog sich die Geschichte, der es im Laufe seines Bestehens
unterworfen gewesen ist.

from deep_translator import GoogleTranslator

def translate_google(text, source="de", target="en"):
    """
    Translate text using Google Translate (free, no API key required).

    Short texts are sent in a single request. Longer texts are split on
    blank-line paragraph boundaries and translated one paragraph at a time.
    A paragraph that is itself too long for one request is further split at
    whitespace, so no single request exceeds the size limit.

    Args:
        text: The text to translate.
        source: Source language code passed to GoogleTranslator.
        target: Target language code passed to GoogleTranslator.

    Returns:
        The translated text. When the input had to be split, translated
        paragraphs are re-joined with blank lines; the pieces of an
        oversized paragraph are re-joined with single spaces (line breaks
        inside such a paragraph are not preserved).
    """
    import textwrap

    # Google Translate has a 5000-char limit per request. Stay below it with
    # a margin (the same 4500 that translate_long_text_google defaults to).
    max_chars = 4500

    translator = GoogleTranslator(source=source, target=target)

    if len(text) <= max_chars:
        return translator.translate(text)

    translated_paragraphs = []
    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        if len(para) <= max_chars:
            pieces = [para]
        else:
            # A single paragraph over the limit: break it at whitespace
            # (or mid-run for text without spaces) into request-sized pieces.
            pieces = textwrap.wrap(para, width=max_chars)
        translated_paragraphs.append(
            " ".join(translator.translate(piece) for piece in pieces)
        )
    return "\n\n".join(translated_paragraphs)


# Run the sample passage through Google Translate with the configured pair.
google_result = translate_google(sample_text, source=source_lang, target=target_lang)

# Show the translation under a title framed by two 60-character rules.
print("=" * 60, "GOOGLE TRANSLATE RESULT", "=" * 60, google_result, sep="\n")

============================================================
GOOGLE TRANSLATE RESULT
============================================================
Even with the most perfect reproduction, one thing is missing: this
and now of the work of art - its unique existence in the place where
it is located. But in this unique existence and nothing else
the history of it took place over the course of its existence
has been subjected.

from deep_translator import GoogleTranslator, MyMemoryTranslator

# MyMemory expects locale-style codes (e.g. "de-DE") instead of bare ISO
# codes. The table below covers common research languages; extend it if
# your language is missing.
MYMEMORY_CODES = {
    "de": "de-DE",
    "en": "en-GB",
    "fr": "fr-FR",
    "es": "es-ES",
    "it": "it-IT",
    "pt": "pt-PT",
    "nl": "nl-NL",
    "ru": "ru-RU",
    "zh": "zh-CN",
    "ja": "ja-JP",
    "ko": "ko-KR",
    "ar": "ar-SA",
    "he": "he-IL",
    "el": "el-GR",
    "la": "la-VA",
    "pl": "pl-PL",
    "tr": "tr-TR",
    "uk": "uk-UA",
    "cs": "cs-CZ",
    "sv": "sv-SE",
}

# Codes without a mapping are passed through unchanged.
mm_source, mm_target = (
    MYMEMORY_CODES.get(code, code) for code in (source_lang, target_lang)
)

# One translator object per engine, keyed by the label shown in the output.
engines = {
    "Google Translate": GoogleTranslator(source=source_lang, target=target_lang),
    "MyMemory": MyMemoryTranslator(source=mm_source, target=mm_target),
}

# Optional: add DeepL if you have a free API key (https://www.deepl.com/pro-api)
# from deep_translator import DeeplTranslator
# engines["DeepL"] = DeeplTranslator(
#     source=source_lang, target=target_lang,
#     api_key="YOUR_DEEPL_KEY"
# )

print("Comparing translation engines on the same passage:\n")
for name, engine in engines.items():
    # A failure in one engine is reported inline so the remaining engines
    # still get compared.
    try:
        result = engine.translate(sample_text)
        print(f"--- {name} ---", result, "", sep="\n")
    except Exception as e:
        print(f"--- {name} --- ERROR: {e}\n")

Comparing translation engines on the same passage:

--- Google Translate ---
Even with the most perfect reproduction, one thing is missing: this
and now of the work of art - its unique existence in the place where
it is located. But in this unique existence and nothing else
the history of it took place over the course of its existence
has been subjected.

--- MyMemory ---
Even with the most perfect reproduction, one thing is missing: the here
and now of the work of art – its unique existence in the place where
but in this unique existence and in nothing else
the story unfolded, which in the course of its existence
has been subjected to.

from transformers import MarianMTModel, MarianTokenizer

def translate_marian(text, source="de", target="en"):
    """
    Translate using Helsinki-NLP's MarianMT models (runs locally).
    The model is downloaded once and cached.

    The text is flattened to one line, split into sentences, and each
    sentence is translated on its own.

    Args:
        text: The text to translate.
        source: Source language code used in the model name.
        target: Target language code used in the model name.

    Returns:
        The translated sentences joined with single spaces, or "" when the
        input contains no text (the model is not loaded in that case).
    """
    import re

    # MarianMT works best on sentence-level input. Collapse line breaks, then
    # split after ".", "!" or "?" only when whitespace follows. This keeps
    # decimals ("3.5") and ellipses intact and leaves each sentence with its
    # own terminator instead of forcing a "." onto it. It is still a
    # heuristic: abbreviations followed by a space will be split.
    flat_text = " ".join(text.split())
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", flat_text) if s]
    if not sentences:
        return ""

    model_name = f"Helsinki-NLP/opus-mt-{source}-{target}"
    print(f"Loading model: {model_name}")

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    translated_sentences = []
    for sent in sentences:
        # truncation=True asks the tokenizer to cut over-long input rather
        # than fail on it.
        inputs = tokenizer(sent, return_tensors="pt", padding=True, truncation=True)
        translated_ids = model.generate(**inputs)
        translated_sentences.append(
            tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        )

    return " ".join(translated_sentences)


# Translate the sample passage locally with the MarianMT model for this pair.
marian_result = translate_marian(sample_text, source=source_lang, target=target_lang)

# Leading newline separates the banner from the model-loading messages above.
print("\n" + "=" * 60, "MARIANMT RESULT", "=" * 60, marian_result, sep="\n")

Loading model: Helsinki-NLP/opus-mt-de-en

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

The tied weights mapping and config for this model specifies to tie model.shared.weight to model.decoder.embed_tokens.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
The tied weights mapping and config for this model specifies to tie model.shared.weight to model.encoder.embed_tokens.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning

from huggingface_hub import list_models

def find_translation_models(source="de", target="en"):
    """
    Search Hugging Face for available translation models for a language pair.

    Queries Helsinki-NLP models matching "opus-mt-{source}-{target}" and
    prints the ids that were found.

    Args:
        source: Source language code.
        target: Target language code.

    Returns:
        The list of matching model entries, with the exact
        "Helsinki-NLP/opus-mt-{source}-{target}" model first when present.
        The model for the opposite direction is left out.
    """
    search_term = f"opus-mt-{source}-{target}"
    exact_id = f"Helsinki-NLP/{search_term}"
    reverse_id = f"Helsinki-NLP/opus-mt-{target}-{source}"

    # The Hub search is not an exact match: searching "opus-mt-de-en" also
    # returns "opus-mt-en-de". Drop that reverse-direction model so it is not
    # reported as a source → target model.
    models = [
        m
        for m in list_models(search=search_term, author="Helsinki-NLP")
        if m.id == exact_id or m.id != reverse_id
    ]
    # Stable sort: the exact pair model first, everything else in Hub order.
    models.sort(key=lambda m: m.id != exact_id)

    if models:
        print(f"Found {len(models)} model(s) for {source} → {target}:")
        for m in models:
            print(f"  • {m.id}")
    else:
        # Sometimes models use language group codes (e.g., "roa" for Romance)
        print(f"No exact model for {source} → {target}.")
        print("Try browsing: https://huggingface.co/Helsinki-NLP")
    return models

# Example: check what's available for your pair.
# The function prints the model ids itself; the returned list is bound to _
# because it is not needed here.
_ = find_translation_models(source_lang, target_lang)

Found 59 model(s) for de → en:
  • Helsinki-NLP/opus-mt-de-en
  • Helsinki-NLP/opus-mt-en-de
  • Helsinki-NLP/opus-mt-tc-bible-big-ine-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-bnt-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-afa-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-afa-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-alv-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-bat-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-bat-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-cel-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-aav
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-afa
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-bnt
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-fiu
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-gem
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-gmq
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-gmw
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-iir
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-inc
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-ine
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-itc
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mkh
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-pqw
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-sem
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-sla
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-trk
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-urj
  • Helsinki-NLP/opus-mt-tc-bible-big-dra-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-fiu-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-gem-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-gmq-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-gmw-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-gmw-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-iir-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-inc-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-inc-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-ine-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-ira-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-itc-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-itc-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-mkh-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-phi-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-poz-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-pqe-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-pqw-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-sem-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-sem-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-sla-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-tai-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-trk-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-urj-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-urj-deu_eng_nld
  • Helsinki-NLP/opus-mt-tc-bible-big-zhx-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa
  • Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul

import getpass
import os
from google.colab import userdata

# Read the API key from Colab's secret store (google.colab.userdata) unless
# it is already set as an environment variable. Nothing is prompted for and
# the key is never printed.
if not os.environ.get("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = userdata.get('ANTHROPIC_API_KEY')

import anthropic

# Build the client from the environment variable so that a key which was
# already set there is honoured, instead of querying the secret store again.
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# English names for the language codes, used to phrase the LLM prompts.
# Covers the same codes as MYMEMORY_CODES so that every supported pair gets a
# readable name in the prompt instead of a bare code.
LANG_NAMES = {
    "de": "German", "fr": "French", "es": "Spanish", "it": "Italian",
    "pt": "Portuguese", "nl": "Dutch", "ru": "Russian", "zh": "Chinese",
    "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "he": "Hebrew",
    "la": "Latin", "el": "Greek", "en": "English", "pl": "Polish",
    "tr": "Turkish", "uk": "Ukrainian", "cs": "Czech", "sv": "Swedish",
}
# Human-readable names of the configured languages for use in prompts.
# A code that is missing from LANG_NAMES falls back to the code itself.
source_name = LANG_NAMES.get(source_lang, source_lang)
target_name = LANG_NAMES.get(target_lang, target_lang)


def llm_translate(text, source_name, target_name, system_prompt=None,
                  model="claude-opus-4-6", max_tokens=2048):
    """
    Send a translation request to Claude and return the response text.

    Uses the module-level `client`.

    Args:
        text: The passage to translate; sent as the user message.
        source_name: Name of the source language, used in the default prompt.
        target_name: Name of the target language, used in the default prompt.
        system_prompt: Optional system prompt. When None, a default
            scholarly-translation prompt is built from the language names.
        model: Model identifier passed to the API.
        max_tokens: Upper bound on the length of the response.

    Returns:
        The text of the response (all text blocks concatenated), or "" when
        `text` is blank, in which case no request is made.
    """
    # Nothing to translate: skip the API call entirely.
    if not text.strip():
        return ""

    if system_prompt is None:
        system_prompt = (
            f"You are an expert translator specializing in academic and "
            f"scholarly texts. Translate the following {source_name} text "
            f"into clear, accurate {target_name}. Preserve the author's "
            f"tone and register. Output only the translation."
        )

    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=system_prompt,
        messages=[{"role": "user", "content": text}],
    )

    # Make truncation visible: a response that stopped because it ran out of
    # tokens is an incomplete translation.
    if response.stop_reason == "max_tokens":
        print(
            f"Warning: response reached the max_tokens limit ({max_tokens}); "
            "the translation is probably cut off."
        )

    # Join every text block rather than assuming exactly one block exists.
    return "".join(block.text for block in response.content if block.type == "text")


# Translate the sample passage with Claude using the default system prompt.
llm_result = llm_translate(sample_text, source_name, target_name)

# Show the translation under a title framed by two 60-character rules.
print("=" * 60, "LLM TRANSLATION (Claude)", "=" * 60, llm_result, sep="\n")

============================================================
LLM TRANSLATION (Claude)
============================================================
Even the most perfect reproduction of a work of art is lacking in one element: its presence in time and space, its unique existence at the place where it happens to be. This unique existence of the work of art determined the history to which it was subject throughout the time of its existence.

# System prompt that asks for four Markdown sections: the translation, a
# glossary of key terms, a short argument summary, and translation notes.
annotated_system_prompt = f"""
You are an expert translator and scholarly reader. Given a passage in
{source_name}, produce a structured response with these sections:

## Translation
A fluent, accurate {target_name} translation that preserves the author's
register and argumentation style.

## Key Terms
A short glossary of important terms in the original that carry specific
philosophical, technical, or disciplinary weight. For each, give:
- The original term
- Your translation choice
- A brief note on why this term matters or what nuance it carries

## Argument Summary
In 2-3 sentences, describe the main claim or move the author is making
in this passage. This helps the reader orient before engaging closely
with the text.

## Translation Notes
Flag any places where the translation required significant interpretive
choices — where a different translation would change the meaning, or
where the original is ambiguous.
"""

# Same passage and language names as before; only the system prompt differs.
annotated_result = llm_translate(
    sample_text, source_name, target_name, system_prompt=annotated_system_prompt
)

# Render the structured response as formatted Markdown in the cell output.
from IPython.display import Markdown, display
display(Markdown(annotated_result))

# System prompt that asks for a two-column Markdown table with one row per
# sentence: original on the left, translation on the right.
parallel_prompt = f"""
You are an expert translator. Given the following {source_name} text,
produce a sentence-by-sentence parallel translation.

For each sentence, output a Markdown table row with the original on the
left and the {target_name} translation on the right.

Start with:
| {source_name} | {target_name} |
|---|---|

Then one row per sentence. Output only the table.
"""

# Same passage and language names as before; only the system prompt differs.
parallel_result = llm_translate(
    sample_text, source_name, target_name, system_prompt=parallel_prompt
)

# Render the returned table as formatted Markdown in the cell output.
display(Markdown(parallel_result))

# Assemble one Markdown document that shows the original passage followed by
# the Google Translate, MarianMT and Claude results computed in earlier cells.
# NOTE(review): only the first line of sample_text carries the ">" quote
# marker, so a passage containing blank lines would presumably be quoted only
# up to its first paragraph — confirm before using multi-paragraph text.
comparison_md = f"""
## Original ({source_name})

> {sample_text}

---

### Google Translate (`deep-translator`)

{google_result}

---

### MarianMT (local open-source model)

{marian_result}

---

### Claude (LLM)

{llm_result}
"""

# Render the comparison as formatted Markdown in the cell output.
display(Markdown(comparison_md))

def chunk_paragraphs(text, chunk_size):
    """
    Split text into chunks of at most chunk_size characters.

    Paragraphs (separated by blank lines) are packed together, joined by a
    blank line, for as long as the chunk stays within chunk_size. A single
    paragraph longer than chunk_size is broken at whitespace into pieces
    that fit, each of which becomes its own chunk; line breaks inside such a
    paragraph are replaced by spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk (at least 1).

    Returns:
        A list of chunk strings; empty when the text contains nothing.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    import textwrap

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    chunks = []
    current_chunk = ""
    for para in text.split("\n\n"):
        if len(para) > chunk_size:
            # This paragraph can never fit in one chunk: flush what has been
            # collected, then emit the paragraph as several fitting pieces.
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.extend(textwrap.wrap(para, width=chunk_size))
        elif len(current_chunk) + len(para) + 2 > chunk_size:
            # The +2 accounts for the blank-line separator.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = para
        else:
            current_chunk = current_chunk + "\n\n" + para if current_chunk else para
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def translate_long_text_google(text, source="de", target="en", chunk_size=4500):
    """
    Translate a long text using Google Translate, respecting the
    character limit by splitting on paragraph boundaries.

    Args:
        text: The text to translate.
        source: Source language code passed to GoogleTranslator.
        target: Target language code passed to GoogleTranslator.
        chunk_size: Maximum number of characters sent per request.

    Returns:
        The translated chunks joined with blank lines. The pieces of a
        paragraph that exceeded chunk_size are therefore separated by blank
        lines as well.
    """
    translator = GoogleTranslator(source=source, target=target)
    chunks = chunk_paragraphs(text, chunk_size)

    print(f"Translating {len(chunks)} chunk(s)...")
    translated_chunks = []
    for i, chunk in enumerate(chunks):
        print(f"  Chunk {i+1}/{len(chunks)} ({len(chunk)} chars)")
        translated_chunks.append(translator.translate(chunk))

    return "\n\n".join(translated_chunks)


def translate_long_text_llm(text, source_name, target_name, chunk_size=3000):
    """
    Translate a long text using Claude, chunking by paragraph groups.
    Uses a lower chunk size to leave room for the system prompt and response.

    Args:
        text: The text to translate.
        source_name: Name of the source language, passed to llm_translate.
        target_name: Name of the target language, passed to llm_translate.
        chunk_size: Maximum number of characters sent per request.

    Returns:
        The translated chunks joined with blank lines. The pieces of a
        paragraph that exceeded chunk_size are therefore separated by blank
        lines as well.
    """
    chunks = chunk_paragraphs(text, chunk_size)

    print(f"Translating {len(chunks)} chunk(s) with Claude...")
    translated_chunks = []
    for i, chunk in enumerate(chunks):
        print(f"  Chunk {i+1}/{len(chunks)} ({len(chunk)} chars)")
        result = llm_translate(chunk, source_name, target_name)
        translated_chunks.append(result)

    return "\n\n".join(translated_chunks)


# Example: upload a .txt file in Colab and translate it
# from google.colab import files
# uploaded = files.upload()  # upload a .txt file
# filename = list(uploaded.keys())[0]
# with open(filename, "r", encoding="utf-8") as f:
#     long_text = f.read()
# result = translate_long_text_llm(long_text, source_name, target_name)
# print(result)

print("Functions defined. Uncomment the upload block above to translate your own files.")

Functions defined. Uncomment the upload block above to translate your own files.

# Option: Mount Google Drive and load a text file
# from google.colab import drive
# drive.mount('/content/drive')
# with open('/content/drive/MyDrive/your_file.txt', 'r', encoding='utf-8') as f:
#     your_text = f.read()
# print(f"Loaded {len(your_text)} characters")

Approach	Best For	Tradeoffs
1. `deep-translator`	Quick paragraph-level translation via Google Translate, DeepL, etc.	Fast and free; no context awareness
2. Open-source models (MarianMT)	Offline/reproducible translation with no API keys	Runs locally; variable quality by language pair
3. LLM-based translation (Claude)	Context-aware, scholarly translation with annotations	Highest quality for nuanced text; requires API key

Scenario	Recommended Approach
Quick look at what a passage says	Google Translate via `deep-translator`
Translating a large corpus for distant reading	MarianMT (fast, free, reproducible)
Understanding a difficult theoretical argument	Claude with annotated prompt
Comparing how key terms are rendered	Claude parallel display + key terms
You need the translation to be citable / fixed	MarianMT (deterministic output)
Working offline or with sensitive materials	MarianMT (nothing leaves your machine)

Translating Research Texts: Beyond Copy-Paste¶

Setup¶

Our Sample Text¶

Approach 1: `deep-translator` (Google Translate wrapper)¶

Comparing multiple engines¶

Approach 2: Open-Source Models (Helsinki-NLP / MarianMT)¶

Finding available language pairs¶

Approach 3: LLM-Based Translation (Claude)¶

3a. Straightforward translation¶

3b. Annotated scholarly translation¶

Translation¶

Key Terms¶

Argument Summary¶

Translation Notes¶

3c. Bilingual parallel display¶

Side-by-Side Comparison¶

Original (German)¶

Google Translate (`deep-translator`)¶

MarianMT (local open-source model)¶

Claude (LLM)¶

Working With Longer Texts¶

Bringing Your Own Text¶

Summary: When to Use What¶

German	English
Noch bei der höchstvollendeten Reproduktion fällt eines aus: das Hier und Jetzt des Kunstwerks – sein einmaliges Dasein an dem Orte, an dem es sich befindet.	Even in the most perfect reproduction, one thing falls away: the here and now of the work of art – its unique existence in the place where it happens to be.
An diesem einmaligen Dasein aber und an nichts sonst vollzog sich die Geschichte, der es im Laufe seines Bestehens unterworfen gewesen ist.	Yet it is to this unique existence – and to nothing else – that the history to which the work of art has been subject in the course of its existence was enacted upon.

Translating Research Texts: Beyond Copy-Paste¶

Setup¶

Our Sample Text¶

Approach 1: deep-translator (Google Translate wrapper)¶

Comparing multiple engines¶

Approach 2: Open-Source Models (Helsinki-NLP / MarianMT)¶

Finding available language pairs¶

Approach 3: LLM-Based Translation (Claude)¶

3a. Straightforward translation¶

3b. Annotated scholarly translation¶

Translation¶

Key Terms¶

Argument Summary¶

Translation Notes¶

3c. Bilingual parallel display¶

Side-by-Side Comparison¶

Original (German)¶

Google Translate (deep-translator)¶

MarianMT (local open-source model)¶

Claude (LLM)¶

Working With Longer Texts¶

Bringing Your Own Text¶

Summary: When to Use What¶

Approach 1: `deep-translator` (Google Translate wrapper)¶

Google Translate (`deep-translator`)¶