@@ -65,11 +65,7 @@
}
},
"outputs": [],
"source": [
"with open(\"war_and_peace_excerpt.txt\") as file:\n",
" war_and_peace = file.read()\n",
" print(war_and_peace)"
]
"source": "# there is no need to run the cell loading the war-and-peace-excerpt.txt file if the previous cell, which directly assigns the text to the war_and_peace variable without downloading a file, has already been executed\nwith open(\"war-and-peace-excerpt.txt\") as file:\n war_and_peace = file.read()\n print(war_and_peace)"
},
{
"cell_type": "code",
@@ -189,21 +185,7 @@
}
},
"outputs": [],
"source": [
"# downloading our multilingual sentence tokenizer\n",
"python -m spacy download xx_sent_ud_sm\n",
"\n",
"# loading the multilingual sentence tokenizer we just downloaded\n",
"nlp = spacy.load(\"xx_sent_ud_sm\")\n",
"# applying the spaCy model to our text variable\n",
"doc = nlp(cleaned_war_and_peace)\n",
"\n",
"# assigning the tokenized sentences to a list so it's easier for us to manipulate them later\n",
"spacy_sentences = list(doc.sents)\n",
"\n",
"# printing the sentences to our console\n",
"print(spacy_sentences)"
]
"source": "# downloading our multilingual sentence tokenizer\n!python -m spacy download xx_sent_ud_sm\n\n# loading the multilingual sentence tokenizer we just downloaded\nnlp = spacy.load(\"xx_sent_ud_sm\")\n# applying the spaCy model to our text variable\ndoc = nlp(cleaned_war_and_peace)\n\n# assigning the tokenized sentences to a list so it's easier for us to manipulate them later\nspacy_sentences = list(doc.sents)\n\n# printing the sentences to our console\nprint(spacy_sentences)"
},
{
"cell_type": "code",
@@ -321,32 +303,7 @@
}
},
"outputs": [],
"source": [
"# first, we install the spacy_langdetect package from the Python Package Index\n",
"pip install spacy_langdetect\n",
"\n",
"# then we import it and use it to detect our languages\n",
"from spacy.language import Language\n",
"from spacy_langdetect import LanguageDetector\n",
"\n",
"# setting up our language detector to work with spaCy\n",
"# def get_lang_detector(nlp, name):\n",
"# return LanguageDetector()\n",
"\n",
"# setting up our pipeline\n",
"Language.factory(\"language_detector\")\n",
"nlp.add_pipe('language_detector', last=True)\n",
"\n",
"# running the language detection on each sentence and printing the results\n",
"rus_doc = nlp(spacy_rus_sent)\n",
"print(rus_doc._.language)\n",
"\n",
"fre_doc = nlp(spacy_fre_sent)\n",
"print(fre_doc._.language)\n",
"\n",
"multi_doc = nlp(spacy_multi_sent)\n",
"print(multi_doc._.language)"
]
"source": "# First, install the spacy_langdetect package from the Python Package Index.\n!pip install spacy_langdetect\n\n# Then, import it and use it to detect our languages.\nfrom spacy.language import Language\nfrom spacy_langdetect import LanguageDetector\n\nnlp = spacy.load(\"xx_sent_ud_sm\")\n\n# Create the language detector function\n@Language.factory(\"language_detector\")\ndef create_language_detector(nlp, name):\n return LanguageDetector()\n\n# add the tool to our pipeline\nnlp.add_pipe('language_detector', last=True)\n\n# running the language detection on each sentence and printing the results\nrus_doc = nlp(spacy_rus_sent)\nprint(rus_doc._.language)\n\nfre_doc = nlp(spacy_fre_sent)\nprint(fre_doc._.language)\n\nmulti_doc = nlp(spacy_multi_sent)\nprint(multi_doc._.language)"
},
{
"cell_type": "code",
@@ -357,22 +314,7 @@
}
},
"outputs": [],
"source": [
"# importing our models required for language detection\n",
"from stanza.models.common.doc import Document\n",
"from stanza.pipeline.core import Pipeline\n",
"\n",
"# setting up our pipeline\n",
"nlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n",
"\n",
"# specifying which sentences to run the detection on, then running the detection code\n",
"docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n",
"docs = [Document([], text=text) for text in docs]\n",
"nlp(docs)\n",
"\n",
"# printing the text of each sentence alongside the language estimates\n",
"print(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))"
]
"source": "# importing our models required for language detection\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\n\n# setting up our pipeline\nnlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n\n# specifying which sentences to run the detection on, then running the detection code\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = [Document([], text=text) for text in docs]\ndocs = nlp(docs)\n\n# printing the text of each sentence alongside the language estimates\nprint(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))"
},
{
"cell_type": "code",
@@ -473,21 +415,7 @@
}
},
"outputs": [],
"source": [
"# downloading our Russian model from spaCy\n",
"python -m spacy download ru_core_news_sm\n",
"\n",
"\n",
"# loading the model\n",
"nlp = spacy.load(\"ru_core_news_sm\")\n",
"\n",
"# applying the model\n",
"doc = nlp(spacy_rus_sent)\n",
"\n",
"# printing the text of each word and its POS tag\n",
"for token in doc:\n",
" print(token.text, token.pos_)"
]
"source": "# downloading our Russian model from spaCy\n!python -m spacy download ru_core_news_sm\n\n\n# loading the model\nnlp = spacy.load(\"ru_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_rus_sent)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)"
},
{
"cell_type": "code",
@@ -498,21 +426,7 @@
}
},
"outputs": [],
"source": [
"# downloading our French model from spaCy\n",
"python -m spacy download fr_core_news_sm\n",
"\n",
"\n",
"# loading the corpus\n",
"nlp = spacy.load(\"fr_core_news_sm\")\n",
"\n",
"# applying the model\n",
"doc = nlp(spacy_fre_sent)\n",
"\n",
"# printing the text of each word and its POS tag\n",
"for token in doc:\n",
" print(token.text, token.pos_)"
]
"source": "# downloading our French model from spaCy\n!python -m spacy download fr_core_news_sm\n\n\n# loading the corpus\nnlp = spacy.load(\"fr_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_fre_sent)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)"
},
{
"cell_type": "code",
@@ -584,22 +498,7 @@
}
},
"outputs": [],
"source": [
"# loading and applying the model\n",
"nlp = spacy.load(\"ru_core_news_sm\")\n",
"doc = nlp(cyr_no_extra_space)\n",
"\n",
"# printing the text of each word and its POS tag\n",
"for token in doc:\n",
" print(token.text, token.pos_)\n",
"\n",
"# and doing the same with our French sentence\n",
"nlp = spacy.load(\"fr_core_news_sm\")\n",
"doc = nlp(lat_no_extra_space)\n",
"for token in doc:\n",
" print(token.text, token.pos_)\n",
"```"
]
"source": "# loading and applying the model\nnlp = spacy.load(\"ru_core_news_sm\")\ndoc = nlp(cyr_no_extra_space)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)\n\n# and doing the same with our French sentence\nnlp = spacy.load(\"fr_core_news_sm\")\ndoc = nlp(lat_no_extra_space)\nfor token in doc:\n print(token.text, token.pos_)"
},
{
"cell_type": "code",
@@ -646,20 +545,7 @@
}
},
"outputs": [],
"source": [
"# imports so we can use Stanza's MultilingualPipeline\n",
"from stanza.models.common.doc import Document\n",
"from stanza.pipeline.core import Pipeline\n",
"from stanza.pipeline.multilingual import MultilingualPipeline\n",
"\n",
"# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\n",
"nlp = MultilingualPipeline(processors='tokenize,pos')\n",
"docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n",
"nlp(docs)\n",
"\n",
"# printing the results\n",
"print(*[f'word: {word.text}\\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\\n')"
]
"source": "# imports so we can use Stanza's MultilingualPipeline\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\nfrom stanza.pipeline.multilingual import MultilingualPipeline\n\n# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\nnlp = MultilingualPipeline(processors='tokenize,pos')\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = nlp(docs)\n\n# printing the results\nprint(*[f'word: {word.text}\\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\\n')"
},
{
"cell_type": "markdown",
@@ -768,4 +654,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
35 changes: 23 additions & 12 deletions en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md
@@ -12,6 +12,8 @@ reviewers:
editors:
- Laura Alice Chapot
review-ticket: https://github.com/programminghistorian/ph-submissions/issues/612
lesson-testers: Émilien Schultz
tested-date: 2026-04-24
difficulty: 2
activity: analyzing
topics: [python, data-manipulation, distant-reading]
@@ -22,6 +24,10 @@ doi: 10.46430/phen0121

{% include toc.html %}

<div class="alert alert-info">>
This lesson has been updated in April 2026 to reflect changes in the libraries used. The code has been adapted for Python 3.12, spaCy 3.8.11 and Stanza 1.11.1 (the original version was based on Python 3.10, spaCy 3.7.4 and Stanza 1.8.2). In particular, the section on language detection using <code>spacy_langdetect</code> has been rewritten to follow the new <code>@Language.factory</code> pattern required by recent versions of spaCy, and the sentence indices used in the spaCy tokenisation examples have been adjusted.
</div>

## Lesson Goals

Many of the resources available for learning computational methods of text analysis focus on English-language texts and corpora, and often lack the information which is needed to work with non-English source material. To help remedy this, this lesson will provide an introduction to analyzing non-English and multilingual text (that is, text written in more than one language) using Python. Using a multilingual text composed of Russian and French, this lesson will show how you can use computational methods to perform three fundamental preprocessing tasks: tokenization, part-of-speech tagging, and lemmatization. Then, it will teach you to automatically detect the languages present in a preprocessed text.
@@ -34,7 +40,7 @@ To perform the three fundamental preprocessing steps, this lesson uses three com

This lesson is aimed at those who are unfamiliar with text analysis methods, particularly those who wish to apply such methods to multilingual corpora or texts not written in English. While prior knowledge of Python is not required, it will be helpful to understand the structure of the code. Having a basic knowledge of Python syntax and features is recommended – it would be useful, for example, for the reader to have familiarity with importing libraries, constructing functions and loops, and manipulating strings.

Code for this tutorial is written in Python 3.10 and uses the NLTK (v3.8.1), spaCy (v3.7.4), and Stanza (v1.8.2) libraries to perform its text processing. If you are entirely new to Python, [this _Programming Historian_ lesson](/en/lessons/introduction-and-installation) will be helpful to read before completing this lesson.
Code for this tutorial is written in Python 3.12 and uses the NLTK (v3.8.1), spaCy (v3.8.11), and Stanza (v1.11.1) libraries to perform its text processing. If you are entirely new to Python, [this _Programming Historian_ lesson](/en/lessons/introduction-and-installation) will be helpful to read before completing this lesson.
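
If you want to match that environment exactly, a minimal pinned install might look like the following (the pins simply mirror the versions named above; newer releases will usually also work):

```python
# install the three libraries at the versions this lesson was tested against
!pip install nltk==3.8.1 spacy==3.8.11 stanza==1.11.1
```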

## Installation and Setup

@@ -141,7 +147,8 @@ war_and_peace = """
First, let's load our text file so we can use it with our analysis packages. To start, you'll open the file and assign it to the variable named `war_and_peace`, so we can reference it later on. Then, you'll print the contents of the file to make sure it was read correctly. For the purposes of this tutorial, we are using a short excerpt from the novel.

```python
with open("war_and_peace_excerpt.txt") as file:
# There is no need to run this cell if you already ran the previous cell, which assigns the text directly to the war_and_peace variable instead of reading it from a file.
with open("war-and-peace-excerpt.txt") as file:
war_and_peace = file.read()
print(war_and_peace)
```
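
If you chose not to create the file, the `open()` call above will raise a `FileNotFoundError`. As a sketch, you could guard the read so that the text assigned in the previous cell is kept when the file is absent (the message string here is illustrative):

```python
from pathlib import Path

excerpt = Path("war-and-peace-excerpt.txt")
if excerpt.exists():
    war_and_peace = excerpt.read_text()
    print(war_and_peace)
else:
    print("Excerpt file not found; keeping the war_and_peace text assigned above.")
```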
@@ -360,20 +367,24 @@ As we can see, TextCat correctly identified the Russian and French sentences. Si

We'll examine other ways to detect the languages in multilingual sentences after we've performed our sentence classification using spaCy and Stanza.

Let's try spaCy first. First, we install the `spacy_langdetect` package from the Python Package Index:
Let's try spaCy first.

```python
pip install spacy_langdetect
```
# First, install the `spacy_langdetect` package from the Python Package Index.
!pip install spacy_langdetect

Then we import it and use it to detect our languages:

```python
# Then, import it and use it to detect our languages.
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# setting up our pipeline
Language.factory("language_detector")
nlp = spacy.load("xx_sent_ud_sm")

# Create the language detector function
@Language.factory("language_detector")
def create_language_detector(nlp, name):
return LanguageDetector()

# add the tool to our pipeline
nlp.add_pipe('language_detector', last=True)

# running the language detection on each sentence and printing the results
@@ -409,7 +420,7 @@ nlp = Pipeline(lang="multilingual", processors="langid")
# specifying which sentences to run the detection on, then running the detection code
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
docs = [Document([], text=text) for text in docs]
nlp(docs)
docs = nlp(docs)

# printing the text of each sentence alongside the language estimates
print("\n".join(f"{doc.text}\t{doc.lang}" for doc in docs))
@@ -782,7 +793,7 @@ from stanza.pipeline.multilingual import MultilingualPipeline
# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously
nlp = MultilingualPipeline(processors='tokenize,pos')
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
nlp(docs)
docs = nlp(docs)

# printing the results
print(*[f'word: {word.text}\tupos: {word.upos}' for doc in docs for sent in doc.sentences for word in sent.words], sep='\n')
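
# a possible extension (a sketch, assuming lemma models are available for
# these languages): add the lemmatizer processor and read word.lemma as well
# nlp = MultilingualPipeline(processors='tokenize,pos,lemma')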