@@ -65,11 +65,7 @@
}
},
"outputs": [],
"source": [
"with open(\"war_and_peace_excerpt.txt\") as file:\n",
" war_and_peace = file.read()\n",
" print(war_and_peace)"
]
"source": "# there is no need to run the cell loading the war-and-peace-excerpt.txt file if the previous cell, which directly assigns the text to the war_and_peace variable without downloading a file, has already been executed\nwith open(\"war-and-peace-excerpt.txt\") as file:\n war_and_peace = file.read()\n print(war_and_peace)"
},
{
"cell_type": "code",
@@ -189,21 +185,7 @@
}
},
"outputs": [],
"source": [
"# downloading our multilingual sentence tokenizer\n",
"python -m spacy download xx_sent_ud_sm\n",
"\n",
"# loading the multilingual sentence tokenizer we just downloaded\n",
"nlp = spacy.load(\"xx_sent_ud_sm\")\n",
"# applying the spaCy model to our text variable\n",
"doc = nlp(cleaned_war_and_peace)\n",
"\n",
"# assigning the tokenized sentences to a list so it's easier for us to manipulate them later\n",
"spacy_sentences = list(doc.sents)\n",
"\n",
"# printing the sentences to our console\n",
"print(spacy_sentences)"
]
"source": "# downloading our multilingual sentence tokenizer\n!python -m spacy download xx_sent_ud_sm\n\n# loading the multilingual sentence tokenizer we just downloaded\nnlp = spacy.load(\"xx_sent_ud_sm\")\n# applying the spaCy model to our text variable\ndoc = nlp(cleaned_war_and_peace)\n\n# assigning the tokenized sentences to a list so it's easier for us to manipulate them later\nspacy_sentences = list(doc.sents)\n\n# printing the sentences to our console\nprint(spacy_sentences)"
},
{
"cell_type": "code",
@@ -321,32 +303,7 @@
}
},
"outputs": [],
"source": [
"# first, we install the spacy_langdetect package from the Python Package Index\n",
"pip install spacy_langdetect\n",
"\n",
"# then we import it and use it to detect our languages\n",
"from spacy.language import Language\n",
"from spacy_langdetect import LanguageDetector\n",
"\n",
"# setting up our language detector to work with spaCy\n",
"# def get_lang_detector(nlp, name):\n",
"# return LanguageDetector()\n",
"\n",
"# setting up our pipeline\n",
"Language.factory(\"language_detector\")\n",
"nlp.add_pipe('language_detector', last=True)\n",
"\n",
"# running the language detection on each sentence and printing the results\n",
"rus_doc = nlp(spacy_rus_sent)\n",
"print(rus_doc._.language)\n",
"\n",
"fre_doc = nlp(spacy_fre_sent)\n",
"print(fre_doc._.language)\n",
"\n",
"multi_doc = nlp(spacy_multi_sent)\n",
"print(multi_doc._.language)"
]
"source": "# First, install the spacy_langdetect package from the Python Package Index.\n!pip install spacy_langdetect\n\n# Then, import it and use it to detect our languages.\nfrom spacy.language import Language\nfrom spacy_langdetect import LanguageDetector\n\nnlp = spacy.load(\"xx_sent_ud_sm\")\n\n# Create the language detector function\n@Language.factory(\"language_detector\")\ndef create_language_detector(nlp, name):\n return LanguageDetector()\n\n# add the tool to our pipeline\nnlp.add_pipe('language_detector', last=True)\n\n# running the language detection on each sentence and printing the results\nrus_doc = nlp(spacy_rus_sent)\nprint(rus_doc._.language)\n\nfre_doc = nlp(spacy_fre_sent)\nprint(fre_doc._.language)\n\nmulti_doc = nlp(spacy_multi_sent)\nprint(multi_doc._.language)"
},
{
"cell_type": "code",
@@ -357,22 +314,7 @@
}
},
"outputs": [],
"source": [
"# importing our models required for language detection\n",
"from stanza.models.common.doc import Document\n",
"from stanza.pipeline.core import Pipeline\n",
"\n",
"# setting up our pipeline\n",
"nlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n",
"\n",
"# specifying which sentences to run the detection on, then running the detection code\n",
"docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n",
"docs = [Document([], text=text) for text in docs]\n",
"nlp(docs)\n",
"\n",
"# printing the text of each sentence alongside the language estimates\n",
"print(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))"
]
"source": "# importing our models required for language detection\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\n\n# setting up our pipeline\nnlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n\n# specifying which sentences to run the detection on, then running the detection code\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = [Document([], text=text) for text in docs]\ndocs = nlp(docs)\n\n# printing the text of each sentence alongside the language estimates\nprint(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))"
},
{
"cell_type": "code",
@@ -473,21 +415,7 @@
}
},
"outputs": [],
"source": [
"# downloading our Russian model from spaCy\n",
"python -m spacy download ru_core_news_sm\n",
"\n",
"\n",
"# loading the model\n",
"nlp = spacy.load(\"ru_core_news_sm\")\n",
"\n",
"# applying the model\n",
"doc = nlp(spacy_rus_sent)\n",
"\n",
"# printing the text of each word and its POS tag\n",
"for token in doc:\n",
" print(token.text, token.pos_)"
]
"source": "# downloading our Russian model from spaCy\n!python -m spacy download ru_core_news_sm\n\n\n# loading the model\nnlp = spacy.load(\"ru_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_rus_sent)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)"
},
{
"cell_type": "code",
@@ -498,21 +426,7 @@
}
},
"outputs": [],
"source": [
"# downloading our French model from spaCy\n",
"python -m spacy download fr_core_news_sm\n",
"\n",
"\n",
"# loading the corpus\n",
"nlp = spacy.load(\"fr_core_news_sm\")\n",
"\n",
"# applying the model\n",
"doc = nlp(spacy_fre_sent)\n",
"\n",
"# printing the text of each word and its POS tag\n",
"for token in doc:\n",
" print(token.text, token.pos_)"
]
"source": "# downloading our French model from spaCy\n!python -m spacy download fr_core_news_sm\n\n\n# loading the corpus\nnlp = spacy.load(\"fr_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_fre_sent)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)"
},
{
"cell_type": "code",
@@ -584,22 +498,7 @@
}
},
"outputs": [],
"source": [
"# loading and applying the model\n",
"nlp = spacy.load(\"ru_core_news_sm\")\n",
"doc = nlp(cyr_no_extra_space)\n",
"\n",
"# printing the text of each word and its POS tag\n",
"for token in doc:\n",
" print(token.text, token.pos_)\n",
"\n",
"# and doing the same with our French sentence\n",
"nlp = spacy.load(\"fr_core_news_sm\")\n",
"doc = nlp(lat_no_extra_space)\n",
"for token in doc:\n",
" print(token.text, token.pos_)\n",
"```"
]
"source": "# loading and applying the model\nnlp = spacy.load(\"ru_core_news_sm\")\ndoc = nlp(cyr_no_extra_space)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)\n\n# and doing the same with our French sentence\nnlp = spacy.load(\"fr_core_news_sm\")\ndoc = nlp(lat_no_extra_space)\nfor token in doc:\n print(token.text, token.pos_)"
},
{
"cell_type": "code",
@@ -646,20 +545,7 @@
}
},
"outputs": [],
"source": [
"# imports so we can use Stanza's MultilingualPipeline\n",
"from stanza.models.common.doc import Document\n",
"from stanza.pipeline.core import Pipeline\n",
"from stanza.pipeline.multilingual import MultilingualPipeline\n",
"\n",
"# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\n",
"nlp = MultilingualPipeline(processors='tokenize,pos')\n",
"docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n",
"nlp(docs)\n",
"\n",
"# printing the results\n",
"print(*[f'word: {word.text}\\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\\n')"
]
"source": "# imports so we can use Stanza's MultilingualPipeline\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\nfrom stanza.pipeline.multilingual import MultilingualPipeline\n\n# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\nnlp = MultilingualPipeline(processors='tokenize,pos')\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = nlp(docs)\n\n# printing the results\nprint(*[f'word: {word.text}\\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\\n')"
},
{
"cell_type": "markdown",
@@ -768,4 +654,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
35 changes: 23 additions & 12 deletions en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md
@@ -12,6 +12,8 @@ reviewers:
editors:
- Laura Alice Chapot
review-ticket: https://github.com/programminghistorian/ph-submissions/issues/612
lesson-testers: Émilien Schultz
tested-date: 2026-04-24
difficulty: 2
activity: analyzing
topics: [python, data-manipulation, distant-reading]
@@ -22,6 +24,10 @@ doi: 10.46430/phen0121

{% include toc.html %}

<div class="alert alert-info">>
This lesson has been updated in April 2026 to reflect changes in the libraries used. The code has been adapted for Python 3.12, spaCy 3.8.11 and Stanza 1.11.1 (the original version was based on Python 3.10, spaCy 3.7.4 and Stanza 1.8.2). In particular, the section on language detection using <code>spacy_langdetect</code> has been rewritten to follow the new <code>@Language.factory</code> pattern required by recent versions of spaCy, and the sentence indices used in the spaCy tokenisation examples have been adjusted.
</div>

## Lesson Goals

Many of the resources available for learning computational methods of text analysis focus on English-language texts and corpora, and often lack the information which is needed to work with non-English source material. To help remedy this, this lesson will provide an introduction to analyzing non-English and multilingual text (that is, text written in more than one language) using Python. Using a multilingual text composed of Russian and French, this lesson will show how you can use computational methods to perform three fundamental preprocessing tasks: tokenization, part-of-speech tagging, and lemmatization. Then, it will teach you to automatically detect the languages present in a preprocessed text.
@@ -34,7 +40,7 @@ To perform the three fundamental preprocessing steps, this lesson uses three com

This lesson is aimed at those who are unfamiliar with text analysis methods, particularly those who wish to apply such methods to multilingual corpora or texts not written in English. While prior knowledge of Python is not required, it will be helpful to understand the structure of the code. Having a basic knowledge of Python syntax and features is recommended – it would be useful, for example, for the reader to have familiarity with importing libraries, constructing functions and loops, and manipulating strings.

Code for this tutorial is written in Python 3.10 and uses the NLTK (v3.8.1), spaCy (v3.7.4), and Stanza (v1.8.2) libraries to perform its text processing. If you are entirely new to Python, [this _Programming Historian_ lesson](/en/lessons/introduction-and-installation) will be helpful to read before completing this lesson.
Code for this tutorial is written in Python 3.12 and uses the NLTK (v3.8.1), spaCy (v3.8.11), and Stanza (v1.11.1) libraries to perform its text processing. If you are entirely new to Python, [this _Programming Historian_ lesson](/en/lessons/introduction-and-installation) will be helpful to read before completing this lesson.
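
If you want to match that environment exactly, a minimal pinned install might look like the following (the pins simply mirror the versions named above; newer releases will usually also work):

```python
# install the three libraries at the versions this lesson was tested against
!pip install nltk==3.8.1 spacy==3.8.11 stanza==1.11.1
```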

## Installation and Setup

@@ -141,7 +147,8 @@ war_and_peace = """
First, let's load our text file so we can use it with our analysis packages. To start, you'll open the file and assign it to the variable named `war_and_peace`, so we can reference it later on. Then, you'll print the contents of the file to make sure it was read correctly. For the purposes of this tutorial, we are using a short excerpt from the novel.

```python
with open("war_and_peace_excerpt.txt") as file:
# There is no need to run this cell if you already ran the previous cell, which assigns the text directly to the war_and_peace variable instead of reading it from a file.
with open("war-and-peace-excerpt.txt") as file:
war_and_peace = file.read()
print(war_and_peace)
```
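
If you chose not to create the file, the `open()` call above will raise a `FileNotFoundError`. As a sketch, you could guard the read so that the text assigned in the previous cell is kept when the file is absent (the message string here is illustrative):

```python
from pathlib import Path

excerpt = Path("war-and-peace-excerpt.txt")
if excerpt.exists():
    war_and_peace = excerpt.read_text()
    print(war_and_peace)
else:
    print("Excerpt file not found; keeping the war_and_peace text assigned above.")
```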
@@ -360,20 +367,24 @@ As we can see, TextCat correctly identified the Russian and French sentences. Si

We'll examine other ways to detect the languages in multilingual sentences after we've performed our sentence classification using spaCy and Stanza.

Let's try spaCy first. First, we install the `spacy_langdetect` package from the Python Package Index:
Let's try spaCy first.

```python
pip install spacy_langdetect
```
# First, install the `spacy_langdetect` package from the Python Package Index.
!pip install spacy_langdetect

Then we import it and use it to detect our languages:

```python
# Then, import it and use it to detect our languages.
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# setting up our pipeline
Language.factory("language_detector")
nlp = spacy.load("xx_sent_ud_sm")

# Create the language detector function
@Language.factory("language_detector")
def create_language_detector(nlp, name):
return LanguageDetector()

# add the tool to our pipeline
nlp.add_pipe('language_detector', last=True)

# running the language detection on each sentence and printing the results
@@ -409,7 +420,7 @@ nlp = Pipeline(lang="multilingual", processors="langid")
# specifying which sentences to run the detection on, then running the detection code
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
docs = [Document([], text=text) for text in docs]
nlp(docs)
docs = nlp(docs)

# printing the text of each sentence alongside the language estimates
print("\n".join(f"{doc.text}\t{doc.lang}" for doc in docs))
@@ -782,7 +793,7 @@ from stanza.pipeline.multilingual import MultilingualPipeline
# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously
nlp = MultilingualPipeline(processors='tokenize,pos')
docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]
nlp(docs)
docs = nlp(docs)

# printing the results
print(*[f'word: {word.text}\tupos: {word.upos}' for doc in docs for sent in doc.sentences for word in sent.words], sep='\n')
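
# a possible extension (a sketch, assuming lemma models are available for
# these languages): add the lemmatizer processor and read word.lemma as well
# nlp = MultilingualPipeline(processors='tokenize,pos,lemma')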