diff --git a/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb b/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb index 09179fd6a8..4c92313c4e 100644 --- a/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb +++ b/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb @@ -65,11 +65,7 @@ } }, "outputs": [], - "source": [ - "with open(\"war_and_peace_excerpt.txt\") as file:\n", - " war_and_peace = file.read()\n", - " print(war_and_peace)" - ] + "source": "# there is no need to run the cell loading the war-and-peace-excerpt.txt file if the previous cell, which directly assigns the text to the war_and_peace variable without downloading a file, has already been executed\nwith open(\"war-and-peace-excerpt.txt\") as file:\n war_and_peace = file.read()\n print(war_and_peace)" }, { "cell_type": "code", @@ -189,21 +185,7 @@ } }, "outputs": [], - "source": [ - "# downloading our multilingual sentence tokenizer\n", - "python -m spacy download xx_sent_ud_sm\n", - "\n", - "# loading the multilingual sentence tokenizer we just downloaded\n", - "nlp = spacy.load(\"xx_sent_ud_sm\")\n", - "# applying the spaCy model to our text variable\n", - "doc = nlp(cleaned_war_and_peace)\n", - "\n", - "# assigning the tokenized sentences to a list so it's easier for us to manipulate them later\n", - "spacy_sentences = list(doc.sents)\n", - "\n", - "# printing the sentences to our console\n", - "print(spacy_sentences)" - ] + "source": "# downloading our multilingual sentence tokenizer\n!python -m spacy download xx_sent_ud_sm\n\n# loading the multilingual sentence tokenizer we just downloaded\nnlp = spacy.load(\"xx_sent_ud_sm\")\n# applying the spaCy model to our text variable\ndoc = nlp(cleaned_war_and_peace)\n\n# assigning the tokenized sentences to a list so it's easier for us to 
manipulate them later\nspacy_sentences = list(doc.sents)\n\n# printing the sentences to our console\nprint(spacy_sentences)" }, { "cell_type": "code", @@ -321,32 +303,7 @@ } }, "outputs": [], - "source": [ - "# first, we install the spacy_langdetect package from the Python Package Index\n", - "pip install spacy_langdetect\n", - "\n", - "# then we import it and use it to detect our languages\n", - "from spacy.language import Language\n", - "from spacy_langdetect import LanguageDetector\n", - "\n", - "# setting up our language detector to work with spaCy\n", - "# def get_lang_detector(nlp, name):\n", - "# return LanguageDetector()\n", - "\n", - "# setting up our pipeline\n", - "Language.factory(\"language_detector\")\n", - "nlp.add_pipe('language_detector', last=True)\n", - "\n", - "# running the language detection on each sentence and printing the results\n", - "rus_doc = nlp(spacy_rus_sent)\n", - "print(rus_doc._.language)\n", - "\n", - "fre_doc = nlp(spacy_fre_sent)\n", - "print(fre_doc._.language)\n", - "\n", - "multi_doc = nlp(spacy_multi_sent)\n", - "print(multi_doc._.language)" - ] + "source": "# First, install the spacy_langdetect package from the Python Package Index.\n!pip install spacy_langdetect\n\n# Then, import it and use it to detect our languages.\nfrom spacy.language import Language\nfrom spacy_langdetect import LanguageDetector\n\nnlp = spacy.load(\"xx_sent_ud_sm\")\n\n# Create the language detector function\n@Language.factory(\"language_detector\")\ndef create_language_detector(nlp, name):\n return LanguageDetector()\n\n# add the tool to our pipeline\nnlp.add_pipe('language_detector', last=True)\n\n# running the language detection on each sentence and printing the results\nrus_doc = nlp(spacy_rus_sent)\nprint(rus_doc._.language)\n\nfre_doc = nlp(spacy_fre_sent)\nprint(fre_doc._.language)\n\nmulti_doc = nlp(spacy_multi_sent)\nprint(multi_doc._.language)" }, { "cell_type": "code", @@ -357,22 +314,7 @@ } }, "outputs": [], - "source": [ - "# importing 
our models required for language detection\n", - "from stanza.models.common.doc import Document\n", - "from stanza.pipeline.core import Pipeline\n", - "\n", - "# setting up our pipeline\n", - "nlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n", - "\n", - "# specifying which sentences to run the detection on, then running the detection code\n", - "docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n", - "docs = [Document([], text=text) for text in docs]\n", - "nlp(docs)\n", - "\n", - "# printing the text of each sentence alongside the language estimates\n", - "print(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))" - ] + "source": "# importing our models required for language detection\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\n\n# setting up our pipeline\nnlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n\n# specifying which sentences to run the detection on, then running the detection code\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = [Document([], text=text) for text in docs]\ndocs = nlp(docs)\n\n# printing the text of each sentence alongside the language estimates\nprint(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))" }, { "cell_type": "code", @@ -473,21 +415,7 @@ } }, "outputs": [], - "source": [ - "# downloading our Russian model from spaCy\n", - "python -m spacy download ru_core_news_sm\n", - "\n", - "\n", - "# loading the model\n", - "nlp = spacy.load(\"ru_core_news_sm\")\n", - "\n", - "# applying the model\n", - "doc = nlp(spacy_rus_sent)\n", - "\n", - "# printing the text of each word and its POS tag\n", - "for token in doc:\n", - " print(token.text, token.pos_)" - ] + "source": "# downloading our Russian model from spaCy\n!python -m spacy download ru_core_news_sm\n\n\n# loading the model\nnlp = spacy.load(\"ru_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_rus_sent)\n\n# printing the text of each word and its POS 
tag\nfor token in doc:\n print(token.text, token.pos_)" }, { "cell_type": "code", @@ -498,21 +426,7 @@ } }, "outputs": [], - "source": [ - "# downloading our French model from spaCy\n", - "python -m spacy download fr_core_news_sm\n", - "\n", - "\n", - "# loading the corpus\n", - "nlp = spacy.load(\"fr_core_news_sm\")\n", - "\n", - "# applying the model\n", - "doc = nlp(spacy_fre_sent)\n", - "\n", - "# printing the text of each word and its POS tag\n", - "for token in doc:\n", - " print(token.text, token.pos_)" - ] + "source": "# downloading our French model from spaCy\n!python -m spacy download fr_core_news_sm\n\n\n# loading the corpus\nnlp = spacy.load(\"fr_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_fre_sent)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)" }, { "cell_type": "code", @@ -584,22 +498,7 @@ } }, "outputs": [], - "source": [ - "# loading and applying the model\n", - "nlp = spacy.load(\"ru_core_news_sm\")\n", - "doc = nlp(cyr_no_extra_space)\n", - "\n", - "# printing the text of each word and its POS tag\n", - "for token in doc:\n", - " print(token.text, token.pos_)\n", - "\n", - "# and doing the same with our French sentence\n", - "nlp = spacy.load(\"fr_core_news_sm\")\n", - "doc = nlp(lat_no_extra_space)\n", - "for token in doc:\n", - " print(token.text, token.pos_)\n", - "```" - ] + "source": "# loading and applying the model\nnlp = spacy.load(\"ru_core_news_sm\")\ndoc = nlp(cyr_no_extra_space)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n print(token.text, token.pos_)\n\n# and doing the same with our French sentence\nnlp = spacy.load(\"fr_core_news_sm\")\ndoc = nlp(lat_no_extra_space)\nfor token in doc:\n print(token.text, token.pos_)" }, { "cell_type": "code", @@ -646,20 +545,7 @@ } }, "outputs": [], - "source": [ - "# imports so we can use Stanza's MultilingualPipeline\n", - "from stanza.models.common.doc import Document\n", - "from stanza.pipeline.core 
import Pipeline\n", - "from stanza.pipeline.multilingual import MultilingualPipeline\n", - "\n", - "# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\n", - "nlp = MultilingualPipeline(processors='tokenize,pos')\n", - "docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n", - "nlp(docs)\n", - "\n", - "# printing the results\n", - "print(*[f'word: {word.text}\\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\\n')" - ] + "source": "# imports so we can use Stanza's MultilingualPipeline\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\nfrom stanza.pipeline.multilingual import MultilingualPipeline\n\n# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\nnlp = MultilingualPipeline(processors='tokenize,pos')\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = nlp(docs)\n\n# printing the results\nprint(*[f'word: {word.text}\\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\\n')" }, { "cell_type": "markdown", @@ -768,4 +654,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md b/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md index e4ff69de40..179ed4ac9c 100644 --- a/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md +++ b/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md @@ -12,6 +12,8 @@ reviewers: editors: - Laura Alice Chapot review-ticket: https://github.com/programminghistorian/ph-submissions/issues/612 +lesson-testers: Émilien Schultz +tested-date: 2026-04-24 difficulty: 2 activity: analyzing topics: [python, data-manipulation, distant-reading] @@ -22,6 +24,10 @@ doi: 10.46430/phen0121 {% include toc.html %} +
> +This lesson has been updated in April 2026 to reflect changes in the libraries used. The code has been adapted for Python 3.12, spaCy 3.8.11 and Stanza 1.11.1 (the original version was based on Python 3.10, spaCy 3.7.4 and Stanza 1.8.2). In particular, the section on language detection using spacy_langdetect has been rewritten to follow the new @Language.factory pattern required by recent versions of spaCy, and the sentence indices used in the spaCy tokenization examples have been adjusted.
+ ## Lesson Goals Many of the resources available for learning computational methods of text analysis focus on English-language texts and corpora, and often lack the information which is needed to work with non-English source material. To help remedy this, this lesson will provide an introduction to analyzing non-English and multilingual text (that is, text written in more than one language) using Python. Using a multilingual text composed of Russian and French, this lesson will show how you can use computational methods to perform three fundamental preprocessing tasks: tokenization, part-of-speech tagging, and lemmatization. Then, it will teach you to automatically detect the languages present in a preprocessed text. @@ -34,7 +40,7 @@ To perform the three fundamental preprocessing steps, this lesson uses three com This lesson is aimed at those who are unfamiliar with text analysis methods, particularly those who wish to apply such methods to multilingual corpora or texts not written in English. While prior knowledge of Python is not required, it will be helpful to understand the structure of the code. Having a basic knowledge of Python syntax and features is recommended – it would be useful, for example, for the reader to have familiarity with importing libraries, constructing functions and loops, and manipulating strings. -Code for this tutorial is written in Python 3.10 and uses the NLTK (v3.8.1), spaCy (v3.7.4), and Stanza (v1.8.2) libraries to perform its text processing. If you are entirely new to Python, [this _Programming Historian_ lesson](/en/lessons/introduction-and-installation) will be helpful to read before completing this lesson. +Code for this tutorial is written in Python 3.12 and uses the NLTK (v3.8.1), spaCy (v3.8.11), and Stanza (v1.11.1) libraries to perform its text processing. 
If you are entirely new to Python, [this _Programming Historian_ lesson](/en/lessons/introduction-and-installation) will be helpful to read before completing this lesson. ## Installation and Setup @@ -141,7 +147,8 @@ war_and_peace = """ First, let's load our text file so we can use it with our analysis packages. To start, you'll open the file and assign it to the variable named `war_and_peace`, so we can reference it later on. Then, you'll print the contents of the file to make sure it was read correctly. For the purposes of this tutorial, we are using a short excerpt from the novel. ```python -with open("war_and_peace_excerpt.txt") as file: +# there is no need to run the cell loading the war-and-peace-excerpt.txt file if the previous cell, which directly assigns the text to the war_and_peace variable without downloading a file, has already been executed +with open("war-and-peace-excerpt.txt") as file: war_and_peace = file.read() print(war_and_peace) ``` @@ -360,20 +367,24 @@ As we can see, TextCat correctly identified the Russian and French sentences. Si We'll examine other ways to detect the languages in multilingual sentences after we've perform our sentence classification using spaCy and Stanza. -Let's try spaCy first. First, we install the `spacy_langdetect` package from the Python Package Index: +Let's try spaCy first. ```python -pip install spacy_langdetect -``` +# First, install the `spacy_langdetect` package from the Python Package Index. +!pip install spacy_langdetect -Then we import it and use it to detect our languages: - -```python +# Then, import it and use it to detect our languages. 
from spacy.language import Language from spacy_langdetect import LanguageDetector -# setting up our pipeline -Language.factory("language_detector") +nlp = spacy.load("xx_sent_ud_sm") + +# Create the language detector function +@Language.factory("language_detector") +def create_language_detector(nlp, name): + return LanguageDetector() + +# add the tool to our pipeline nlp.add_pipe('language_detector', last=True) # running the language detection on each sentence and printing the results @@ -409,7 +420,7 @@ nlp = Pipeline(lang="multilingual", processors="langid") # specifying which sentences to run the detection on, then running the detection code docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent] docs = [Document([], text=text) for text in docs] -nlp(docs) +docs = nlp(docs) # printing the text of each sentence alongside the language estimates print("\n".join(f"{doc.text}\t{doc.lang}" for doc in docs)) @@ -782,7 +793,7 @@ from stanza.pipeline.multilingual import MultilingualPipeline # running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously nlp = MultilingualPipeline(processors='tokenize,pos') docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent] -nlp(docs) +docs = nlp(docs) # printing the results print(*[f'word: {word.text}\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\n')