diff --git a/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb b/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb
index 09179fd6a8..4c92313c4e 100644
--- a/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb
+++ b/assets/analyzing-multilingual-text-nltk-spacy-stanza/analyzing-multilingual-text-nltk-spacy-stanza.ipynb
@@ -65,11 +65,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "with open(\"war_and_peace_excerpt.txt\") as file:\n",
-    "    war_and_peace = file.read()\n",
-    "    print(war_and_peace)"
-   ]
+   "source": "# if you already ran the previous cell, which assigns the excerpt directly to the war_and_peace variable, you can skip this one: it loads the same text from the downloaded war-and-peace-excerpt.txt file\nwith open(\"war-and-peace-excerpt.txt\") as file:\n    war_and_peace = file.read()\n    print(war_and_peace)"
   },
   {
    "cell_type": "code",
@@ -189,21 +185,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "# downloading our multilingual sentence tokenizer\n",
-    "python -m spacy download xx_sent_ud_sm\n",
-    "\n",
-    "# loading the multilingual sentence tokenizer we just downloaded\n",
-    "nlp = spacy.load(\"xx_sent_ud_sm\")\n",
-    "# applying the spaCy model to our text variable\n",
-    "doc = nlp(cleaned_war_and_peace)\n",
-    "\n",
-    "# assigning the tokenized sentences to a list so it's easier for us to manipulate them later\n",
-    "spacy_sentences = list(doc.sents)\n",
-    "\n",
-    "# printing the sentences to our console\n",
-    "print(spacy_sentences)"
-   ]
+   "source": "# downloading our multilingual sentence tokenizer\n!python -m spacy download xx_sent_ud_sm\n\n# loading the multilingual sentence tokenizer we just downloaded\nnlp = spacy.load(\"xx_sent_ud_sm\")\n# applying the spaCy model to our text variable\ndoc = nlp(cleaned_war_and_peace)\n\n# assigning the tokenized sentences to a list so it's easier for us to manipulate them later\nspacy_sentences = list(doc.sents)\n\n# printing the sentences to our console\nprint(spacy_sentences)"
   },
   {
    "cell_type": "code",
@@ -321,32 +303,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "# first, we install the spacy_langdetect package from the Python Package Index\n",
-    "pip install spacy_langdetect\n",
-    "\n",
-    "# then we import it and use it to detect our languages\n",
-    "from spacy.language import Language\n",
-    "from spacy_langdetect import LanguageDetector\n",
-    "\n",
-    "# setting up our language detector to work with spaCy\n",
-    "# def get_lang_detector(nlp, name):\n",
-    "#    return LanguageDetector()\n",
-    "\n",
-    "# setting up our pipeline\n",
-    "Language.factory(\"language_detector\")\n",
-    "nlp.add_pipe('language_detector', last=True)\n",
-    "\n",
-    "# running the language detection on each sentence and printing the results\n",
-    "rus_doc = nlp(spacy_rus_sent)\n",
-    "print(rus_doc._.language)\n",
-    "\n",
-    "fre_doc = nlp(spacy_fre_sent)\n",
-    "print(fre_doc._.language)\n",
-    "\n",
-    "multi_doc = nlp(spacy_multi_sent)\n",
-    "print(multi_doc._.language)"
-   ]
+   "source": "# First, install the spacy_langdetect package from the Python Package Index.\n!pip install spacy_langdetect\n\n# Then, import it and use it to detect our languages.\nfrom spacy.language import Language\nfrom spacy_langdetect import LanguageDetector\n\nnlp = spacy.load(\"xx_sent_ud_sm\")\n\n# Create the language detector function\n@Language.factory(\"language_detector\")\ndef create_language_detector(nlp, name):\n    return LanguageDetector()\n\n# add the tool to our pipeline\nnlp.add_pipe('language_detector', last=True)\n\n# running the language detection on each sentence and printing the results\nrus_doc = nlp(spacy_rus_sent)\nprint(rus_doc._.language)\n\nfre_doc = nlp(spacy_fre_sent)\nprint(fre_doc._.language)\n\nmulti_doc = nlp(spacy_multi_sent)\nprint(multi_doc._.language)"
   },
   {
    "cell_type": "code",
@@ -357,22 +314,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "# importing our models required for language detection\n",
-    "from stanza.models.common.doc import Document\n",
-    "from stanza.pipeline.core import Pipeline\n",
-    "\n",
-    "# setting up our pipeline\n",
-    "nlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n",
-    "\n",
-    "# specifying which sentences to run the detection on, then running the detection code\n",
-    "docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n",
-    "docs = [Document([], text=text) for text in docs]\n",
-    "nlp(docs)\n",
-    "\n",
-    "# printing the text of each sentence alongside the language estimates\n",
-    "print(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))"
-   ]
+   "source": "# importing our models required for language detection\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\n\n# setting up our pipeline\nnlp = Pipeline(lang=\"multilingual\", processors=\"langid\")\n\n# specifying which sentences to run the detection on, then running the detection code\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = [Document([], text=text) for text in docs]\ndocs = nlp(docs)\n\n# printing the text of each sentence alongside the language estimates\nprint(\"\\n\".join(f\"{doc.text}\\t{doc.lang}\" for doc in docs))"
   },
   {
    "cell_type": "code",
@@ -473,21 +415,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "# downloading our Russian model from spaCy\n",
-    "python -m spacy download ru_core_news_sm\n",
-    "\n",
-    "\n",
-    "# loading the model\n",
-    "nlp = spacy.load(\"ru_core_news_sm\")\n",
-    "\n",
-    "# applying the model\n",
-    "doc = nlp(spacy_rus_sent)\n",
-    "\n",
-    "# printing the text of each word and its POS tag\n",
-    "for token in doc:\n",
-    "    print(token.text, token.pos_)"
-   ]
+   "source": "# downloading our Russian model from spaCy\n!python -m spacy download ru_core_news_sm\n\n\n# loading the model\nnlp = spacy.load(\"ru_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_rus_sent)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n    print(token.text, token.pos_)"
   },
   {
    "cell_type": "code",
@@ -498,21 +426,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "# downloading our French model from spaCy\n",
-    "python -m spacy download fr_core_news_sm\n",
-    "\n",
-    "\n",
-    "# loading the corpus\n",
-    "nlp = spacy.load(\"fr_core_news_sm\")\n",
-    "\n",
-    "# applying the model\n",
-    "doc = nlp(spacy_fre_sent)\n",
-    "\n",
-    "# printing the text of each word and its POS tag\n",
-    "for token in doc:\n",
-    "    print(token.text, token.pos_)"
-   ]
+   "source": "# downloading our French model from spaCy\n!python -m spacy download fr_core_news_sm\n\n\n# loading the model\nnlp = spacy.load(\"fr_core_news_sm\")\n\n# applying the model\ndoc = nlp(spacy_fre_sent)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n    print(token.text, token.pos_)"
   },
   {
    "cell_type": "code",
@@ -584,22 +498,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "# loading and applying the model\n",
-    "nlp = spacy.load(\"ru_core_news_sm\")\n",
-    "doc = nlp(cyr_no_extra_space)\n",
-    "\n",
-    "# printing the text of each word and its POS tag\n",
-    "for token in doc:\n",
-    "    print(token.text, token.pos_)\n",
-    "\n",
-    "# and doing the same with our French sentence\n",
-    "nlp = spacy.load(\"fr_core_news_sm\")\n",
-    "doc = nlp(lat_no_extra_space)\n",
-    "for token in doc:\n",
-    "    print(token.text, token.pos_)\n",
-    "```"
-   ]
+   "source": "# loading and applying the model\nnlp = spacy.load(\"ru_core_news_sm\")\ndoc = nlp(cyr_no_extra_space)\n\n# printing the text of each word and its POS tag\nfor token in doc:\n    print(token.text, token.pos_)\n\n# and doing the same with our French sentence\nnlp = spacy.load(\"fr_core_news_sm\")\ndoc = nlp(lat_no_extra_space)\nfor token in doc:\n    print(token.text, token.pos_)"
   },
   {
    "cell_type": "code",
@@ -646,20 +545,7 @@
     }
    },
    "outputs": [],
-   "source": [
-    "# imports so we can use Stanza's MultilingualPipeline\n",
-    "from stanza.models.common.doc import Document\n",
-    "from stanza.pipeline.core import Pipeline\n",
-    "from stanza.pipeline.multilingual import MultilingualPipeline\n",
-    "\n",
-    "# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\n",
-    "nlp = MultilingualPipeline(processors='tokenize,pos')\n",
-    "docs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\n",
-    "nlp(docs)\n",
-    "\n",
-    "# printing the results\n",
-    "print(*[f'word: {word.text}\\tupos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\\n')"
-   ]
+   "source": "# imports so we can use Stanza's MultilingualPipeline\nfrom stanza.models.common.doc import Document\nfrom stanza.pipeline.core import Pipeline\nfrom stanza.pipeline.multilingual import MultilingualPipeline\n\n# running the multilingual pipeline on our French, Russian, and multilingual sentences simultaneously\nnlp = MultilingualPipeline(processors='tokenize,pos')\ndocs = [stanza_rus_sent, stanza_fre_sent, stanza_multi_sent]\ndocs = nlp(docs)\n\n# printing the results\nprint(*[f'word: {word.text}\\tupos: {word.upos}' for doc in docs for sent in doc.sentences for word in sent.words], sep='\\n')"
   },
   {
    "cell_type": "markdown",
@@ -768,4 +654,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md b/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md
index e4ff69de40..179ed4ac9c 100644
--- a/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md
+++ b/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md
@@ -12,6 +12,8 @@ reviewers:
 editors:
 - Laura Alice Chapot
 review-ticket: https://github.com/programminghistorian/ph-submissions/issues/612
+lesson-testers: Émilien Schultz
+tested-date: 2026-04-24
 difficulty: 2
 activity: analyzing
 topics: [python, data-manipulation, distant-reading]
@@ -22,6 +24,10 @@ doi: 10.46430/phen0121
 
 {% include toc.html %}
 
+
+<div class="alert alert-warning">
+The spacy_langdetect setup has been rewritten to follow the @Language.factory pattern required by recent versions of spaCy, and the sentence indices used in the spaCy tokenization examples have been adjusted.</div>
+
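For reference, the registration pattern the alert above names is the heart of the notebook change: since spaCy 3, a custom pipeline component must be registered through @Language.factory before nlp.add_pipe() can attach it by name. Below is a minimal, self-contained sketch of that pattern, assuming spacy, spacy_langdetect, and the xx_sent_ud_sm model are installed; the sample sentence is simply the novel's French opening line, not the lesson's full excerpt.

```python
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# register a factory so spaCy can build the component from its string name
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    return LanguageDetector()

# load the multilingual sentence model and append the detector to its pipeline
nlp = spacy.load("xx_sent_ud_sm")
nlp.add_pipe("language_detector", last=True)

# the detector stores its estimate in the custom doc._.language attribute
doc = nlp("Eh bien, mon prince. Gênes et Lucques ne sont plus que des apanages.")
print(doc._.language)  # a dict such as {'language': 'fr', 'score': 0.99...}
```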