gnu: Add python-stanza.

* gnu/packages/machine-learning.scm (python-stanza): New variable.

Change-Id: I86369771db647c85d7a204ff8069d6e3670bb58b
author: Nguyễn Gia Phong <cnx@loang.net> 2026-01-12 14:40:26 +0900
committer: Cayetano Santos <csantosb@inventati.org> 2026-01-31 09:54:48 +0100
commit: a2f5c702f17586932458c3f0321f527f346f9761 (patch)
tree: 02db76e549a58ebfd7eb9f8d11338d55dd490f62 /gnu
parent: 9f6f7fa0cb4e1036f10054c51e4803acf5220b8c (diff)
1 files changed, 67 insertions, 0 deletions
diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index a260c523b80..8481afc0e49 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -33,6 +33,7 @@
 ;;; Copyright © 2025 Cayetano Santos <csantosb@inventati.org>
 ;;; Copyright © 2025 Janneke Nieuwenhuizen <janneke@gnu.org>
 ;;; Copyright © 2025 Romain Garbage <romain.garbage@inria.fr>
+;;; Copyright © 2026 Nguyễn Gia Phong <cnx@loang.net>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -1867,6 +1868,72 @@ transformers like BERT, as well as a production-ready training system and easy
 model packaging, deployment and workflow management.")
     (license license:expat)))
 
+(define-public python-stanza
+  (package
+    (name "python-stanza")
+    (version "1.10.1")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/stanfordnlp/stanza")
+             (commit (string-append "v" version))))
+       (sha256
+        (base32 "0zcpzmbv0aafircl12m3x5999hxpg2hzm1xxv97pz09y4v589snj"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:phases
+      #~(modify-phases %standard-phases
+          (add-before 'check 'set-up-check
+            (lambda _
+              ;; Cherry-pick only the steps needed from stanza/tests/setup.py,
+              ;; which itself downloads many datasets.
+              (mkdir-p "stanza_test/out")
+              (copy-file "stanza/tests/data/example_french.json"
+                         "stanza_test/out/example_french.json"))))
+      ;; Test summary: 288 passed, 361 deselected, 1 warning.
+      #:test-flags
+      #~(list
+         "-k"
+         (string-join
+          '("not CoreNLP" "EnglishPipeline" "FrenchPipeline"
+            "SentimentPipeline" "TestTrainer"
+            "amt_annotator" "arabic_pos" "bert"
+            "charlm" "conllu" "convert_units"
+            "data_objects" "defaultdict_config"
+            "depparse" "dictionary" "download"
+            "ensemble" "example" "finetune" "install"
+            "langid" "lemmatizer" "long_paragraph" "long_tokens"
+            "model" "morphology" "multilingual" "mwt"
+            "pipeline_" "pretrain" "process_doc"
+            "read_snippets" "register" "reload"
+            "requirements" "resources" "retag"
+            "score" "semgrex" "serialized" "server_" "ssurgeon"
+            "tagger" "test_core" "test_one_sentence" "test_tokenizer"
+            "text_processing" "tokenize_files" "tokensregex"
+            "train_pipeline" "training" "tsurgeon")
+          " and not ")                  ;exclude tests requiring datasets
+         "stanza/tests")))
+    (native-inputs (list python-pytest
+                         python-setuptools
+                         python-transformers))
+    (propagated-inputs (list python-emoji
+                             python-networkx
+                             python-numpy
+                             python-protobuf
+                             python-pytorch
+                             python-requests
+                             python-tqdm))
+    (home-page "https://stanfordnlp.github.io/stanza/")
+    (synopsis "Stanford NLP Python library for many human languages")
+    (description
+     "Stanza is a collection of accurate and efficient tools
+for the linguistic analysis of many human languages.  Starting from raw text,
+Stanza divides it into sentences and words, and then can recognize
+parts of speech and entities, do syntactic analysis, and more.")
+    (license license:asl2.0)))
+
 (define-public onnx
   (package
     (name "onnx")
author	Nguyễn Gia Phong <cnx@loang.net>	2026-01-12 14:40:26 +0900
committer	Cayetano Santos <csantosb@inventati.org>	2026-01-31 09:54:48 +0100
commit	a2f5c702f17586932458c3f0321f527f346f9761 (patch)
tree	02db76e549a58ebfd7eb9f8d11338d55dd490f62 /gnu
parent	9f6f7fa0cb4e1036f10054c51e4803acf5220b8c (diff)