summaryrefslogtreecommitdiff
path: root/gnu
diff options
context:
space:
mode:
author: Nguyễn Gia Phong <cnx@loang.net> 2026-01-12 14:40:26 +0900
committer: Cayetano Santos <csantosb@inventati.org> 2026-01-31 09:54:48 +0100
commit: a2f5c702f17586932458c3f0321f527f346f9761 (patch)
tree: 02db76e549a58ebfd7eb9f8d11338d55dd490f62 /gnu
parent: 9f6f7fa0cb4e1036f10054c51e4803acf5220b8c (diff)
gnu: Add python-stanza.
* gnu/packages/machine-learning.scm (python-stanza): New variable.

Change-Id: I86369771db647c85d7a204ff8069d6e3670bb58b
Diffstat (limited to 'gnu')
-rw-r--r-- gnu/packages/machine-learning.scm 67
1 files changed, 67 insertions, 0 deletions
diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index a260c523b80..8481afc0e49 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -33,6 +33,7 @@
;;; Copyright © 2025 Cayetano Santos <csantosb@inventati.org>
;;; Copyright © 2025 Janneke Nieuwenhuizen <janneke@gnu.org>
;;; Copyright © 2025 Romain Garbage <romain.garbage@inria.fr>
+;;; Copyright © 2026 Nguyễn Gia Phong <cnx@loang.net>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -1867,6 +1868,72 @@ transformers like BERT, as well as a production-ready training system and easy
model packaging, deployment and workflow management.")
(license license:expat)))
+(define-public python-stanza
+ (package
+ (name "python-stanza")
+ (version "1.10.1")
+ (source
+ (origin
+ (method git-fetch)
+ (uri (git-reference
+ (url "https://github.com/stanfordnlp/stanza")
+ (commit (string-append "v" version)))) ;ref name is v followed by the version
+ (sha256
+ (base32 "0zcpzmbv0aafircl12m3x5999hxpg2hzm1xxv97pz09y4v589snj"))))
+ (build-system pyproject-build-system)
+ (arguments
+ (list
+ #:phases
+ #~(modify-phases %standard-phases
+ (add-before 'check 'set-up-check
+ (lambda _
+ ;; Replicate by hand only the fixture step of stanza/tests/setup.py;
+ ;; that script is not run because it downloads many datasets.
+ (mkdir-p "stanza_test/out") ;directory that receives the fixture
+ (copy-file "stanza/tests/data/example_french.json"
+ "stanza_test/out/example_french.json"))))
+ ;; Test summary recorded for this filter: 288 passed, 361 deselected, 1 warning.
+ #:test-flags
+ #~(list
+ "-k" ;the next argument is the test selection expression
+ (string-join ;yields: not CoreNLP and not EnglishPipeline and not ...
+ '("not CoreNLP" "EnglishPipeline" "FrenchPipeline"
+ "SentimentPipeline" "TestTrainer"
+ "amt_annotator" "arabic_pos" "bert"
+ "charlm" "conllu" "convert_units"
+ "data_objects" "defaultdict_config"
+ "depparse" "dictionary" "download"
+ "ensemble" "example" "finetune" "install"
+ "langid" "lemmatizer" "long_paragraph" "long_tokens"
+ "model" "morphology" "multilingual" "mwt"
+ "pipeline_" "pretrain" "process_doc"
+ "read_snippets" "register" "reload"
+ "requirements" "resources" "retag"
+ "score" "semgrex" "serialized" "server_" "ssurgeon"
+ "tagger" "test_core" "test_one_sentence" "test_tokenizer"
+ "text_processing" "tokenize_files" "tokensregex"
+ "train_pipeline" "training" "tsurgeon")
+ " and not ") ;every keyword above is excluded: those tests require datasets
+ "stanza/tests"))) ;path argument passed after the filter
+ (native-inputs (list python-pytest
+ python-setuptools
+ python-transformers))
+ (propagated-inputs (list python-emoji
+ python-networkx
+ python-numpy
+ python-protobuf
+ python-pytorch
+ python-requests
+ python-tqdm))
+ (home-page "https://stanfordnlp.github.io/stanza/")
+ (synopsis "Stanford NLP Python library for many human languages")
+ (description
+ "Stanza is a collection of accurate and efficient tools
+for the linguistic analysis of many human languages. Starting from raw text,
+Stanza divides it into sentences and words, and then can recognize
+parts of speech and entities, do syntactic analysis, and more.")
+ (license license:asl2.0)))
+
(define-public onnx
(package
(name "onnx")