From 44c7ae0492b26431890102d36e36378674a37df5 Mon Sep 17 00:00:00 2001 From: Jan Wijffels Date: Wed, 6 Dec 2017 23:34:46 +0100 Subject: [PATCH] remove udpipe_accuracy for the time being --- NAMESPACE | 1 - NEWS.md | 2 +- R/RcppExports.R | 4 --- R/udpipe_train.R | 58 +----------------------------------------- man/udpipe_accuracy.Rd | 55 --------------------------------------- man/udpipe_train.Rd | 2 +- src/RcppExports.cpp | 17 ------------- src/rcpp_udpipe.cpp | 44 -------------------------------- src/udpipe.cpp | 6 +++-- 9 files changed, 7 insertions(+), 182 deletions(-) delete mode 100644 man/udpipe_accuracy.Rd diff --git a/NAMESPACE b/NAMESPACE index 6ff2c7a..69c94b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,7 +33,6 @@ export(txt_previous) export(txt_recode) export(txt_sample) export(txt_show) -export(udpipe_accuracy) export(udpipe_annotate) export(udpipe_download_model) export(udpipe_load_model) diff --git a/NEWS.md b/NEWS.md index a8c0cf6..5ad8bab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,10 @@ # CHANGES IN udpipe VERSION 0.2.1 - Added phrases to extract POS sequences more easily like noun phrases, verb phrases or any sequence of parts of speech tags and their corresponding words -- Add udpipe_accuracy - Fix issue in txt_nextgram if n was larger than the number of elements in x - Fix heap-use-after-free address sanitiser issue - Fix runtime error: null pointer passed as argument 1, which is declared to never be null (e.g. udpipe.cpp: 3338) +- Another stab at the Solaris compilation issue # CHANGES IN udpipe VERSION 0.2 diff --git a/R/RcppExports.R b/R/RcppExports.R index e71ab4e..14fc0ed 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -13,10 +13,6 @@ udp_tokenise_tag_parse <- function(udmodel, x, docid, annotation_tokenizer, anno .Call('_udpipe_udp_tokenise_tag_parse', PACKAGE = 'udpipe', udmodel, x, docid, annotation_tokenizer, annotation_tagger, annotation_parser) } -udp_evaluate <- function(udmodel, conllu_test_file, output_file, annotation_tokenizer, annotation_tagger, annotation_parser) { - .Call('_udpipe_udp_evaluate', PACKAGE = 'udpipe', udmodel, conllu_test_file, output_file, annotation_tokenizer, annotation_tagger, annotation_parser) -} - na_locf <- function(x) { .Call('_udpipe_na_locf', PACKAGE = 'udpipe', x) } diff --git a/R/udpipe_train.R b/R/udpipe_train.R index 512b8dc..8fa27d4 100644 --- a/R/udpipe_train.R +++ b/R/udpipe_train.R @@ -38,7 +38,7 @@ #' \item{errors: }{Messages from the UDPipe process indicating possible errors for example when passing the wrong arguments to the #' annotation_tokenizer, annotation_tagger or annotation_parser} #' } -#' @seealso \code{\link{udpipe_annotation_params}}, \code{\link{udpipe_annotate}}, \code{\link{udpipe_load_model}}, \code{\link{udpipe_accuracy}} +#' @seealso \code{\link{udpipe_annotation_params}}, \code{\link{udpipe_annotate}}, \code{\link{udpipe_load_model}} #' @references \url{http://ufal.mff.cuni.cz/udpipe/users-manual} #' @details #' In order to train a model, you need to provide files which are in CONLL-U format in argument \code{files_conllu_training}. @@ -133,59 +133,3 @@ udpipe_train <- function(file = file.path(getwd(), "my_annotator.udpipe"), class = "udpipe_trained_model")) } - - - -#' @title Evaluate the accuracy of your UDPipe model on holdout data -#' @description Get precision, recall and F1 measures on finding words, sentences, -#' parts of speech tags (upos and xpos) and the grammatical annotated features. -#' For the accuracy of the dependency parsing, the output shows -#' the LAS (labeled attachment score - the percentage of predicted dependencies -#' where the arc and the label are assigned correctly) and -#' UAS (unlabeled attachment score – where the arc is assigned correctly) -#' dependency scores on holdout data in conllu format. -#' @param object an object of class \code{udpipe_model} as returned by \code{\link{udpipe_load_model}} -#' @param file_conllu the full path to a file on disk containing holdout data in conllu format -#' @param tokenizer a character string of length 1, which is either 'default' or 'none' -#' @param tagger a character string of length 1, which is either 'default' or 'none' -#' @param parser a character string of length 1, which is either 'default' or 'none' -#' @return a list with 3 elements -#' \itemize{ -#' \item{accuracy: }{A character vector with accuracy metrics.} -#' \item{error: }{A character string with possible errors when calculating the accuracy metrics} -#' } -#' @seealso \code{\link{udpipe_load_model}} -#' @references \url{https://ufal.mff.cuni.cz/udpipe}, -#' \url{http://universaldependencies.org/format.html} -#' @export -#' @examples -#' x <- udpipe_download_model(language = "dutch-lassysmall") -#' ud_dutch <- udpipe_load_model(x$file_model) -#' -#' file_conllu <- system.file(package = "udpipe", "dummydata", "traindata.conllu") -#' metrics <- udpipe_accuracy(ud_dutch, file_conllu, -#' tokenizer = "default", tagger = "none", parser = "none") -#' metrics$accuracy -#' -#' ## cleanup for CRAN only - you probably want to keep your model if you have downloaded it -#' file.remove("dutch-lassysmall-ud-2.0-170801.udpipe") -udpipe_accuracy <- function(object, - file_conllu, - tokenizer = c("default", "none"), - tagger = c("default", "none"), - parser = c("default", "none")) { - if(!inherits(object, "udpipe_model")){ - stop("object should be of class udpipe_model as returned by the function ?udpipe_load_model") - } - stopifnot(file.exists(file_conllu)) - tokenizer <- match.arg(tokenizer) - tagger <- match.arg(tagger) - parser <- match.arg(parser) - - f <- tempfile() - out <- udp_evaluate(object$model, file_conllu, f, tokenizer, tagger, parser) - out$accuracy <- readLines(f) - class(out) <- "udpipe_accuracy" - out -} - diff --git a/man/udpipe_accuracy.Rd b/man/udpipe_accuracy.Rd deleted file mode 100644 index 6ce7ef7..0000000 --- a/man/udpipe_accuracy.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/udpipe_train.R -\name{udpipe_accuracy} -\alias{udpipe_accuracy} -\title{Evaluate the accuracy of your UDPipe model on holdout data} -\usage{ -udpipe_accuracy(object, file_conllu, tokenizer = c("default", "none"), - tagger = c("default", "none"), parser = c("default", "none")) -} -\arguments{ -\item{object}{an object of class \code{udpipe_model} as returned by \code{\link{udpipe_load_model}}} - -\item{file_conllu}{the full path to a file on disk containing holdout data in conllu format} - -\item{tokenizer}{a character string of length 1, which is either 'default' or 'none'} - -\item{tagger}{a character string of length 1, which is either 'default' or 'none'} - -\item{parser}{a character string of length 1, which is either 'default' or 'none'} -} -\value{ -a list with 3 elements -\itemize{ - \item{accuracy: }{A character vector with accuracy metrics.} - \item{error: }{A character string with possible errors when calculating the accuracy metrics} -} -} -\description{ -Get precision, recall and F1 measures on finding words, sentences, -parts of speech tags (upos and xpos) and the grammatical annotated features. -For the accuracy of the dependency parsing, the output shows -the LAS (labeled attachment score - the percentage of predicted dependencies -where the arc and the label are assigned correctly) and -UAS (unlabeled attachment score – where the arc is assigned correctly) -dependency scores on holdout data in conllu format. -} -\examples{ -x <- udpipe_download_model(language = "dutch-lassysmall") -ud_dutch <- udpipe_load_model(x$file_model) - -file_conllu <- system.file(package = "udpipe", "dummydata", "traindata.conllu") -metrics <- udpipe_accuracy(ud_dutch, file_conllu, - tokenizer = "default", tagger = "none", parser = "none") -metrics$accuracy - -## cleanup for CRAN only - you probably want to keep your model if you have downloaded it -file.remove("dutch-lassysmall-ud-2.0-170801.udpipe") -} -\references{ -\url{https://ufal.mff.cuni.cz/udpipe}, -\url{http://universaldependencies.org/format.html} -} -\seealso{ -\code{\link{udpipe_load_model}} -} diff --git a/man/udpipe_train.Rd b/man/udpipe_train.Rd index d2bbbf5..d4f1f24 100644 --- a/man/udpipe_train.Rd +++ b/man/udpipe_train.Rd @@ -120,5 +120,5 @@ vignette("udpipe-train", package = "udpipe") \url{http://ufal.mff.cuni.cz/udpipe/users-manual} } \seealso{ -\code{\link{udpipe_annotation_params}}, \code{\link{udpipe_annotate}}, \code{\link{udpipe_load_model}}, \code{\link{udpipe_accuracy}} +\code{\link{udpipe_annotation_params}}, \code{\link{udpipe_annotate}}, \code{\link{udpipe_load_model}} } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 3d9b251..55cde0d 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -45,22 +45,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// udp_evaluate -Rcpp::List udp_evaluate(SEXP udmodel, Rcpp::CharacterVector conllu_test_file, Rcpp::CharacterVector output_file, std::string annotation_tokenizer, std::string annotation_tagger, std::string annotation_parser); -RcppExport SEXP _udpipe_udp_evaluate(SEXP udmodelSEXP, SEXP conllu_test_fileSEXP, SEXP output_fileSEXP, SEXP annotation_tokenizerSEXP, SEXP annotation_taggerSEXP, SEXP annotation_parserSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type udmodel(udmodelSEXP); - Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type conllu_test_file(conllu_test_fileSEXP); - Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type output_file(output_fileSEXP); - Rcpp::traits::input_parameter< std::string >::type annotation_tokenizer(annotation_tokenizerSEXP); - Rcpp::traits::input_parameter< std::string >::type annotation_tagger(annotation_taggerSEXP); - Rcpp::traits::input_parameter< std::string >::type annotation_parser(annotation_parserSEXP); - rcpp_result_gen = Rcpp::wrap(udp_evaluate(udmodel, conllu_test_file, output_file, annotation_tokenizer, annotation_tagger, annotation_parser)); - return rcpp_result_gen; -END_RCPP -} // na_locf Rcpp::CharacterVector na_locf(Rcpp::CharacterVector x); RcppExport SEXP _udpipe_na_locf(SEXP xSEXP) { @@ -93,7 +77,6 @@ static const R_CallMethodDef CallEntries[] = { {"_udpipe_phrases_regex_locate", (DL_FUNC) &_udpipe_phrases_regex_locate, 3}, {"_udpipe_udp_load_model", (DL_FUNC) &_udpipe_udp_load_model, 1}, {"_udpipe_udp_tokenise_tag_parse", (DL_FUNC) &_udpipe_udp_tokenise_tag_parse, 6}, - {"_udpipe_udp_evaluate", (DL_FUNC) &_udpipe_udp_evaluate, 6}, {"_udpipe_na_locf", (DL_FUNC) &_udpipe_na_locf, 1}, {"_udpipe_udp_train", (DL_FUNC) &_udpipe_udp_train, 6}, {NULL, NULL, 0} diff --git a/src/rcpp_udpipe.cpp b/src/rcpp_udpipe.cpp index 7d30ccd..169c102 100644 --- a/src/rcpp_udpipe.cpp +++ b/src/rcpp_udpipe.cpp @@ -71,50 +71,6 @@ Rcpp::List udp_tokenise_tag_parse(SEXP udmodel, Rcpp::StringVector x, Rcpp::Stri -// [[Rcpp::export]] -Rcpp::List udp_evaluate(SEXP udmodel, - Rcpp::CharacterVector conllu_test_file, - Rcpp::CharacterVector output_file, - std::string annotation_tokenizer, - std::string annotation_tagger, - std::string annotation_parser) { - Rcpp::XPtr languagemodel(udmodel); - - // Handle default and none input to tokenizer, tagger, parser - std::string pipeline_tokenizer = annotation_tokenizer; - std::string pipeline_tagger = annotation_tagger; - std::string pipeline_parser = annotation_parser; - if (pipeline_tagger.compare("none") == 0){ - pipeline_tagger = ufal::udpipe::pipeline::NONE; - }else if (pipeline_tagger.compare("default") == 0){ - pipeline_tagger = ufal::udpipe::pipeline::DEFAULT; - } - if (pipeline_parser.compare("none") == 0){ - pipeline_parser = ufal::udpipe::pipeline::NONE; - }else if (pipeline_parser.compare("default") == 0){ - pipeline_parser = ufal::udpipe::pipeline::DEFAULT; - } - - // Set up evaluator - ufal::udpipe::evaluator modelevaluator = ufal::udpipe::evaluator(languagemodel, pipeline_tokenizer, pipeline_tagger, pipeline_parser); - - // Input CONLLU filestream and output file containing the evaluation - std::string path; - path = conllu_test_file[0]; - std::ifstream infile(path.c_str()); - path = output_file[0]; - std::ofstream outfile(path.c_str()); - - // Evaluate the model - std::string error; - modelevaluator.evaluate(infile, outfile, error); - - // Return the file and the error - Rcpp::List output = Rcpp::List::create(Rcpp::Named("error") = error); - return output; -} - - // [[Rcpp::export]] Rcpp::CharacterVector na_locf(Rcpp::CharacterVector x) { int i; diff --git a/src/udpipe.cpp b/src/udpipe.cpp index 3cfb41f..4e1726a 100644 --- a/src/udpipe.cpp +++ b/src/udpipe.cpp @@ -19767,8 +19767,10 @@ void multiword_splitter::append_token(string_piece token, string_piece misc, sen } // Determine casing - enum casing_t { UC_FIRST, UC_ALL, OTHER }; - casing_t casing = OTHER; + int UC_FIRST=0, UC_ALL=1, OTHER=2; + int casing=OTHER; + //enum casing_t { UC_FIRST, UC_ALL, OTHER }; + //casing_t casing = OTHER; //enum { UC_FIRST, UC_ALL, OTHER } casing = OTHER; if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) {