Skip to content

Commit

Permalink
remove udpipe_accuracy for the time being
Browse files Browse the repository at this point in the history
  • Loading branch information
jwijffels committed Dec 6, 2017
1 parent dfbcb27 commit 44c7ae0
Show file tree
Hide file tree
Showing 9 changed files with 7 additions and 182 deletions.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ export(txt_previous)
export(txt_recode)
export(txt_sample)
export(txt_show)
export(udpipe_accuracy)
export(udpipe_annotate)
export(udpipe_download_model)
export(udpipe_load_model)
Expand Down
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# CHANGES IN udpipe VERSION 0.2.1

- Added phrases to more easily extract POS sequences such as noun phrases, verb phrases, or any sequence of part-of-speech tags and their corresponding words
- Add udpipe_accuracy
- Fix issue in txt_nextgram if n was larger than the number of elements in x
- Fix heap-use-after-free address sanitiser issue
- Fix runtime error: null pointer passed as argument 1, which is declared to never be null (e.g. udpipe.cpp: 3338)
- Another stab at the Solaris compilation issue


# CHANGES IN udpipe VERSION 0.2
Expand Down
4 changes: 0 additions & 4 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ udp_tokenise_tag_parse <- function(udmodel, x, docid, annotation_tokenizer, anno
.Call('_udpipe_udp_tokenise_tag_parse', PACKAGE = 'udpipe', udmodel, x, docid, annotation_tokenizer, annotation_tagger, annotation_parser)
}

# Thin R-side wrapper: hands every argument, unchanged and in order, to the
# compiled routine '_udpipe_udp_evaluate' of the 'udpipe' package.
udp_evaluate <- function(udmodel, conllu_test_file, output_file, annotation_tokenizer, annotation_tagger, annotation_parser) {
  .Call(
    '_udpipe_udp_evaluate',
    PACKAGE = 'udpipe',
    udmodel,
    conllu_test_file,
    output_file,
    annotation_tokenizer,
    annotation_tagger,
    annotation_parser
  )
}

# Thin R-side wrapper: passes `x` unchanged to the compiled routine
# '_udpipe_na_locf' of the 'udpipe' package and returns its result.
na_locf <- function(x) {
  .Call(
    '_udpipe_na_locf',
    PACKAGE = 'udpipe',
    x
  )
}
Expand Down
58 changes: 1 addition & 57 deletions R/udpipe_train.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
#' \item{errors: }{Messages from the UDPipe process indicating possible errors for example when passing the wrong arguments to the
#' annotation_tokenizer, annotation_tagger or annotation_parser}
#' }
#' @seealso \code{\link{udpipe_annotation_params}}, \code{\link{udpipe_annotate}}, \code{\link{udpipe_load_model}}, \code{\link{udpipe_accuracy}}
#' @seealso \code{\link{udpipe_annotation_params}}, \code{\link{udpipe_annotate}}, \code{\link{udpipe_load_model}}
#' @references \url{http://ufal.mff.cuni.cz/udpipe/users-manual}
#' @details
#' In order to train a model, you need to provide files which are in CONLL-U format in argument \code{files_conllu_training}.
Expand Down Expand Up @@ -133,59 +133,3 @@ udpipe_train <- function(file = file.path(getwd(), "my_annotator.udpipe"),
class = "udpipe_trained_model"))
}




#' @title Evaluate the accuracy of your UDPipe model on holdout data
#' @description Get precision, recall and F1 measures on finding words, sentences,
#' parts of speech tags (upos and xpos) and the grammatical annotated features.
#' For the accuracy of the dependency parsing, the output shows
#' the LAS (labeled attachment score - the percentage of predicted dependencies
#' where the arc and the label are assigned correctly) and
#' UAS (unlabeled attachment score - where the arc is assigned correctly)
#' dependency scores on holdout data in conllu format.
#' @param object an object of class \code{udpipe_model} as returned by \code{\link{udpipe_load_model}}
#' @param file_conllu the full path to a file on disk containing holdout data in conllu format.
#' A leading \code{~} is expanded before the path is handed to the compiled code.
#' @param tokenizer a character string of length 1, which is either 'default' or 'none'
#' @param tagger a character string of length 1, which is either 'default' or 'none'
#' @param parser a character string of length 1, which is either 'default' or 'none'
#' @return a list of class \code{udpipe_accuracy} with 2 elements
#' \itemize{
#'  \item{accuracy: }{A character vector with accuracy metrics.}
#'  \item{error: }{A character string with possible errors when calculating the accuracy metrics}
#' }
#' @seealso \code{\link{udpipe_load_model}}
#' @references \url{https://ufal.mff.cuni.cz/udpipe},
#' \url{http://universaldependencies.org/format.html}
#' @export
#' @examples
#' x <- udpipe_download_model(language = "dutch-lassysmall")
#' ud_dutch <- udpipe_load_model(x$file_model)
#'
#' file_conllu <- system.file(package = "udpipe", "dummydata", "traindata.conllu")
#' metrics <- udpipe_accuracy(ud_dutch, file_conllu,
#'                            tokenizer = "default", tagger = "none", parser = "none")
#' metrics$accuracy
#'
#' ## cleanup for CRAN only - you probably want to keep your model if you have downloaded it
#' file.remove(x$file_model)
udpipe_accuracy <- function(object,
                            file_conllu,
                            tokenizer = c("default", "none"),
                            tagger = c("default", "none"),
                            parser = c("default", "none")) {
  if (!inherits(object, "udpipe_model")) {
    stop("object should be of class udpipe_model as returned by the function ?udpipe_load_model")
  }
  stopifnot(file.exists(file_conllu))
  tokenizer <- match.arg(tokenizer)
  tagger <- match.arg(tagger)
  parser <- match.arg(parser)

  # file.exists() accepts paths starting with '~', but the compiled code opens
  # the file with a plain C++ stream which does no tilde expansion.
  file_conllu <- path.expand(file_conllu)

  # The evaluator writes its report to a scratch file; make sure that file is
  # removed again, also when the evaluation or the read below fails.
  f <- tempfile()
  on.exit(unlink(f), add = TRUE)

  out <- udp_evaluate(object$model, file_conllu, f, tokenizer, tagger, parser)
  out$accuracy <- readLines(f)
  class(out) <- "udpipe_accuracy"
  out
}

55 changes: 0 additions & 55 deletions man/udpipe_accuracy.Rd

This file was deleted.

2 changes: 1 addition & 1 deletion man/udpipe_train.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 0 additions & 17 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,22 +45,6 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// udp_evaluate
// Declaration of the C++ implementation that the SEXP-level wrapper below calls.
Rcpp::List udp_evaluate(SEXP udmodel, Rcpp::CharacterVector conllu_test_file, Rcpp::CharacterVector output_file, std::string annotation_tokenizer, std::string annotation_tagger, std::string annotation_parser);
// SEXP-in / SEXP-out wrapper around udp_evaluate. The R side reaches it by
// name through .Call('_udpipe_udp_evaluate', ...). Each incoming SEXP is
// converted to the C++ parameter type of udp_evaluate, the function is called,
// and its result is wrapped back into an R object.
// NOTE(review): this looks like generated Rcpp glue code - confirm before
// editing by hand, as regenerating would overwrite manual changes.
RcppExport SEXP _udpipe_udp_evaluate(SEXP udmodelSEXP, SEXP conllu_test_fileSEXP, SEXP output_fileSEXP, SEXP annotation_tokenizerSEXP, SEXP annotation_taggerSEXP, SEXP annotation_parserSEXP) {
// BEGIN_RCPP / END_RCPP are Rcpp macros that bracket the body; presumably they
// translate C++ exceptions into R errors - see the Rcpp documentation.
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
// Convert every SEXP argument to the type expected by udp_evaluate.
Rcpp::traits::input_parameter< SEXP >::type udmodel(udmodelSEXP);
Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type conllu_test_file(conllu_test_fileSEXP);
Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type output_file(output_fileSEXP);
Rcpp::traits::input_parameter< std::string >::type annotation_tokenizer(annotation_tokenizerSEXP);
Rcpp::traits::input_parameter< std::string >::type annotation_tagger(annotation_taggerSEXP);
Rcpp::traits::input_parameter< std::string >::type annotation_parser(annotation_parserSEXP);
// Call the implementation and wrap the returned Rcpp::List for R.
rcpp_result_gen = Rcpp::wrap(udp_evaluate(udmodel, conllu_test_file, output_file, annotation_tokenizer, annotation_tagger, annotation_parser));
return rcpp_result_gen;
END_RCPP
}
// na_locf
Rcpp::CharacterVector na_locf(Rcpp::CharacterVector x);
RcppExport SEXP _udpipe_na_locf(SEXP xSEXP) {
Expand Down Expand Up @@ -93,7 +77,6 @@ static const R_CallMethodDef CallEntries[] = {
{"_udpipe_phrases_regex_locate", (DL_FUNC) &_udpipe_phrases_regex_locate, 3},
{"_udpipe_udp_load_model", (DL_FUNC) &_udpipe_udp_load_model, 1},
{"_udpipe_udp_tokenise_tag_parse", (DL_FUNC) &_udpipe_udp_tokenise_tag_parse, 6},
{"_udpipe_udp_evaluate", (DL_FUNC) &_udpipe_udp_evaluate, 6},
{"_udpipe_na_locf", (DL_FUNC) &_udpipe_na_locf, 1},
{"_udpipe_udp_train", (DL_FUNC) &_udpipe_udp_train, 6},
{NULL, NULL, 0}
Expand Down
44 changes: 0 additions & 44 deletions src/rcpp_udpipe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,50 +71,6 @@ Rcpp::List udp_tokenise_tag_parse(SEXP udmodel, Rcpp::StringVector x, Rcpp::Stri



// [[Rcpp::export]]
Rcpp::List udp_evaluate(SEXP udmodel,
Rcpp::CharacterVector conllu_test_file,
Rcpp::CharacterVector output_file,
std::string annotation_tokenizer,
std::string annotation_tagger,
std::string annotation_parser) {
Rcpp::XPtr<ufal::udpipe::model> languagemodel(udmodel);

// Handle default and none input to tokenizer, tagger, parser
std::string pipeline_tokenizer = annotation_tokenizer;
std::string pipeline_tagger = annotation_tagger;
std::string pipeline_parser = annotation_parser;
if (pipeline_tagger.compare("none") == 0){
pipeline_tagger = ufal::udpipe::pipeline::NONE;
}else if (pipeline_tagger.compare("default") == 0){
pipeline_tagger = ufal::udpipe::pipeline::DEFAULT;
}
if (pipeline_parser.compare("none") == 0){
pipeline_parser = ufal::udpipe::pipeline::NONE;
}else if (pipeline_parser.compare("default") == 0){
pipeline_parser = ufal::udpipe::pipeline::DEFAULT;
}

// Set up evaluator
ufal::udpipe::evaluator modelevaluator = ufal::udpipe::evaluator(languagemodel, pipeline_tokenizer, pipeline_tagger, pipeline_parser);

// Input CONLLU filestream and output file containing the evaluation
std::string path;
path = conllu_test_file[0];
std::ifstream infile(path.c_str());
path = output_file[0];
std::ofstream outfile(path.c_str());

// Evaluate the model
std::string error;
modelevaluator.evaluate(infile, outfile, error);

// Return the file and the error
Rcpp::List output = Rcpp::List::create(Rcpp::Named("error") = error);
return output;
}


// [[Rcpp::export]]
Rcpp::CharacterVector na_locf(Rcpp::CharacterVector x) {
int i;
Expand Down
6 changes: 4 additions & 2 deletions src/udpipe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19767,8 +19767,10 @@ void multiword_splitter::append_token(string_piece token, string_piece misc, sen
}

// Determine casing
enum casing_t { UC_FIRST, UC_ALL, OTHER };
casing_t casing = OTHER;
int UC_FIRST=0, UC_ALL=1, OTHER=2;
int casing=OTHER;
//enum casing_t { UC_FIRST, UC_ALL, OTHER };
//casing_t casing = OTHER;
//enum { UC_FIRST, UC_ALL, OTHER } casing = OTHER;

if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) {
Expand Down

0 comments on commit 44c7ae0

Please sign in to comment.