diff --git a/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/config.json b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/config.json new file mode 100644 index 0000000..17d7138 --- /dev/null +++ b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/config.json @@ -0,0 +1,148 @@ +{ + "model_name": "grobid-header-article-light-ref-BidLSTM_CRF_FEATURES", + "architecture": "BidLSTM_CRF_FEATURES", + "embeddings_name": "glove-840B", + "char_vocab_size": 334, + "case_vocab_size": 8, + "char_embedding_size": 25, + "num_char_lstm_units": 25, + "max_char_length": 30, + "features_vocabulary_size": 12, + "features_indices": [ + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ], + "features_embedding_size": 4, + "features_lstm_units": 4, + "max_sequence_length": 3000, + "word_embedding_size": 300, + "num_word_lstm_units": 100, + "case_embedding_size": 5, + "dropout": 0.5, + "recurrent_dropout": 0.5, + "use_crf": true, + "use_chain_crf": false, + "fold_number": 1, + "batch_size": 20, + "transformer_name": null, + "use_ELMo": false, + "features_map_to_index": { + "9": { + "BLOCKEND": 1, + "BLOCKIN": 2, + "BLOCKSTART": 3 + }, + "10": { + "LINEEND": 13, + "LINEIN": 14, + "LINESTART": 15 + }, + "11": { + "ALIGNEDLEFT": 25, + "LINEINDENT": 26 + }, + "12": { + "NEWFONT": 37, + "SAMEFONT": 38 + }, + "13": { + "HIGHERFONT": 49, + "LOWERFONT": 50, + "SAMEFONTSIZE": 51 + }, + "14": { + "0": 61, + "1": 62 + }, + "15": { + "0": 73, + "1": 74 + }, + "16": { + "ALLCAP": 85, + "INITCAP": 86, + "NOCAPS": 87 + }, + "17": { + "ALLDIGIT": 97, + "CONTAINSDIGITS": 98, + "NODIGIT": 99 + }, + "18": { + "0": 109, + "1": 110 + }, + "19": { + "0": 121, + "1": 122 + }, + "20": { + "0": 133, + "1": 134 + }, + "21": { + "0": 145, + "1": 146 + }, + "22": { + "0": 157, + "1": 158 + }, + "23": { + "0": 169, + "1": 170 + }, + 
"24": { + "0": 181, + "1": 182 + }, + "25": { + "0": 193, + "1": 194 + }, + "26": { + "COMMA": 205, + "DOT": 206, + "ENDBRACKET": 207, + "HYPHEN": 208, + "NOPUNCT": 209, + "OPENBRACKET": 210, + "PUNCT": 211, + "QUOTE": 212 + }, + "27": { + "0": 217, + "1": 218 + }, + "28": { + "0": 229 + }, + "29": { + "0": 241, + "1": 242 + }, + "30": { + "0": 253 + } + } +} \ No newline at end of file diff --git a/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/model_weights.hdf5 b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/model_weights.hdf5 new file mode 100644 index 0000000..ffdc76c Binary files /dev/null and b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/model_weights.hdf5 differ diff --git a/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/preprocessor.json b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/preprocessor.json new file mode 100644 index 0000000..3787bd1 --- /dev/null +++ b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_CRF_FEATURES/preprocessor.json @@ -0,0 +1,506 @@ +{ + "padding": true, + "return_lengths": true, + "return_word_embeddings": true, + "return_casing": false, + "return_features": true, + "return_chars": true, + "return_bert_embeddings": false, + "vocab_char": { + "<PAD>": 0, + "<UNK>": 1, + "!": 2, + "\"": 3, + "#": 4, + "$": 5, + "%": 6, + "&": 7, + "'": 8, + "(": 9, + ")": 10, + "*": 11, + "+": 12, + ",": 13, + "-": 14, + ".": 15, + "/": 16, + "0": 17, + "1": 18, + "2": 19, + "3": 20, + "4": 21, + "5": 22, + "6": 23, + "7": 24, + "8": 25, + "9": 26, + ":": 27, + ";": 28, + "<": 29, + "=": 30, + ">": 31, + "?": 32, + "@": 33, + "A": 34, + "B": 35, + "C": 36, + "D": 37, + "E": 38, + "F": 39, + "G": 40, + "H": 41, + "I": 42, + "J": 43, + "K": 44, + "L": 45, + "M": 46, + "N": 47, + "O": 48, + "P": 49, + "Q": 50, + "R": 51, + "S": 52, + "T": 53, + "U": 54, + 
"V": 55, + "W": 56, + "X": 57, + "Y": 58, + "Z": 59, + "[": 60, + "\\": 61, + "]": 62, + "^": 63, + "_": 64, + "`": 65, + "a": 66, + "b": 67, + "c": 68, + "d": 69, + "e": 70, + "f": 71, + "g": 72, + "h": 73, + "i": 74, + "j": 75, + "k": 76, + "l": 77, + "m": 78, + "n": 79, + "o": 80, + "p": 81, + "q": 82, + "r": 83, + "s": 84, + "t": 85, + "u": 86, + "v": 87, + "w": 88, + "x": 89, + "y": 90, + "z": 91, + "{": 92, + "|": 93, + "}": 94, + "~": 95, + "\u00a1": 96, + "\u00a2": 97, + "\u00a3": 98, + "\u00a4": 99, + "\u00a5": 100, + "\u00a7": 101, + "\u00a8": 102, + "\u00a9": 103, + "\u00aa": 104, + "\u00ab": 105, + "\u00ae": 106, + "\u00af": 107, + "\u00b0": 108, + "\u00b1": 109, + "\u00b2": 110, + "\u00b4": 111, + "\u00b5": 112, + "\u00b6": 113, + "\u00b8": 114, + "\u00bb": 115, + "\u00bc": 116, + "\u00c0": 117, + "\u00c1": 118, + "\u00c2": 119, + "\u00c3": 120, + "\u00c5": 121, + "\u00c7": 122, + "\u00c9": 123, + "\u00ca": 124, + "\u00cc": 125, + "\u00ce": 126, + "\u00d3": 127, + "\u00d5": 128, + "\u00d6": 129, + "\u00d7": 130, + "\u00d8": 131, + "\u00df": 132, + "\u00e0": 133, + "\u00e1": 134, + "\u00e2": 135, + "\u00e3": 136, + "\u00e4": 137, + "\u00e5": 138, + "\u00e7": 139, + "\u00e8": 140, + "\u00e9": 141, + "\u00ea": 142, + "\u00eb": 143, + "\u00ed": 144, + "\u00ee": 145, + "\u00ef": 146, + "\u00f1": 147, + "\u00f2": 148, + "\u00f3": 149, + "\u00f4": 150, + "\u00f5": 151, + "\u00f6": 152, + "\u00f8": 153, + "\u00fa": 154, + "\u00fb": 155, + "\u00fc": 156, + "\u00fd": 157, + "\u00fe": 158, + "\u00ff": 159, + "\u0100": 160, + "\u0103": 161, + "\u0104": 162, + "\u0107": 163, + "\u010d": 164, + "\u0117": 165, + "\u0119": 166, + "\u0130": 167, + "\u0131": 168, + "\u0141": 169, + "\u0142": 170, + "\u0144": 171, + "\u014c": 172, + "\u0159": 173, + "\u015f": 174, + "\u0160": 175, + "\u0161": 176, + "\u0179": 177, + "\u017a": 178, + "\u017c": 179, + "\u017d": 180, + "\u017e": 181, + "\u0202": 182, + "\u0288": 183, + "\u02d9": 184, + "\u0351": 185, + "\u0352": 186, + 
"\u0353": 187, + "\u0354": 188, + "\u0393": 189, + "\u0394": 190, + "\u0398": 191, + "\u039b": 192, + "\u039e": 193, + "\u03a0": 194, + "\u03a3": 195, + "\u03a5": 196, + "\u03a8": 197, + "\u03b1": 198, + "\u03b2": 199, + "\u03b3": 200, + "\u03b4": 201, + "\u03b5": 202, + "\u03b6": 203, + "\u03b7": 204, + "\u03b8": 205, + "\u03ba": 206, + "\u03bb": 207, + "\u03bc": 208, + "\u03bd": 209, + "\u03be": 210, + "\u03c0": 211, + "\u03c1": 212, + "\u03c3": 213, + "\u03c4": 214, + "\u03c6": 215, + "\u03c7": 216, + "\u03c8": 217, + "\u03c9": 218, + "\u03d5": 219, + "\u03e9": 220, + "\u03ea": 221, + "\u03eb": 222, + "\u03ed": 223, + "\u03f3": 224, + "\u03fd": 225, + "\u03fe": 226, + "\u0408": 227, + "\u0409": 228, + "\u0411": 229, + "\u0418": 230, + "\u041b": 231, + "\u041f": 232, + "\u0424": 233, + "\u0426": 234, + "\u0431": 235, + "\u0432": 236, + "\u0433": 237, + "\u0434": 238, + "\u0437": 239, + "\u0438": 240, + "\u0439": 241, + "\u043a": 242, + "\u043b": 243, + "\u043c": 244, + "\u043d": 245, + "\u043f": 246, + "\u0440": 247, + "\u0441": 248, + "\u0442": 249, + "\u0443": 250, + "\u0444": 251, + "\u0445": 252, + "\u0446": 253, + "\u0447": 254, + "\u0448": 255, + "\u0449": 256, + "\u044b": 257, + "\u044c": 258, + "\u044d": 259, + "\u044e": 260, + "\u044f": 261, + "\u0451": 262, + "\u0546": 263, + "\u060a": 264, + "\u060d": 265, + "\u065e": 266, + "\u0728": 267, + "\u0846": 268, + "\u0be6": 269, + "\u0be7": 270, + "\u1c14": 271, + "\u1e46": 272, + "\u2020": 273, + "\u2021": 274, + "\u2022": 275, + "\u202b": 276, + "\u202c": 277, + "\u2032": 278, + "\u2039": 279, + "\u204e": 280, + "\u2113": 281, + "\u2122": 282, + "\u2192": 283, + "\u2193": 284, + "\u21b5": 285, + "\u21d1": 286, + "\u21e4": 287, + "\u2206": 288, + "\u2208": 289, + "\u2212": 290, + "\u2213": 291, + "\u221a": 292, + "\u221e": 293, + "\u223c": 294, + "\u2243": 295, + "\u2248": 296, + "\u2264": 297, + "\u2265": 298, + "\u2299": 299, + "\u22c5": 300, + "\u22c6": 301, + "\u232c": 302, + "\u2423": 303, + "\u2424": 
304, + "\u24d2": 305, + "\u25a1": 306, + "\u262f": 307, + "\u2663": 308, + "\u2666": 309, + "\u2709": 310, + "\u271d": 311, + "\u2e38": 312, + "\u318d": 313, + "\uf061": 314, + "\uf067": 315, + "\uf761": 316, + "\uf764": 317, + "\uf765": 318, + "\uf767": 319, + "\uf769": 320, + "\uf76b": 321, + "\uf76c": 322, + "\uf76e": 323, + "\uf76f": 324, + "\uf770": 325, + "\uf772": 326, + "\uf773": 327, + "\uf774": 328, + "\uf777": 329, + "\uf779": 330, + "\uf8e9": 331, + "\uff1b": 332, + "\ufffd": 333 + }, + "vocab_tag": { + "<PAD>": 0, + "B-<author>": 1, + "B-<date>": 2, + "B-<pubnum>": 3, + "B-<title>": 4, + "I-<author>": 5, + "I-<date>": 6, + "I-<pubnum>": 7, + "I-<title>": 8, + "O": 9 + }, + "vocab_case": [ + "<PAD>", + "numeric", + "allLower", + "allUpper", + "initialUpper", + "other", + "mainly_numeric", + "contains_digit" + ], + "max_char_length": 30, + "feature_preprocessor": { + "features_vocabulary_size": 12, + "features_indices": [ + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ], + "features_map_to_index": { + "9": { + "BLOCKEND": 1, + "BLOCKIN": 2, + "BLOCKSTART": 3 + }, + "10": { + "LINEEND": 13, + "LINEIN": 14, + "LINESTART": 15 + }, + "11": { + "ALIGNEDLEFT": 25, + "LINEINDENT": 26 + }, + "12": { + "NEWFONT": 37, + "SAMEFONT": 38 + }, + "13": { + "HIGHERFONT": 49, + "LOWERFONT": 50, + "SAMEFONTSIZE": 51 + }, + "14": { + "0": 61, + "1": 62 + }, + "15": { + "0": 73, + "1": 74 + }, + "16": { + "ALLCAP": 85, + "INITCAP": 86, + "NOCAPS": 87 + }, + "17": { + "ALLDIGIT": 97, + "CONTAINSDIGITS": 98, + "NODIGIT": 99 + }, + "18": { + "0": 109, + "1": 110 + }, + "19": { + "0": 121, + "1": 122 + }, + "20": { + "0": 133, + "1": 134 + }, + "21": { + "0": 145, + "1": 146 + }, + "22": { + "0": 157, + "1": 158 + }, + "23": { + "0": 169, + "1": 170 + }, + "24": { + "0": 181, + "1": 182 + }, + "25": { + "0": 193, + "1": 194 + }, + "26": { + "COMMA": 205, + "DOT": 206, + "ENDBRACKET": 207, + "HYPHEN": 208, + "NOPUNCT": 209, + 
"OPENBRACKET": 210, + "PUNCT": 211, + "QUOTE": 212 + }, + "27": { + "0": 217, + "1": 218 + }, + "28": { + "0": 229 + }, + "29": { + "0": 241, + "1": 242 + }, + "30": { + "0": 253 + } + } + }, + "indice_tag": { + "0": "<PAD>", + "1": "B-<author>", + "2": "B-<date>", + "3": "B-<pubnum>", + "4": "B-<title>", + "5": "I-<author>", + "6": "I-<date>", + "7": "I-<pubnum>", + "8": "I-<title>", + "9": "O" + } +} \ No newline at end of file diff --git a/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/config.json b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/config.json new file mode 100644 index 0000000..4acd6c2 --- /dev/null +++ b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/config.json @@ -0,0 +1,148 @@ +{ + "model_name": "grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES", + "architecture": "BidLSTM_ChainCRF_FEATURES", + "embeddings_name": "glove-840B", + "char_vocab_size": 334, + "case_vocab_size": 8, + "char_embedding_size": 25, + "num_char_lstm_units": 25, + "max_char_length": 30, + "features_vocabulary_size": 12, + "features_indices": [ + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ], + "features_embedding_size": 4, + "features_lstm_units": 4, + "max_sequence_length": 3000, + "word_embedding_size": 300, + "num_word_lstm_units": 100, + "case_embedding_size": 5, + "dropout": 0.5, + "recurrent_dropout": 0.5, + "use_crf": true, + "use_chain_crf": true, + "fold_number": 1, + "batch_size": 20, + "transformer_name": null, + "use_ELMo": false, + "features_map_to_index": { + "9": { + "BLOCKEND": 1, + "BLOCKIN": 2, + "BLOCKSTART": 3 + }, + "10": { + "LINEEND": 13, + "LINEIN": 14, + "LINESTART": 15 + }, + "11": { + "ALIGNEDLEFT": 25, + "LINEINDENT": 26 + }, + "12": { + "NEWFONT": 37, + "SAMEFONT": 38 + }, + "13": { + "HIGHERFONT": 49, + "LOWERFONT": 50, + 
"SAMEFONTSIZE": 51 + }, + "14": { + "0": 61, + "1": 62 + }, + "15": { + "0": 73, + "1": 74 + }, + "16": { + "ALLCAP": 85, + "INITCAP": 86, + "NOCAPS": 87 + }, + "17": { + "ALLDIGIT": 97, + "CONTAINSDIGITS": 98, + "NODIGIT": 99 + }, + "18": { + "0": 109, + "1": 110 + }, + "19": { + "0": 121, + "1": 122 + }, + "20": { + "0": 133, + "1": 134 + }, + "21": { + "0": 145, + "1": 146 + }, + "22": { + "0": 157, + "1": 158 + }, + "23": { + "0": 169, + "1": 170 + }, + "24": { + "0": 181, + "1": 182 + }, + "25": { + "0": 193, + "1": 194 + }, + "26": { + "COMMA": 205, + "DOT": 206, + "ENDBRACKET": 207, + "HYPHEN": 208, + "NOPUNCT": 209, + "OPENBRACKET": 210, + "PUNCT": 211, + "QUOTE": 212 + }, + "27": { + "0": 217, + "1": 218 + }, + "28": { + "0": 229 + }, + "29": { + "0": 241, + "1": 242 + }, + "30": { + "0": 253 + } + } +} \ No newline at end of file diff --git a/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/model_weights.hdf5 b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/model_weights.hdf5 new file mode 100644 index 0000000..d495ebc Binary files /dev/null and b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/model_weights.hdf5 differ diff --git a/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/preprocessor.json b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/preprocessor.json new file mode 100644 index 0000000..3787bd1 --- /dev/null +++ b/data/models/sequenceLabelling/grobid-header-article-light-ref-BidLSTM_ChainCRF_FEATURES/preprocessor.json @@ -0,0 +1,506 @@ +{ + "padding": true, + "return_lengths": true, + "return_word_embeddings": true, + "return_casing": false, + "return_features": true, + "return_chars": true, + "return_bert_embeddings": false, + "vocab_char": { + "<PAD>": 0, + "<UNK>": 1, + "!": 2, + "\"": 3, + "#": 4, + "$": 5, + "%": 6, + "&": 7, + "'": 8, + 
"(": 9, + ")": 10, + "*": 11, + "+": 12, + ",": 13, + "-": 14, + ".": 15, + "/": 16, + "0": 17, + "1": 18, + "2": 19, + "3": 20, + "4": 21, + "5": 22, + "6": 23, + "7": 24, + "8": 25, + "9": 26, + ":": 27, + ";": 28, + "<": 29, + "=": 30, + ">": 31, + "?": 32, + "@": 33, + "A": 34, + "B": 35, + "C": 36, + "D": 37, + "E": 38, + "F": 39, + "G": 40, + "H": 41, + "I": 42, + "J": 43, + "K": 44, + "L": 45, + "M": 46, + "N": 47, + "O": 48, + "P": 49, + "Q": 50, + "R": 51, + "S": 52, + "T": 53, + "U": 54, + "V": 55, + "W": 56, + "X": 57, + "Y": 58, + "Z": 59, + "[": 60, + "\\": 61, + "]": 62, + "^": 63, + "_": 64, + "`": 65, + "a": 66, + "b": 67, + "c": 68, + "d": 69, + "e": 70, + "f": 71, + "g": 72, + "h": 73, + "i": 74, + "j": 75, + "k": 76, + "l": 77, + "m": 78, + "n": 79, + "o": 80, + "p": 81, + "q": 82, + "r": 83, + "s": 84, + "t": 85, + "u": 86, + "v": 87, + "w": 88, + "x": 89, + "y": 90, + "z": 91, + "{": 92, + "|": 93, + "}": 94, + "~": 95, + "\u00a1": 96, + "\u00a2": 97, + "\u00a3": 98, + "\u00a4": 99, + "\u00a5": 100, + "\u00a7": 101, + "\u00a8": 102, + "\u00a9": 103, + "\u00aa": 104, + "\u00ab": 105, + "\u00ae": 106, + "\u00af": 107, + "\u00b0": 108, + "\u00b1": 109, + "\u00b2": 110, + "\u00b4": 111, + "\u00b5": 112, + "\u00b6": 113, + "\u00b8": 114, + "\u00bb": 115, + "\u00bc": 116, + "\u00c0": 117, + "\u00c1": 118, + "\u00c2": 119, + "\u00c3": 120, + "\u00c5": 121, + "\u00c7": 122, + "\u00c9": 123, + "\u00ca": 124, + "\u00cc": 125, + "\u00ce": 126, + "\u00d3": 127, + "\u00d5": 128, + "\u00d6": 129, + "\u00d7": 130, + "\u00d8": 131, + "\u00df": 132, + "\u00e0": 133, + "\u00e1": 134, + "\u00e2": 135, + "\u00e3": 136, + "\u00e4": 137, + "\u00e5": 138, + "\u00e7": 139, + "\u00e8": 140, + "\u00e9": 141, + "\u00ea": 142, + "\u00eb": 143, + "\u00ed": 144, + "\u00ee": 145, + "\u00ef": 146, + "\u00f1": 147, + "\u00f2": 148, + "\u00f3": 149, + "\u00f4": 150, + "\u00f5": 151, + "\u00f6": 152, + "\u00f8": 153, + "\u00fa": 154, + "\u00fb": 155, + "\u00fc": 156, + "\u00fd": 
157, + "\u00fe": 158, + "\u00ff": 159, + "\u0100": 160, + "\u0103": 161, + "\u0104": 162, + "\u0107": 163, + "\u010d": 164, + "\u0117": 165, + "\u0119": 166, + "\u0130": 167, + "\u0131": 168, + "\u0141": 169, + "\u0142": 170, + "\u0144": 171, + "\u014c": 172, + "\u0159": 173, + "\u015f": 174, + "\u0160": 175, + "\u0161": 176, + "\u0179": 177, + "\u017a": 178, + "\u017c": 179, + "\u017d": 180, + "\u017e": 181, + "\u0202": 182, + "\u0288": 183, + "\u02d9": 184, + "\u0351": 185, + "\u0352": 186, + "\u0353": 187, + "\u0354": 188, + "\u0393": 189, + "\u0394": 190, + "\u0398": 191, + "\u039b": 192, + "\u039e": 193, + "\u03a0": 194, + "\u03a3": 195, + "\u03a5": 196, + "\u03a8": 197, + "\u03b1": 198, + "\u03b2": 199, + "\u03b3": 200, + "\u03b4": 201, + "\u03b5": 202, + "\u03b6": 203, + "\u03b7": 204, + "\u03b8": 205, + "\u03ba": 206, + "\u03bb": 207, + "\u03bc": 208, + "\u03bd": 209, + "\u03be": 210, + "\u03c0": 211, + "\u03c1": 212, + "\u03c3": 213, + "\u03c4": 214, + "\u03c6": 215, + "\u03c7": 216, + "\u03c8": 217, + "\u03c9": 218, + "\u03d5": 219, + "\u03e9": 220, + "\u03ea": 221, + "\u03eb": 222, + "\u03ed": 223, + "\u03f3": 224, + "\u03fd": 225, + "\u03fe": 226, + "\u0408": 227, + "\u0409": 228, + "\u0411": 229, + "\u0418": 230, + "\u041b": 231, + "\u041f": 232, + "\u0424": 233, + "\u0426": 234, + "\u0431": 235, + "\u0432": 236, + "\u0433": 237, + "\u0434": 238, + "\u0437": 239, + "\u0438": 240, + "\u0439": 241, + "\u043a": 242, + "\u043b": 243, + "\u043c": 244, + "\u043d": 245, + "\u043f": 246, + "\u0440": 247, + "\u0441": 248, + "\u0442": 249, + "\u0443": 250, + "\u0444": 251, + "\u0445": 252, + "\u0446": 253, + "\u0447": 254, + "\u0448": 255, + "\u0449": 256, + "\u044b": 257, + "\u044c": 258, + "\u044d": 259, + "\u044e": 260, + "\u044f": 261, + "\u0451": 262, + "\u0546": 263, + "\u060a": 264, + "\u060d": 265, + "\u065e": 266, + "\u0728": 267, + "\u0846": 268, + "\u0be6": 269, + "\u0be7": 270, + "\u1c14": 271, + "\u1e46": 272, + "\u2020": 273, + "\u2021": 274, + 
"\u2022": 275, + "\u202b": 276, + "\u202c": 277, + "\u2032": 278, + "\u2039": 279, + "\u204e": 280, + "\u2113": 281, + "\u2122": 282, + "\u2192": 283, + "\u2193": 284, + "\u21b5": 285, + "\u21d1": 286, + "\u21e4": 287, + "\u2206": 288, + "\u2208": 289, + "\u2212": 290, + "\u2213": 291, + "\u221a": 292, + "\u221e": 293, + "\u223c": 294, + "\u2243": 295, + "\u2248": 296, + "\u2264": 297, + "\u2265": 298, + "\u2299": 299, + "\u22c5": 300, + "\u22c6": 301, + "\u232c": 302, + "\u2423": 303, + "\u2424": 304, + "\u24d2": 305, + "\u25a1": 306, + "\u262f": 307, + "\u2663": 308, + "\u2666": 309, + "\u2709": 310, + "\u271d": 311, + "\u2e38": 312, + "\u318d": 313, + "\uf061": 314, + "\uf067": 315, + "\uf761": 316, + "\uf764": 317, + "\uf765": 318, + "\uf767": 319, + "\uf769": 320, + "\uf76b": 321, + "\uf76c": 322, + "\uf76e": 323, + "\uf76f": 324, + "\uf770": 325, + "\uf772": 326, + "\uf773": 327, + "\uf774": 328, + "\uf777": 329, + "\uf779": 330, + "\uf8e9": 331, + "\uff1b": 332, + "\ufffd": 333 + }, + "vocab_tag": { + "<PAD>": 0, + "B-<author>": 1, + "B-<date>": 2, + "B-<pubnum>": 3, + "B-<title>": 4, + "I-<author>": 5, + "I-<date>": 6, + "I-<pubnum>": 7, + "I-<title>": 8, + "O": 9 + }, + "vocab_case": [ + "<PAD>", + "numeric", + "allLower", + "allUpper", + "initialUpper", + "other", + "mainly_numeric", + "contains_digit" + ], + "max_char_length": 30, + "feature_preprocessor": { + "features_vocabulary_size": 12, + "features_indices": [ + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ], + "features_map_to_index": { + "9": { + "BLOCKEND": 1, + "BLOCKIN": 2, + "BLOCKSTART": 3 + }, + "10": { + "LINEEND": 13, + "LINEIN": 14, + "LINESTART": 15 + }, + "11": { + "ALIGNEDLEFT": 25, + "LINEINDENT": 26 + }, + "12": { + "NEWFONT": 37, + "SAMEFONT": 38 + }, + "13": { + "HIGHERFONT": 49, + "LOWERFONT": 50, + "SAMEFONTSIZE": 51 + }, + "14": { + "0": 61, + "1": 62 + }, + "15": { + "0": 73, + "1": 74 + }, + 
"16": { + "ALLCAP": 85, + "INITCAP": 86, + "NOCAPS": 87 + }, + "17": { + "ALLDIGIT": 97, + "CONTAINSDIGITS": 98, + "NODIGIT": 99 + }, + "18": { + "0": 109, + "1": 110 + }, + "19": { + "0": 121, + "1": 122 + }, + "20": { + "0": 133, + "1": 134 + }, + "21": { + "0": 145, + "1": 146 + }, + "22": { + "0": 157, + "1": 158 + }, + "23": { + "0": 169, + "1": 170 + }, + "24": { + "0": 181, + "1": 182 + }, + "25": { + "0": 193, + "1": 194 + }, + "26": { + "COMMA": 205, + "DOT": 206, + "ENDBRACKET": 207, + "HYPHEN": 208, + "NOPUNCT": 209, + "OPENBRACKET": 210, + "PUNCT": 211, + "QUOTE": 212 + }, + "27": { + "0": 217, + "1": 218 + }, + "28": { + "0": 229 + }, + "29": { + "0": 241, + "1": 242 + }, + "30": { + "0": 253 + } + } + }, + "indice_tag": { + "0": "<PAD>", + "1": "B-<author>", + "2": "B-<date>", + "3": "B-<pubnum>", + "4": "B-<title>", + "5": "I-<author>", + "6": "I-<date>", + "7": "I-<pubnum>", + "8": "I-<title>", + "9": "O" + } +} \ No newline at end of file diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py index cdc20a8..7dcc3c9 100644 --- a/delft/sequenceLabelling/trainer.py +++ b/delft/sequenceLabelling/trainer.py @@ -79,14 +79,12 @@ def compile_model(self, local_model, train_size): if local_model.config.use_chain_crf: local_model.compile( optimizer=optimizer, - loss=local_model.crf.sparse_crf_loss_bert_masked, - metrics = ["accuracy"] if self.enable_wandb else [] + loss=local_model.crf.sparse_crf_loss_bert_masked ) elif local_model.config.use_crf: # loss is calculated by the custom CRF wrapper local_model.compile( optimizer=optimizer, - metrics = ["accuracy"] if self.enable_wandb else [] ) else: # we apply a mask on the predicted labels so that the weights @@ -94,7 +92,6 @@ def compile_model(self, local_model, train_size): local_model.compile( optimizer=optimizer, loss=sparse_crossentropy_masked, - metrics=["accuracy"] if self.enable_wandb else [] ) else: @@ -109,14 +106,12 @@ def compile_model(self, local_model, train_size): 
local_model.compile( optimizer=optimizer, loss=local_model.crf.loss, - metrics = ["accuracy"] if self.enable_wandb else [] ) elif local_model.config.use_crf: if tf.executing_eagerly(): # loss is calculated by the custom CRF wrapper, no need to specify a loss function here local_model.compile( optimizer=optimizer, - metrics = ["accuracy"] if self.enable_wandb else [] ) else: print("compile model, graph mode") @@ -128,7 +123,6 @@ def compile_model(self, local_model, train_size): local_model.compile( optimizer=optimizer, loss='sparse_categorical_crossentropy', - metrics = ["accuracy"] if self.enable_wandb else [] ) #local_model.compile(optimizer=optimizer, loss=InnerLossPusher(local_model)) else: @@ -136,7 +130,6 @@ def compile_model(self, local_model, train_size): local_model.compile( optimizer=optimizer, loss='sparse_categorical_crossentropy', - metrics = ["accuracy"] if self.enable_wandb else [] ) return local_model diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index 61608a5..d6bbb04 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -108,6 +108,8 @@ def __init__(self, word_emb_size = 0 self.embeddings = None self.model_local_path = None + + self.wandb_config = None if wandb_config is not None: if 'project' in wandb_config: self.wandb_config = wandb_config