-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_mxm.erl
88 lines (73 loc) · 3.01 KB
/
read_mxm.erl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
%%%-------------------------------------------------------------------
%%% @author Ken Friis Larsen <kflarsen@diku.dk>
%%% @copyright (C) 2011, Ken Friis Larsen
%%% @doc
%%%
%%% Functions for parsing the bag-of-words datasets from the
%%% [http://labrosa.ee.columbia.edu/millionsong/musixmatch musiXmatch dataset] (MXM).
%%%
%%% @end
%%% Created : Oct 2011 by Ken Friis Larsen <kflarsen@diku.dk>
%%%-------------------------------------------------------------------
-module(read_mxm).
%% API
-export([from_file/1,parse_track/1]).
%%%===================================================================
%%% API
%%%===================================================================
%%--------------------------------------------------------------------
%% @doc Load a bag-of-words dataset file from the MXM project.
%%
%% The whole file is read into memory and split on newlines. Every
%% line whose first byte is `#' is discarded. The first remaining
%% line is handed to `parse_words_line/1' to obtain the vocabulary;
%% all lines after it are returned untouched, one binary per track.
%% Use `parse_track/1' to decode an individual track binary.
%%
%% Crashes with `badmatch' if the file cannot be read or if no line
%% is left once the `#' lines have been removed.
%%
%% @end
%%--------------------------------------------------------------------
-spec from_file(FileName :: file:name_all()) -> {[string()], [binary()]}.
from_file(FileName) ->
    {ok, Contents} = file:read_file(FileName),
    Lines = binary:split(Contents, <<"\n">>, [global, trim]),
    IsComment = fun(<<"#", _/binary>>) -> true;
                   (_) -> false
                end,
    [Header | Tracks] = [Line || Line <- Lines, not IsComment(Line)],
    {parse_words_line(Header), Tracks}.
%%--------------------------------------------------------------------
%% @doc Decode one track line of the form:
%% `track_id,mxm_track_id,<word idx>:<cnt>,<word idx>:<cnt>,...'
%%
%% Returns `{TrackId, MxmId, Counts}' where the two ids are the raw
%% binaries of the first two comma-separated fields and `Counts' is a
%% list of `{WordIndex, Count}' integer pairs, in the order they
%% appear on the line. Word indices start at 1 (not zero); see the
%% comment at the start of the bag-of-words dataset for details.
%%
%% Crashes with `badmatch' if the line has fewer than two fields.
%%
%% @end
%%--------------------------------------------------------------------
-spec parse_track(Track :: binary()) ->
          {binary(), binary(), [{integer(), integer()}]}.
parse_track(Track) ->
    [TrackId, MxmId | CountFields] = binary:split(Track, <<",">>, [global, trim]),
    Counts = [parse_count(Field) || Field <- CountFields],
    {TrackId, MxmId, Counts}.
%%%===================================================================
%%% Internal functions
%%%===================================================================
%% Decode the vocabulary line: a `%' followed by comma-separated words.
%% Each word is converted byte-for-byte into a list with
%% binary_to_list/1. Empty fields at the end of the line are dropped
%% (the `trim' option). A line that does not start with `%' raises
%% function_clause.
parse_words_line(<<"%", Words/binary>>) ->
    [binary_to_list(Word) || Word <- binary:split(Words, <<",">>, [global, trim])].
%% Decode a single `<word idx>:<cnt>' field into a pair of integers.
%% The field is split at the first `:' only, so a field without a colon
%% fails the two-element match (badmatch) and any further colon ends up
%% in the count part, which bin_to_int/1 then rejects.
parse_count(CBin) ->
    [IndexBin, CountBin] = binary:split(CBin, <<":">>),
    WordIndex = bin_to_int(IndexBin),
    Count = bin_to_int(CountBin),
    {WordIndex, Count}.
%% Convert a binary holding the text of a decimal integer (optionally
%% signed) into an integer. Raises badarg when the binary is not a
%% valid integer representation.
%%
%% Uses the binary_to_integer/1 BIF directly instead of first copying
%% the binary into a list, which avoids a throwaway allocation per
%% number parsed.
bin_to_int(Bin) ->
    binary_to_integer(Bin).
%% Split Bin into the fields separated by the byte Char.
%%
%% Empty fields before or between separators are kept; an empty field
%% after the final separator is dropped, so an empty binary yields [].
%%
%% NOTE(review): nothing in the visible source calls bsplit/2; the
%% exported functions use binary:split/3 instead. Confirm whether this
%% helper is still needed.
bsplit(Bin, Char) ->
    bsplit(Char, Bin, 0, []).

%% Worker for bsplit/2. Pos is the number of bytes of Bin already
%% scanned without finding Sep; Fields holds the completed fields in
%% reverse order.
bsplit(Sep, Bin, Pos, Fields) ->
    case Bin of
        %% Separator found at offset Pos: emit the field before it and
        %% restart the scan on what follows.
        <<Field:Pos/binary, Sep, Rest/binary>> ->
            bsplit(Sep, Rest, 0, [Field | Fields]);
        %% Some other byte at offset Pos: keep scanning.
        <<_:Pos/binary, _, _/binary>> ->
            bsplit(Sep, Bin, Pos + 1, Fields);
        %% Nothing left and nothing pending: no trailing field.
        <<>> when Pos =:= 0 ->
            lists:reverse(Fields);
        %% Input exhausted with Pos pending bytes: they form the last field.
        <<Last:Pos/binary>> ->
            lists:reverse([Last | Fields])
    end.