-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmmix-config.w
1041 lines (913 loc) · 39 KB
/
mmix-config.w
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% This file is part of the MMIXware package (c) Donald E Knuth 1999
@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES!
\def\title{MMIX-CONFIG}
\def\MMIX{\.{MMIX}}
\def\Hex#1{\hbox{$^{\scriptscriptstyle\#}$\tt#1}} % experimental hex constant
@s bool int
@s cache int
@s func int
@s coroutine int
@s octa int
@s cacheset int
@s cacheblock int
@s fetch int
@s control int
@s write_node int
@s internal_opcode int
@s replace_policy int
@s PV TeX
@s mmix_opcode int
@s specnode int
\def\PV{\\{PV}} % use italics, not \tt
@s CPV TeX
\def\CPV{\\{CPV}}
@s OP TeX
\def\OP{\\{OP}}
@s and normal @q unreserve a C++ keyword @>
@s or normal @q unreserve a C++ keyword @>
@s xor normal @q unreserve a C++ keyword @>
@*Input format. Configuration files allow this simulator to adapt itself to
infinitely many possible combinations of hardware features. The purpose of the
present module is to read a configuration file, check it for validity, and
set up the relevant data structures.
All data in a configuration file consists simply of {\it tokens\/} separated
by one or more units of white space, where a ``token'' is any sequence of
nonspace characters that doesn't contain a percent sign. Percent signs
and anything following them on a line are ignored; this convention allows
a user to include comments in the file. Here's a simple (but weird) example:
$$\vbox{\halign{\tt#\hfil\cr
\% Silly configuration\cr
writebuffer 200\cr
memaddresstime 100\cr
Dcache associativity 4 lru\cr
Dcache blocksize 1024\cr
unit ODD 5555555555555555555555555555555555555555555555555555555555555555\cr
unit EVEN aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\cr
div 40 30 20\ \ \% three-stage divide\cr
}}$$
It means that (1) the write buffer has capacity for 200 octabytes;
(2)~the memory bus takes 100 cycles to process an address;
(3)~there's a D-cache, in which each set has 4 blocks and the replacement
policy is least-recently-used;
(4)~each block in the D-cache has 1024 bytes;
(5)~there are two functional units, one for all the odd-numbered opcodes
and one for all the rest;
(6)~the division instructions take three pipeline stages, spending 40 cycles
in the first stage, 30~in the second, and 20 in the last;
(7)~all other parameters have default values.
@ Four kinds of specifications can appear in a configuration file,
according to the following syntax:
\def\<#1>{\hbox{$\langle\,$#1$\,\rangle$}}\let\is=\longrightarrow
$$\vbox{\halign{$#$\hfil\cr
\<specification>\is\<PV spec>\mid\<cache spec>\mid\<pipe spec>\mid
\<functional spec>\cr
\<PV spec>\is\<parameter>\<decimal value>\cr
\<cache spec>\is\<cache name>\<cache parameter>\<decimal value>\<policy>\cr
\<pipe spec>\is\<operation>\<pipeline times>\cr
\<functional spec>\is\.{unit}\ \<name>\<64 hexadecimal digits>\cr}}$$
@ A \<PV spec> simply assigns a given value to a given parameter. The
possibilities for \<parameter> are as follows:
\def\bull#1 {\smallskip\hang\textindent{$\bullet$}\.{#1}\enspace}
\bull fetchbuffer (default 4), maximum instructions in the fetch buffer;
must be $\ge1$.
\bull writebuffer (default 2), maximum octabytes in the write buffer;
must be $\ge1$.
\bull reorderbuffer (default 5), maximum instructions issued but not
committed; must be $\ge1$.
\bull renameregs (default 5), maximum partial results in the reorder
buffer; must be $\ge1$.
\bull memslots (default 2), maximum store instructions in the reorder
buffer; must be $\ge1$.
\bull localregs (default 256), number of local registers in ring;
must be 256, 512, or 1024.
\bull fetchmax (default 2), maximum instructions fetched per cycle;
must be $\ge1$.
\bull dispatchmax (default 1), maximum instructions issued per cycle;
must be $\ge1$.
\bull peekahead (default 1), maximum lookahead for jumps per cycle.
\bull commitmax (default 1), maximum instructions committed per cycle;
must be $\ge1$.
\bull fremmax (default 1), maximum reductions in \.{FREM} computation per
cycle; must be $\ge1$.
\bull denin (default 1), extra cycles taken if a floating point input
is subnormal.
\bull denout (default 1), extra cycles taken if a floating point result
is subnormal.
\bull writeholdingtime (default 0), minimum number of cycles for data to
remain in the write buffer.
\bull memaddresstime (default 20), cycles to process memory address;
must be $\ge1$.
\bull memreadtime (default 20), cycles to read one memory busload;
must be $\ge1$.
\bull memwritetime (default 20), cycles to write one memory busload;
must be $\ge1$.
\bull membusbytes (default 8), number of bytes per memory busload; must be a
power of~2 that is 8~or~more.
\bull branchpredictbits (default 0), number of bits in each branch prediction
table entry; must be $\le8$.
\bull branchaddressbits (default 0), number of bits in instruction address
used to index the branch prediction table.
\bull branchhistorybits (default 0), number of bits in branch history used to
index the branch prediction table.
\bull branchdualbits (default 0), number of bits of
instruction-address-xor-branch-history used to index the branch prediction
table.
\bull hardwarepagetable (default 1), is zero if page table calculations
must be emulated by the operating system.
\bull disablesecurity (default 0), is 1 if the hot-seat security checks
are turned off. This option is used only for testing purposes; it means
that the `\.s' interrupt will not occur, and the `\.p' interrupt will
be signaled only when going from a nonnegative location to a negative one.
\bull memchunksmax (default 1000), maximum number of $2^{16}$-byte chunks of
simulated memory; must be $\ge1$.
\bull hashprime (default 2003), prime number used to address simulated memory;
must exceed \.{memchunksmax}, preferably by a factor of about~2.
\smallskip\noindent
The values of \.{memchunksmax} and \.{hashprime} affect only the speed of the
simulator, not its results---unless a very huge program is being simulated.
The stated defaults for \.{memchunksmax} and \.{hashprime}
should be adequate for almost all applications.
@ A \<cache spec> assigns a given value to a parameter affecting one of five
possible caches:
$$\vbox{\halign{$#$\hfil\cr
\<cache spec>\is\<cache name>\<cache parameter>\<decimal value>\<policy>\cr
\<cache name>\is\.{ITcache}\mid\.{DTcache}\mid\.{Icache}\mid\.{Dcache}
\mid\.{Scache}\cr
\<policy>\is\<empty>\mid\.{random}\mid\.{serial}
\mid\.{pseudolru}\mid\.{lru}\cr}}$$
The possibilities for \<cache parameter> are as follows:
\bull associativity (default 1), number of cache blocks per cache set;
must be a power of~2. (A cache with associativity~1 is said to be
``direct-mapped.'')
\bull blocksize (default 8), number of bytes per cache block; must be a power
of~2, at least equal to the granularity, and at most equal to~8192.
The blocksize of \.{ITcache} and \.{DTcache} must be~8.
\bull setsize (default 1), number of sets of cache blocks; must be a power
of~2. (A cache with set size~1 is said to be ``fully associative.'')
\bull granularity (default 8), number of bytes per ``dirty bit,'' used to
remember which items of data have changed since they were read from memory;
must be a power of~2 and at least~8. The granularity must be~8 if
\.{writeallocate} is~0.
\bull victimsize (default 0), number of cache blocks in the victim buffer,
which holds blocks removed from the main cache sets; must be zero or a power
of~2.
\bull writeback (default 0), is 1 in a ``write-back'' cache, which holds dirty
data as long as possible; is 0 in a ``write-through'' cache, which cleans
all data as soon as possible.
\bull writeallocate (default 0), is 1 in a ``write-allocate'' cache,
which remembers all recently written data;
is 0 in a ``write-around'' cache, which doesn't make space for newly written
data that fails to hit an existing cache block.
\bull accesstime (default 1), number of cycles to query the cache;
must be $\ge1$. (Hits in the S-cache actually require {\it twice}
the accesstime, once to query the tag and once to transmit the data.)
\bull copyintime (default 1), number of cycles to move a cache block from
its input buffer into the cache proper; must be $\ge1$.
\bull copyouttime (default 1), number of cycles to move a cache block
from the cache proper to its output buffer; must be $\ge1$.
\bull ports (default 1), number of processes that can simultaneously
query the cache; must be $\ge1$.
\smallskip
The \<policy> parameter should be nonempty only on cache specifications
for parameters
\.{associativity} and \.{victimsize}. If no replacement policy is specified,
\.{random} is the default. All four policies are equivalent when the
\.{associativity} or \.{victimsize} is~1; \.{pseudolru} is equivalent
to \.{lru} when the \.{associativity} or \.{victimsize} is~2.
The \.{granularity}, \.{writeback}, \.{writeallocate}, and \.{copyouttime}
parameters affect the performance only of the D-cache and S-cache; the other
three caches are read-only, so they never need to write their data.
The \.{ports} parameter affects the performance of the D-cache and
DT-cache, and (if the \.{PREGO} command is used) the performance of the
I-cache and IT-cache. The S-cache accommodates only one process at a time,
regardless of the number of specified ports.
Only the translation caches (the IT-cache and DT-cache) are present by
default. But if any specifications are given for, say, an I-cache,
all of the unspecified I-cache parameters take their default values.
The existence of an S-cache (secondary cache) implies the existence of both
I-cache and D-cache (primary caches for instructions and data).
The block size of the secondary cache must not be less than the block
size of the primary caches. The secondary cache must have the
same granularity as the D-cache.
@ A \<pipe spec> governs the execution time of potentially slow operations.
$$\vbox{\halign{$#$\hfil\cr
\<pipe spec>\is\<operation>\<pipeline times>\cr
\<pipeline times>\is\<decimal value>\mid\<pipeline times>\<decimal value>\cr}}$$
Here the \<operation> is one of the following:
\bull mul0 through \.{mul8} (default 10); the values for \.{mul}$j$ refer
to products in which the second operand is less than $2^{8j}$, where $j$
is as small as possible. Thus, for example, \.{mul1} applies to
nonzero one-byte multipliers.
\bull div (default 60); this applies to integer division, signed and unsigned.
\bull sh (default 1); this applies to left and right shifts, signed and
unsigned.
\bull mux (default 1); the multiplex operator.
\bull sadd (default 1); the sideways addition operator.
\bull mor (default 1); the boolean matrix multiplication operators \.{MOR} and
\.{MXOR}.
\bull fadd (default 4); floating point addition and subtraction.
\bull fmul (default 4); floating point multiplication.
\bull fdiv (default 40); floating point division.
\bull fsqrt (default 40); floating point square root.
\bull fint (default 4); floating point integerization.
\bull fix (default 2); conversion from floating to fixed, signed and unsigned.
\bull flot (default 2); conversion from fixed to floating, signed and unsigned.
\bull feps (default 4); floating comparison with respect to epsilon.
\smallskip\noindent
In each case one can specify a sequence of pipeline stages, with a positive
number of cycles to be spent in each stage. For example, a specification like
`\.{fmul}~\.{3}~\.{1}' would say that a functional unit that supports
\.{FMUL} takes a total of four cycles to compute the floating point product
in two stages; it can start working on a second product after three cycles
have gone by.
If a floating point operation has a subnormal input, \.{denin} is added to
the time for the first stage. If a floating point operation has a subnormal
result, \.{denout} is added to the time for the last stage.
@ The fourth and final kind of specification defines a functional unit:
$$\<functional spec>\is\.{unit}\ \<name>\<64 hexadecimal digits>$$
The symbolic name should be at most fifteen characters long.
The 64 hexadecimal digits contain 256 bits, with `1' for each supported
opcode; the most significant (leftmost) bit is for opcode 0 (\.{TRAP}),
and the least significant bit is for opcode 255 (\.{TRIP}).
For example, we can define a load/store unit (which handles register/memory
operations), a multiplication unit (which handles fixed and floating point
multiplication), a boolean unit (which handles only bitwise operations),
and a more general arithmetic-logical unit, as follows:
$$\vbox{\halign{\tt#\hfil\cr
unit LSU 00000000000000000000000000000000fffffffcfffffffc0000000000000000\cr
unit MUL 000080f000000000000000000000000000000000000000000000000000000000\cr
unit BIT 000000000000000000000000000000000000000000000000ffff00ff00ff0000\cr
unit ALU f0000000ffffffffffffffffffffffff0000000300000003ffffffffffffffff\cr
}}$$
The order in which units are specified is important, because \MMIX's dispatcher
will try to match each instruction with the first functional unit that
supports its opcode. Therefore it is best to list more specialized
units (like the \.{BIT} unit in this example) before more general ones;
this lets the specialized units have first chance at the instructions
they can handle.
There can be any number of functional units, having possibly identical
specifications. One should, however, give each unit a unique name
(e.g., \.{ALU1} and \.{ALU2} if there are two arithmetic-logical units),
since these names are used in diagnostic messages.
Opcodes that aren't supported by any specified unit will cause an
emulation trap.
@^emulation@>
@ Full details about the significance of all these parameters can be found
in the \.{mmix-pipe} module, which defines and discusses the data structures
that need to be configured and initialized.
Of course the specifications in a configuration file needn't make any sense,
nor need they be practically achievable. We could, for example, specify
a unit that handles only the two opcodes \.{NXOR} and \.{DIVUI};
we could specify 1-cycle division but pipelined 100-cycle shifts, or
1-cycle memory access but 100-cycle cache access. We could create
a thousand rename registers and issue a hundred instructions per cycle,
etc. Some combinations of parameters are clearly ridiculous.
But there remain a huge number of possibilities of interest, especially
as technology continues to evolve. By experimenting with configurations that
are extreme by present-day standards, we can see how much might be gained
if the corresponding hardware could be built economically.
@* Basic input/output. Let's get ready to program the |MMIX_config| subroutine
by building some simple infrastructure. First we need some macros to
print error messages.
@d errprint0(f) fprintf(stderr,f) /* print a message that takes no arguments */
@d errprint1(f,a) fprintf(stderr,f,a) /* print a message with one argument */
@d errprint2(f,a,b) fprintf(stderr,f,a,b) /* print a message with two arguments */
@d errprint3(f,a,b,c) fprintf(stderr,f,a,b,c) /* print a message with three arguments */
@d panic(x)@+ {@+x;@+errprint0("!\n");@+exit(-1);@+} /* do |x| (normally one of the |errprint| macros), finish the line with an exclamation point, and exit with status $-1$; the expansion is a brace group, not a single statement */
@ And we need a place to look at the input.
@d BUF_SIZE 100 /* size of the line buffer and of |token|; we don't need long lines */
@<Global variables@>=
FILE *config_file; /* input comes from here; |get_token| reads it line by line */
char token[BUF_SIZE]; /* and tokens are copied to here, as null-terminated strings */
bool token_prescanned; /* does |token| contain the next token already? if so, the next |get_token| just clears this flag and returns */
@ The |get_token| routine copies the next token of input into the |token|
buffer. After the input has ended, a final `\.{end}' is appended.
@<Subroutines@>=
static void get_token @,@,@[ARGS((void))@];@+@t}\6{@>
static void get_token() /* set |token| to the next token of the configuration file */
{
  static char buffer[BUF_SIZE]; /* input lines go here */
  static char *buf_pointer=buffer; /* this is our current position */
  register char *p,*q;
  if (token_prescanned) { /* the previous token was pushed back; deliver it again */
    token_prescanned=false;@+ return;
  }
  while(1) { /* scan past white space */
    if (*buf_pointer=='\0' || *buf_pointer=='\n' || *buf_pointer=='%') {
      /* the rest of this line is empty or a comment, so fetch another line */
      if (!fgets(buffer,BUF_SIZE,config_file)) {
        strcpy(token,"end");@+return; /* no more input */
      }
      if (strlen(buffer)==BUF_SIZE-1 && buffer[BUF_SIZE-2]!='\n')
        panic(errprint1("config file line too long: `%s...'",buffer));
@.config file line...@>
      buf_pointer=buffer;
    }@+else if (!isspace((unsigned char)*buf_pointer)) break;
    else buf_pointer++;
  }
  /* copy the token; we must also stop at the terminating null, because
     the last line of the file need not end with a newline */
  for (p=buf_pointer,q=token;
       *p!='\0' && *p!='%' && !isspace((unsigned char)*p);p++,q++) *q=*p;
  buf_pointer=p;@+ *q='\0';
  return;
}
@ The |get_int| routine is called when we wish to input a decimal value.
It returns $-1$ if the next token isn't a string of decimal digits.
@<Sub...@>=
static int get_int @,@,@[ARGS((void))@];@+@t}\6{@>
static int get_int() /* scan the next token as a decimal value; $-1$ means failure */
{@+ int v;
  char *p;
  get_token();
  for (p=token,v=0; *p>='0' && *p<='9'; p++) {
    if (v>(INT_MAX-(*p-'0'))/10) return -1;
      /* the value wouldn't fit in an |int|; refuse it instead of overflowing */
    v=10*v+(*p-'0');
  }
  if (*p) return -1; /* a character that isn't a decimal digit was found */
  return v;
}
@ A simple data structure makes it fairly easy to deal with
parameter/value specifications.
@<Type definitions@>=
/* one row of the |PV| table: a named integer parameter and the values it may take */
typedef struct {
char name[20]; /* symbolic name, as it appears in a configuration file */
int *v; /* internal name: the address of the variable that receives the value */
int defval; /* default value */
int minval, maxval; /* minimum and maximum legal values */
bool power_of_two; /* must it be a power of two? */
} pv_spec;
@ Cache parameters are a bit more difficult, but still not bad.
@<Type...@>=
/* codes for the cache parameters, in the order of the rows of |CPV| */
typedef enum {@!assoc,@!blksz,@!setsz,@!gran,@!vctsz,
@!wrb,@!wra,@!acctm,@!citm,@!cotm,@!prts} c_param;
@#
/* one row of the |CPV| table: a named cache parameter and the values it may take */
typedef struct {
char name[20]; /* symbolic name, as it appears in a configuration file */
c_param v; /* internal code, which selects the field of the cache to be set */
int defval; /* default value */
int minval, maxval; /* minimum and maximum legal values */
bool power_of_two; /* must it be a power of two? */
} cpv_spec;
@ Operation codes are the easiest of all.
@<Type...@>=
/* one row of the |OP| table: a named operation whose pipeline times can be specified */
typedef struct {
char name[8]; /* symbolic name, as it appears in a configuration file */
internal_opcode v; /* internal code, used as the row index into |pipe_seq| */
int defval; /* default value: the number of cycles in a one-stage pipeline */
} op_spec;
@ Most of the parameters are external variables declared in the header
file \.{mmix-pipe.h}; but some are private to this module. Here we
define the main tables used below.
@<Glob...@>=
int fetch_buf_size,write_buf_size,reorder_buf_size,mem_bus_bytes,hardware_PT;
/* the variables above are set from the |PV| table */
int max_cycs=60;
/* the largest cycle count specified so far for a pipeline stage or a cache time; 60 is the largest default in |OP| */
/* each row of |PV|: name, variable to set, default, minimum, maximum, power-of-two flag */
pv_spec PV[]={@|
{"fetchbuffer", &fetch_buf_size, 4, 1, INT_MAX, false},@|
{"writebuffer", &write_buf_size, 2, 1, INT_MAX, false},@|
{"reorderbuffer", &reorder_buf_size, 5, 1, INT_MAX, false},@|
{"renameregs", &max_rename_regs, 5, 1, INT_MAX, false},@|
{"memslots", &max_mem_slots, 2, 1, INT_MAX, false},@|
{"localregs", &lring_size, 256, 256, 1024, true},@|
{"fetchmax", &fetch_max, 2, 1, INT_MAX, false},@|
{"dispatchmax", &dispatch_max, 1, 1, INT_MAX, false},@|
{"peekahead", &peekahead, 1, 0, INT_MAX, false},@|
{"commitmax", &commit_max, 1, 1, INT_MAX, false},@|
{"fremmax", &frem_max, 1, 1, INT_MAX, false},@|
{"denin",&denin_penalty, 1, 0, INT_MAX, false},@|
{"denout",&denout_penalty, 1, 0, INT_MAX, false},@|
{"writeholdingtime", &holding_time, 0, 0, INT_MAX, false},@|
{"memaddresstime", &mem_addr_time, 20, 1, INT_MAX, false},@|
{"memreadtime", &mem_read_time, 20, 1, INT_MAX, false},@|
{"memwritetime", &mem_write_time, 20, 1, INT_MAX, false},@|
{"membusbytes", &mem_bus_bytes, 8, 8, INT_MAX, true},@|
{"branchpredictbits", &bp_n, 0, 0, 8, false},@|
{"branchaddressbits", &bp_a, 0, 0, 32, false},@|
{"branchhistorybits", &bp_b, 0, 0, 32, false},@|
{"branchdualbits", &bp_c, 0, 0, 32, false},@|
{"hardwarepagetable", &hardware_PT, 1, 0, 1, false},@|
{"disablesecurity", (int*)&security_disabled, 0, 0, 1, false},@|
{"memchunksmax", &mem_chunks_max, 1000, 1, INT_MAX, false},@|
{"hashprime", &hash_prime, 2003, 2, INT_MAX, false}};
@#
/* each row of |CPV|: name, internal code, default, minimum, maximum, power-of-two flag */
cpv_spec CPV[]={@|
{"associativity", assoc, 1, 1, INT_MAX, true},@|
{"blocksize", blksz, 8, 8, 8192, true},@|
{"setsize", setsz, 1, 1, INT_MAX, true},@|
{"granularity", gran, 8, 8, 8192, true},@|
{"victimsize", vctsz, 0, 0, INT_MAX, true},@|
{"writeback", wrb, 0, 0, 1,false},@|
{"writeallocate", wra, 0, 0, 1,false},@|
{"accesstime", acctm, 1, 1, INT_MAX, false},@|
{"copyintime", citm, 1, 1, INT_MAX, false},@|
{"copyouttime", cotm, 1, 1, INT_MAX, false},@|
{"ports", prts, 1, 1, INT_MAX,false}};
@#
/* each row of |OP|: name, internal opcode, default cycles for a single stage */
op_spec OP[]={@|
{"mul0", mul0, 10},
{"mul1", mul1, 10},
{"mul2", mul2, 10},
{"mul3", mul3, 10},
{"mul4", mul4, 10},@|
{"mul5", mul5, 10},
{"mul6", mul6, 10},
{"mul7", mul7, 10},
{"mul8", mul8, 10},@|
{"div", div, 60},
{"sh", sh, 1},
{"mux", mux, 1},
{"sadd", sadd, 1},
{"mor", mor, 1},@|
{"fadd", fadd, 4},
{"fmul", fmul, 4},
{"fdiv", fdiv, 40},
{"fsqrt", fsqrt, 40},
{"fint", fint, 4},@|
{"fix", fix, 2},
{"flot", flot, 2},
{"feps", feps, 4}};
int PV_size,CPV_size,OP_size; /* the number of entries in |PV|, |CPV|, |OP| */
@ The |new_cache| routine creates a \&{cache} structure with default values.
(These default values are ``hard-wired'' into the program, not actually
read from the |CPV| table.)
@<Sub...@>=
static cache* new_cache @,@,@[ARGS((char*))@];@+@t}\6{@>
static cache* new_cache(name)
  char *name; /* the name of the cache; the pointer itself is kept, not a copy */
{@+register cache *fresh;
  fresh=(cache*)calloc(1,sizeof(cache)); /* fields not mentioned below start at zero */
  if (fresh==NULL) panic(errprint1("Can't allocate %s",name));
@.Can't allocate...@>
  fresh->name=name;
  fresh->ports=1;
  /* the geometry of the cache */
  fresh->aa=1; /* default associativity, should equal |CPV[0].defval| */
  fresh->bb=8; /* default blocksize */
  fresh->cc=1; /* default setsize */
  fresh->gg=8; /* default granularity */
  fresh->vv=0; /* default victimsize */
  /* the policies and the mode */
  fresh->repl=random; /* default replacement policy */
  fresh->vrepl=random; /* default victim replacement policy */
  fresh->mode=0; /* default mode is write-through and write-around */
  /* the timing: one cycle for each kind of activity */
  fresh->copy_out_time=1;
  fresh->copy_in_time=1;
  fresh->access_time=1;
  /* the two built-in coroutines point to their control blocks, which point back to the cache */
  fresh->filler.ctl=&(fresh->filler_ctl);
  fresh->filler_ctl.ptr_a=(void*)fresh;
  fresh->filler_ctl.go.o.l=4;
  fresh->flusher.ctl=&(fresh->flusher_ctl);
  fresh->flusher_ctl.ptr_a=(void*)fresh;
  fresh->flusher_ctl.go.o.l=4;
  return fresh;
}
@ @<Initialize to defaults@>=
PV_size=sizeof PV/sizeof PV[0]; /* count the rows of the three tables */
CPV_size=sizeof CPV/sizeof CPV[0];
OP_size=sizeof OP/sizeof OP[0];
Icache=Dcache=Scache=NULL; /* these caches don't exist until they are mentioned */
ITcache=new_cache("ITcache"); /* but the translation caches are always present */
DTcache=new_cache("DTcache");
for (j=0;j<PV_size;j++) {
  *(PV[j].v)=PV[j].defval; /* every parameter starts at its default */
}
for (j=0;j<OP_size;j++) {
  /* every operation starts with a one-stage pipeline of the default length */
  pipe_seq[OP[j].v][1]=0; /* one stage */
  pipe_seq[OP[j].v][0]=OP[j].defval;
}
@* Reading the specs. Before we're ready to process the configuration file,
we need to count the number of functional units, so that we know
how much space to allocate for them.
A special background unit is always provided, just to make sure that
\.{TRAP} and \.{TRIP} instructions are handled by somebody.
@<Count and allocate the functional units@>=
funit_count=0;
while (strcmp(token,"end")!=0) {
get_token();
if (strcmp(token,"unit")==0) {
funit_count++;
get_token();@+get_token(); /* a unit might be named \.{unit} or \.{end} */
}
}
funit=(func*)calloc(funit_count+1,sizeof(func));
if (!funit) panic(errprint0("Can't allocate the functional units"));
@.Can't allocate...@>
strcpy(funit[funit_count].name,"%%");
@.\%\%@>
funit[funit_count].ops[0]=0x80000000; /* \.{TRAP} */
funit[funit_count].ops[7]=0x1; /* \.{TRIP} */
@ Now we can read the specifications and obey them. This program doesn't
bother to be very tolerant of errors, nor does it try to be very efficient.
Incidentally, the specifications don't have to be broken into individual lines
in any meaningful way. We simply read them token by token.
@<Record all the specs@>=
rewind(config_file); /* go back to the beginning of the configuration file */
funit_count=0; /* from now on this counts the units recorded so far */
token[0]='\0'; /* so that the loop test below doesn't see an old \.{end} */
while (strcmp(token,"end")!=0) {
get_token(); /* this token should begin a specification */
if (strcmp(token,"end")==0) break;
@<If |token| is a parameter name, process a PV spec@>; /* each of these sections says |continue| after it has processed a spec */
@<If |token| is a cache name, process a cache spec@>;
@<If |token| is an operation name, process a pipe spec@>;
if (strcmp(token,"unit")==0) @<Process a functional spec@>;
/* we get here only if none of the four kinds of specification was recognized */
panic(errprint1(
"Configuration syntax error: Specification can't start with `%s'",token));
@.Configuration syntax error...@>
}
@ @<If |token| is a parameter name, process a PV spec@>=
for (j=0;j<PV_size;j++) {
  if (strcmp(token,PV[j].name)!=0) continue; /* not this parameter; try the next row */
  n=get_int(); /* the value; it is negative if no decimal number was present */
  if (n<PV[j].minval) panic(errprint2(
@.Configuration error...@>
     "Configuration error: %s must be >= %d",PV[j].name,PV[j].minval));
  if (n>PV[j].maxval) panic(errprint2(
     "Configuration error: %s must be <= %d",PV[j].name,PV[j].maxval));
  if (PV[j].power_of_two && (n&(n-1))!=0) panic(errprint1(
     "Configuration error: %s must be a power of 2",PV[j].name));
  *(PV[j].v)=n; /* the value is legal, so we accept it */
  break;
}
if (j!=PV_size) continue; /* a row matched, so this specification is finished */
@ @<If |token| is a cache name, process a cache spec@>=
if (strcmp(token,"ITcache")==0) {
pcs(ITcache);@+continue;
}@+else if (strcmp(token,"DTcache")==0) {
pcs(DTcache);@+continue;
}@+else if (strcmp(token,"Icache")==0) {
if (!Icache) Icache=new_cache("Icache");
pcs(Icache);@+continue;
}@+else if (strcmp(token,"Dcache")==0) {
if (!Dcache) Dcache=new_cache("Dcache");
pcs(Dcache);@+continue;
}@+else if (strcmp(token,"Scache")==0) {
if (!Icache) Icache=new_cache("Icache");
if (!Dcache) Dcache=new_cache("Dcache");
if (!Scache) Scache=new_cache("Scache");
pcs(Scache);@+continue;
}
@ @<Sub...@>=
static void ppol @,@,@[ARGS((replace_policy*))@];@+@t}\6{@>
static void ppol(rr) /* subroutine to scan for a replacement policy */
  replace_policy *rr; /* where to record the policy, if one is present */
{
  get_token();
  if (!strcmp(token,"lru")) {
    *rr=lru;@+return;
  }
  if (!strcmp(token,"pseudolru")) {
    *rr=pseudo_lru;@+return;
  }
  if (!strcmp(token,"serial")) {
    *rr=serial;@+return;
  }
  if (!strcmp(token,"random")) {
    *rr=random;@+return;
  }
  token_prescanned=true; /* no policy was given, so that token must be scanned again; |*rr| is unchanged */
}
@ @<Sub...@>=
static void pcs @,@,@[ARGS((cache*))@];@+@t}\6{@>
static void pcs(c) /* subroutine to process a cache spec */
  cache *c; /* the cache whose parameter is to be set */
{
  register int j,n;
  get_token(); /* this should be the name of a cache parameter */
  for (j=0;j<CPV_size;j++) if (strcmp(token,CPV[j].name)==0) break;
  if (j==CPV_size) panic(errprint1(
     "Configuration syntax error: `%s' isn't a cache parameter name",token));
@.Configuration syntax error...@>
  n=get_int(); /* the value; it is negative if no decimal number was present */
  if (n<CPV[j].minval) panic(errprint2(
     "Configuration error: %s must be >= %d",CPV[j].name,CPV[j].minval));
@.Configuration error...@>
  if (n>CPV[j].maxval) panic(errprint2(
     "Configuration error: %s must be <= %d",CPV[j].name,CPV[j].maxval));
  if (CPV[j].power_of_two && (n&(n-1))) panic(errprint1(
     "Configuration error: %s must be a power of 2",CPV[j].name));
       /* the same wording is used for the parameters in |PV| */
  switch (CPV[j].v) { /* now store the value where it belongs */
  case assoc: c->aa=n;@+ppol(&(c->repl));@+break; /* a policy may follow */
  case blksz: c->bb=n;@+break;
  case setsz: c->cc=n;@+break;
  case gran: c->gg=n;@+break;
  case vctsz: c->vv=n;@+ppol(&(c->vrepl));@+break; /* a policy may follow */
  case wrb: c->mode=(c->mode&~WRITE_BACK)+n*WRITE_BACK;@+break;
    /* here |n| is 0 or 1; the old flag is cleared before the new one is inserted */
  case wra: c->mode=(c->mode&~WRITE_ALLOC)+n*WRITE_ALLOC;@+break;
  case acctm:@+ if (n>max_cycs) max_cycs=n; /* keep track of the longest time */
    c->access_time=n;@+break;
  case citm:@+ if (n>max_cycs) max_cycs=n;
    c->copy_in_time=n;@+break;
  case cotm:@+ if (n>max_cycs) max_cycs=n;
    c->copy_out_time=n;@+break;
  case prts: c->ports=n;@+break;
  }
}
@ @<If |token| is an operation name, process a pipe spec@>=
for (j=0;j<OP_size;j++) if (strcmp(token,OP[j].name)==0) {
  for (i=0;;i++) { /* read one pipeline time for each stage */
    n=get_int();
    if (n<0) break; /* that token isn't a number, so the list of times has ended */
    if (n==0) panic(errprint0(
       "Configuration error: Pipeline cycles must be positive"));
@.Configuration error...@>
    if (n>255) panic(errprint0(
       "Configuration error: Pipeline cycles must be <= 255"));
    if (n>max_cycs) max_cycs=n; /* keep track of the longest time */
    if (i>=pipe_limit) panic(errprint1(
       "Configuration error: More than %d pipeline stages",pipe_limit));
    pipe_seq[OP[j].v][i]=n;
  }
  if (i>0) pipe_seq[OP[j].v][i]=0;
    /* terminate the new sequence, so that stages left over from a longer
       earlier specification of this operation don't survive; here
       $i\le{}$|pipe_limit|, and the row is assumed to have room for a
       terminator after |pipe_limit| stages, as the later use of |strlen|
       on these rows requires */
  token_prescanned=true; /* the token that ended the list begins the next spec */
  break;
}
if (j<OP_size) continue;
@ @<Process a functional spec@>=
{@+register unsigned int bits; /* eight hexadecimal digits being assembled */
  get_token(); /* the name of the unit */
  if (strlen(token)>15) panic(errprint1(
     "Configuration error: `%s' is more than 15 characters long",token));
@.Configuration error...@>
  strcpy(funit[funit_count].name,token);
  get_token(); /* the 64 hexadecimal digits */
  if (strlen(token)!=64) panic(errprint1(
     "Configuration error: unit %s doesn't have 64 hex digit specs",
      funit[funit_count].name));
  /* we accumulate the digits in an unsigned variable, because the eighth
     digit of each group is shifted into the most significant bit and that
     would overflow a signed |int| */
  for (i=j=n=0,bits=0;j<64;j++) {
    if (token[j]>='0' && token[j]<='9') bits=(bits<<4)+(token[j]-'0');
    else if (token[j]>='a' && token[j]<='f') bits=(bits<<4)+(token[j]-'a'+10);
    else if (token[j]>='A' && token[j]<='F') bits=(bits<<4)+(token[j]-'A'+10);
    else panic(errprint1(
      "Configuration error: `%c' is not a hex digit",token[j]));
    if ((j&0x7)==0x7) funit[funit_count].ops[i++]=bits, bits=0;
      /* every eighth digit completes one word of the opcode bit map */
  }
  funit_count++;
  continue; /* this specification is finished */
}
@* Checking and allocating. The battle is only half over when we've
absorbed all the data of the configuration file. We still must check for
interactions between different quantities, and we must allocate
space for cache blocks, coroutines, etc.
One of the most difficult tasks facing us is to determine the maximum number
of pipeline stages needed by each functional unit. Let's tackle that first.
@<Allocate coroutines in each functional unit@>=
@<Build table of pipeline stages needed for each opcode@>;
for (j=0;j<=funit_count;j++) {
    /* the bound is inclusive, so |funit[funit_count]| is set up too;
       presumably that entry is a unit not read from the file (to be confirmed) */
  @<Determine the number of stages, |n|, needed by |funit[j]|@>;
    /* the section just expanded panics unless |n| is positive */
  funit[j].k=n;
  funit[j].co=(coroutine*)calloc(n,sizeof(coroutine));
  if (!funit[j].co) panic(errprint1(
    "Can't allocate coroutines for unit %s",funit[j].name));
@.Can't allocate...@>
  for (i=0;i<n;i++) { /* one coroutine per stage, all sharing the unit's name */
    funit[j].co[i].name=funit[j].name;
    funit[j].co[i].stage=i+1; /* stage numbers start at 1 */
  }
}
@ @<Build table of pipeline stages needed for each opcode@>=
j=div;
while (j<=max_pipe_op) { /* the stage count is the length of the |pipe_seq| entry */
  int_stages[j]=strlen(pipe_seq[j]);
  j++;
}
while (j<=max_real_command) int_stages[j++]=1; /* the rest start out with one stage */
n=0; /* |n| will be the longest sequence among |mul0| through |mul8| */
for (j=mul0;j<=mul8;j++) {
  if (strlen(pipe_seq[j])>n) n=strlen(pipe_seq[j]);
}
int_stages[mul]=n;
int_stages[frem]=2;@+int_stages[st]=2;@+int_stages[ld]=2;
  /* these three override the values set by the loops above */
for (j=0;j<256;j++) { /* finally translate from internal opcodes to MMIX opcodes */
  stages[j]=int_stages[int_op[j]];
}
@ The |int_op| conversion table is similar to the |internal_op| array of
the \\{MMIX\_run} routine, but it replaces |divu| by |div|,
|fsub| by |fadd|, etc.
@<Glob...@>=
/* |int_op[x]| is the |internal_opcode| under which |mmix_opcode| |x| is
   looked up in |int_stages| when the |stages| table is built; each row of
   the initializer below covers eight consecutive opcodes, starting at 0 */
internal_opcode int_op[256]={@|
trap,fcmp,funeq,funeq,fadd,fix,fadd,fix,@|
flot,flot,flot,flot,flot,flot,flot,flot,@|
fmul,feps,feps,feps,fdiv,fsqrt,frem,fint,@|
mul,mul,mul,mul,div,div,div,div,@|
add,add,addu,addu,sub,sub,subu,subu,@|
addu,addu,addu,addu,addu,addu,addu,addu,@|
cmp,cmp,cmpu,cmpu,sub,sub,subu,subu,@|
sh,sh,sh,sh,sh,sh,sh,sh,@|
br,br,br,br,br,br,br,br,@|
br,br,br,br,br,br,br,br,@|
pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr,@|
pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr,@|
cset,cset,cset,cset,cset,cset,cset,cset,@|
cset,cset,cset,cset,cset,cset,cset,cset,@|
zset,zset,zset,zset,zset,zset,zset,zset,@|
zset,zset,zset,zset,zset,zset,zset,zset,@|
ld,ld,ld,ld,ld,ld,ld,ld,@|
ld,ld,ld,ld,ld,ld,ld,ld,@|
ld,ld,ld,ld,ld,ld,ld,ld,@|
ld,ld,ld,ld,prego,prego,go,go,@|
st,st,st,st,st,st,st,st,@|
st,st,st,st,st,st,st,st,@|
st,st,st,st,st,st,st,st,@|
st,st,st,st,st,st,pushgo,pushgo,@|
or,or,orn,orn,nor,nor,xor,xor,@|
and,and,andn,andn,nand,nand,nxor,nxor,@|
bdif,bdif,wdif,wdif,tdif,tdif,odif,odif,@|
mux,mux,sadd,sadd,mor,mor,mor,mor,@|
set,set,set,set,addu,addu,addu,addu,@|
or,or,or,or,andn,andn,andn,andn,@|
noop,noop,pushj,pushj,set,set,put,put,@|
pop,resume,save,unsave,sync,noop,get,trip};
int int_stages[max_real_command+1];
/* stages as function of |internal_opcode|; one entry for each value
   from 0 through |max_real_command| */
int stages[256]; /* stages as function of |mmix_opcode| */
@ @<Determine the number of stages...@>=
n=0; /* |n| will be the largest |stages[i]| over the opcodes of this unit */
for (i=0;i<256;i++) {
  if (!((funit[j].ops[i>>5]<<(i&0x1f))&0x80000000)) continue;
    /* the bit for opcode |i|, counted from the left of word |i>>5|, is off */
  if (n<stages[i]) n=stages[i];
}
if (n==0) panic(errprint1(
  "Configuration error: unit %s doesn't do anything",funit[j].name));
@.Configuration error...@>
@ The next hardest thing on our agenda is to set up the cache structure
fields that depend on the parameters. For example, although we have defined
the parameter in the |bb| field (the block size), we also need to compute the
|b|~field (log of the block size), and we must create the cache blocks
themselves.
@<Sub...@>=
static int lg @,@,@[ARGS((int))@];@+@t}\6{@>
static int lg(n) /* compute binary logarithm */
  int n;
{
  register int rest=n; /* the part of |n| not yet shifted away */
  register int bits=0; /* how many one-bit shifts it took to exhaust |n| */
  while (rest) {
    bits++;
    rest>>=1;
  }
  return bits-1; /* thus the result is $-1$ when |n| is zero */
}
@ @<Sub...@>=
/* |alloc_cache(c,name)| checks the parameters already stored in cache |c|,
   derives the logarithmic fields and the tag mask from them, and allocates
   the sets, the victim blocks (if any), the input and output buffers, and
   the reader coroutines. The string |name| appears in every error message;
   its first two characters also select a few cache-specific rules. Any
   inconsistency or allocation failure is reported via |panic|. */
static void alloc_cache @,@,@[ARGS((cache*,char*))@];@+@t}\6{@>
static void alloc_cache(c,name)
cache *c;
char *name;
{@+register int j,k; /* scratch indices used by the sections expanded below */
if (c->bb<c->gg) panic(errprint1(
"Configuration error: blocksize of %s is less than granularity",name));
@.Configuration error...@>
/* a `T' in second position occurs only in the names |ITcache| and |DTcache| */
if (name[1]=='T' && c->bb!=8) panic(errprint1(
"Configuration error: blocksize of %s must be 8",name));
/* each single-letter field is the binary logarithm of its double-letter parameter */
c->a=lg(c->aa);
c->b=lg(c->bb);
c->c=lg(c->cc);
c->g=lg(c->gg);
c->v=lg(c->vv); /* |lg| yields $-1$ when there is no victim cache */
c->tagmask=-(1<<(c->b+c->c)); /* zeros in the low |b+c| bits, ones above them */
if (c->a+c->b+c->c>=32) panic(errprint1(
"Configuration error: %s has >= 4 gigabytes of data",name));
if (c->gg!=8 && !(c->mode&WRITE_ALLOC)) panic(errprint2(
"Configuration error: %s does write-around with granularity %d",
name,c->gg));
@<Allocate the cache sets for cache |c|@>;
if (c->vv) @<Allocate the victim cache for cache |c|@>;
/* each buffer gets |bb>>g| dirty flags and |bb>>3| octabytes of data */
c->inbuf.dirty=(char*)calloc(c->bb>>c->g,sizeof(char));
if (!c->inbuf.dirty) panic(errprint1(
"Can't allocate dirty bits for inbuffer of %s",name));
@.Can't allocate...@>
c->inbuf.data=(octa *)calloc(c->bb>>3,sizeof(octa));
if (!c->inbuf.data) panic(errprint1(
"Can't allocate data for inbuffer of %s",name));
c->outbuf.dirty=(char*)calloc(c->bb>>c->g,sizeof(char));
if (!c->outbuf.dirty) panic(errprint1(
"Can't allocate dirty bits for outbuffer of %s",name));
c->outbuf.data=(octa *)calloc(c->bb>>3,sizeof(octa));
if (!c->outbuf.data) panic(errprint1(
"Can't allocate data for outbuffer of %s",name));
/* a cache whose name begins with `S' gets no readers */
if (name[0]!='S') @<Allocate reader coroutines for cache |c|@>;
}
@ @d sign_bit 0x80000000
@<Allocate the cache sets for cache |c|@>=
c->set=(cacheset *)calloc(c->cc,sizeof(cacheset));
if (!c->set) panic(errprint1(
  "Can't allocate cache sets for %s",name));
@.Can't allocate...@>
for (j=0;j<c->cc;j++) {@+register cacheblock *blk; /* runs through set |j| */
  c->set[j]=(cacheblock *)calloc(c->aa,sizeof(cacheblock));
  if (!c->set[j]) panic(errprint2(
    "Can't allocate cache blocks for set %d of %s",j,name));
  for (k=0,blk=c->set[j];k<c->aa;k++,blk++) {
    blk->tag.h=sign_bit; /* invalid tag */
    blk->dirty=(char *)calloc(c->bb>>c->g,sizeof(char));
    if (!blk->dirty) panic(errprint3(
      "Can't allocate dirty bits for block %d of set %d of %s",k,j,name));
    blk->data=(octa *)calloc(c->bb>>3,sizeof(octa));
    if (!blk->data) panic(errprint3(
      "Can't allocate data for block %d of set %d of %s",k,j,name));
  }
}
@ @<Allocate the victim cache for cache |c|@>=
{@+register cacheblock *vb; /* runs through the victim blocks */
  c->victim=(cacheblock*)calloc(c->vv,sizeof(cacheblock));
  if (!c->victim) panic(errprint1(
    "Can't allocate blocks for victim cache of %s",name));
  for (k=0,vb=c->victim;k<c->vv;k++,vb++) {
    vb->tag.h=sign_bit; /* invalid tag */
    vb->dirty=(char *)calloc(c->bb>>c->g,sizeof(char));
    if (!vb->dirty) panic(errprint2(
      "Can't allocate dirty bits for block %d of victim cache of %s",
      k,name));
@.Can't allocate...@>
    vb->data=(octa *)calloc(c->bb>>3,sizeof(octa));
    if (!vb->data) panic(errprint2(
      "Can't allocate data for block %d of victim cache of %s",k,name));
  }
}
@ @<Allocate reader coroutines for cache |c|@>=
{@+register char *reader_name; /* the name shared by all readers of |c| */
  if (name[0]=='D') reader_name=(name[1]=='T'? "DTreader": "Dreader");
  else reader_name=(name[1]=='T'? "ITreader": "Ireader");
  c->reader=(coroutine*)calloc(c->ports,sizeof(coroutine));
  if (!c->reader) panic(errprint1(
    "Can't allocate readers for %s",name));
@.Can't allocate...@>
  for (j=0;j<c->ports;j++) { /* one reader for each port */
    c->reader[j].name=reader_name;
    c->reader[j].stage=vanish;
  }
}
@ @<Allocate the caches@>=
/* |ITcache| and |DTcache| are allocated unconditionally; each of the
   other three is allocated only if its pointer is non-null */
alloc_cache(ITcache,"ITcache");
ITcache->filler.name="ITfiller";@+ ITcache->filler.stage=fill_from_virt;
alloc_cache(DTcache,"DTcache");
DTcache->filler.name="DTfiller";@+ DTcache->filler.stage=fill_from_virt;
if (Icache) {
  alloc_cache(Icache,"Icache");
  Icache->filler.name="Ifiller";@+ Icache->filler.stage=fill_from_mem;
}
if (Dcache) {
  alloc_cache(Dcache,"Dcache");
  Dcache->filler.name="Dfiller";@+ Dcache->filler.stage=fill_from_mem;
  Dcache->flusher.name="Dflusher";@+ Dcache->flusher.stage=flush_to_mem;
}
if (Scache) {
  alloc_cache(Scache,"Scache");
  if (!Icache || !Dcache) panic(errprint0(
    "Configuration error: Scache requires both an Icache and a Dcache"));
@.Configuration error...@>
    /* the tests and assignments below dereference both pointers */
  if (Scache->bb<Icache->bb) panic(errprint0(
    "Configuration error: Scache blocks smaller than Icache blocks"));
  if (Scache->bb<Dcache->bb) panic(errprint0(
    "Configuration error: Scache blocks smaller than Dcache blocks"));
  if (Scache->gg!=Dcache->gg) panic(errprint0(
    "Configuration error: Scache granularity differs from the Dcache"));
  Icache->filler.stage=fill_from_S;
    /* with an |Scache| present, these stages replace the ones set above */
  Dcache->filler.stage=fill_from_S;@+ Dcache->flusher.stage=flush_to_S;
  Scache->filler.name="Sfiller";@+ Scache->filler.stage=fill_from_mem;
  Scache->flusher.name="Sflusher";@+ Scache->flusher.stage=flush_to_mem;
}
@ Now we are nearly done. The only nontrivial task remaining is
to allocate the ring of queues for coroutine scheduling; for this we
need to determine the maximum waiting time that will occur between
scheduler and schedulee.
@<Allocate the scheduling queue@>=
bus_words=mem_bus_bytes>>3;
j=mem_read_time; /* |j| becomes the larger of the read and write times */
if (j<mem_write_time) j=mem_write_time;
n=1; /* |n| becomes the largest block size of the caches present, but at least 1 */
if (Scache && n<Scache->bb) n=Scache->bb;
if (Icache && n<Icache->bb) n=Icache->bb;
if (Dcache && n<Dcache->bb) n=Dcache->bb;
n=mem_addr_time+((int)(n+mem_bus_bytes-1)/mem_bus_bytes)*j;
  /* the quotient is |n| divided by |mem_bus_bytes|, rounded up */
if (max_cycs<n) max_cycs=n; /* now |max_cycs| bounds the waiting time */
ring_size=max_cycs+1;
ring=(coroutine *)calloc(ring_size,sizeof(coroutine));
if (!ring) panic(errprint0("Can't allocate the scheduling ring"));
@.Can't allocate...@>
{@+register coroutine *q=ring+ring_size; /* one past the node to set up next */
  while (q>ring) {
    q--;
    q->stage=max_stage;
    q->name=""; /* header nodes are nameless */
  }
}
@ @s chunknode int
@<Touch up last-minute trivia@>=
if (hash_prime<=mem_chunks_max) panic(errprint0(
"Configuration error: hashprime must exceed memchunksmax"));
@.Configuration error...@>
mem_hash=(chunknode *)calloc(hash_prime+1,sizeof(chunknode));
if (!mem_hash) panic(errprint0("Can't allocate the hash table"));
@.Can't allocate...@>
mem_hash[0].chunk=(octa*)calloc(1<<13,sizeof(octa));
if (!mem_hash[0].chunk) panic(errprint0("Can't allocate chunk 0"));
mem_hash[hash_prime].chunk=(octa*)calloc(1<<13,sizeof(octa));
if (!mem_hash[hash_prime].chunk) panic(errprint0("Can't allocate 0 chunk"));
mem_chunks=1;
fetch_bot=(fetch*)calloc(fetch_buf_size+1,sizeof(fetch));
if (!fetch_bot) panic(errprint0("Can't allocate the fetch buffer"));
fetch_top=fetch_bot+fetch_buf_size;
reorder_bot=(control*)calloc(reorder_buf_size+1,sizeof(control));
if (!reorder_bot) panic(errprint0("Can't allocate the reorder buffer"));
reorder_top=reorder_bot+reorder_buf_size;
wbuf_bot=(write_node*)calloc(write_buf_size+1,sizeof(write_node));
if (!wbuf_bot) panic(errprint0("Can't allocate the write buffer"));
wbuf_top=wbuf_bot+write_buf_size;
if (bp_n==0) bp_table=NULL;
else { /* a branch prediction table is desired */
if (bp_a+bp_b+bp_c>=31) panic(errprint0(
"Configuration error: Branch table has >= 2 gigabytes of data"));
bp_table=(char*)calloc(1<<(bp_a+bp_b+bp_c),sizeof(char));
if (!bp_table) panic(errprint0("Can't allocate the branch table"));