# Config file for Advcomparch@cslab.ntua.ece.gr 2012-2013

[general]
enable_icache_modeling = true

[perf_model/core]
logical_cpus = 1 # number of SMT threads per core
type = interval
core_model = nehalem

[perf_model/core/interval_timer]
dispatch_width = 64
window_size = 128
num_outstanding_loadstores = 10

[perf_model/sync]
reschedule_cost = 1000

[caching_protocol]
type = parametric_dram_directory_msi

[perf_model/branch_predictor]
type = pentium_m
mispredict_penalty = 17 # From microarchitecture.pdf (Nehalem has a longer pipeline than Core2)

[perf_model/tlb]
##penalty = 1000 # ?????

[perf_model/itlb]
#size = 128
associativity = 4

[perf_model/dtlb]
#size = 256 # L2 DTLB
associativity = 4

[perf_model/cache]
levels = 2

[perf_model/l1_icache]
perfect = false
cache_block_size = 64
cache_size = 32
associativity = 4
address_hash = mask
replacement_policy = lru
data_access_time = 4
tags_access_time = 1
perf_model_type = parallel
writethrough = 0
shared_cores = 1

[perf_model/l1_dcache]
perfect = false
cache_block_size = 64
cache_size = 32
associativity = 8
address_hash = mask
replacement_policy = lru
data_access_time = 4
tags_access_time = 1
perf_model_type = parallel
writethrough = 0
shared_cores = 1

[perf_model/l2_cache]
perfect = false
cache_block_size = 128
cache_size = 1024
associativity = 8
address_hash = mask
replacement_policy = lru
data_access_time = 8 # 8.something according to membench, -1 cycle L1 tag access time
# http://www.realworldtech.com/page.cfm?ArticleID=RWT040208182719&p=7
tags_access_time = 3
# Total neighbor L1/L2 access time is around 40/70 cycles (60-70 when it's coming out of L1)
writeback_time = 50 # L3 hit time will be added
perf_model_type = parallel
writethrough = 0
shared_cores = 1

[clock_skew_minimization]
scheme = barrier

[clock_skew_minimization/barrier]
quantum = 100

[dvfs]
transition_latency = 2000 # In ns, "under 2 microseconds" according to http://download.intel.com/design/intarch/papers/323671.pdf (page 8)

[dvfs/simple]
cores_per_socket = 1

[power]
vdd = 1.2 # Volts
technology_node = 45 # nm

# Configuration file for Xeon X5550 Gainestown
# See http://en.wikipedia.org/wiki/Gainestown_(microprocessor)#Gainestown
# and http://ark.intel.com/products/37106

[perf_model/core]
frequency = 2.66

[perf_model/dram_directory]
# total_entries = number of entries per directory controller.
total_entries = 1048576
associativity = 16
directory_type = full_map
home_lookup_param = 7

[perf_model/dram]
# -1 means that we have a number of distributed DRAM controllers (4 in this case)
num_controllers = -1
controllers_interleaving = 4
# DRAM access latency in nanoseconds. Should not include L1-LLC tag access time, directory access time (14 cycles = 5.2 ns),
# or network time [(cache line size + 2*{overhead=40}) / network bandwidth = 18 ns].
# Membench says 175 cycles @ 2.66 GHz = 66 ns total
latency = 45
per_controller_bandwidth = 7.6 # In GB/s, as measured by core_validation-dram
chips_per_dimm = 8
dimms_per_controller = 4

[network]
memory_model_1 = bus
memory_model_2 = bus

[network/bus]
bandwidth = 25.6 # in GB/s. Actually, it's 12.8 GB/s per direction and per connected chip pair
ignore_local_traffic = true # Memory controllers are on-chip, so traffic from core0 to dram0 does not use the QPI links
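
# Sanity check for the [perf_model/dram] latency above (a rough back-of-the-envelope
# calculation using only the figures already quoted in that section; the 2.66 GHz
# clock is taken from [perf_model/core]):
#   175 cycles / 2.66 GHz ~= 66 ns total memory latency (membench)
#   66 ns - 5.2 ns (directory access) - 18 ns (network) ~= 43 ns
# which is consistent, to within rounding, with the configured latency of 45 ns.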