# Config file for Advcomparch@cslab.ntua.ece.gr 2012-2013

[general]
enable_icache_modeling = true

[perf_model/core]
logical_cpus = 1 # number of SMT threads per core
type = interval
core_model = nehalem

[perf_model/core/interval_timer]
dispatch_width = 64
window_size = 128
num_outstanding_loadstores = 10

[perf_model/sync]
reschedule_cost = 1000

[caching_protocol]
type = parametric_dram_directory_msi

[perf_model/branch_predictor]
type = pentium_m
mispredict_penalty = 17 # From microarchitecture.pdf (Nehalem has a longer pipeline than Core2)

[perf_model/tlb]
##penalty = 1000 # ?????

[perf_model/itlb]
#size = 128
associativity = 4

[perf_model/dtlb]
#size = 256 # L2 DTLB
associativity = 4

[perf_model/cache]
levels = 2

[perf_model/l1_icache]
perfect = false
cache_block_size = 64
cache_size = 32
associativity = 4
address_hash = mask
replacement_policy = lru
data_access_time = 4
tags_access_time = 1
perf_model_type = parallel
writethrough = 0
shared_cores = 1

[perf_model/l1_dcache]
perfect = false
cache_block_size = 64
cache_size = 32
associativity = 8
address_hash = mask
replacement_policy = lru
data_access_time = 4
tags_access_time = 1
perf_model_type = parallel
writethrough = 0
shared_cores = 1

[perf_model/l2_cache]
perfect = false
cache_block_size = 128
cache_size = 1024
associativity = 8
address_hash = mask
replacement_policy = lru
data_access_time = 8 # 8.something according to membench, -1 cycle L1 tag access time
# http://www.realworldtech.com/page.cfm?ArticleID=RWT040208182719&p=7
tags_access_time = 3
# Total neighbor L1/L2 access time is around 40/70 cycles (60-70 when it's coming out of L1)
writeback_time = 50 # L3 hit time will be added
perf_model_type = parallel
writethrough = 0
shared_cores = 1

[clock_skew_minimization]
scheme = barrier

[clock_skew_minimization/barrier]
quantum = 100

[dvfs]
transition_latency = 2000 # In ns, "under 2 microseconds" according to http://download.intel.com/design/intarch/papers/323671.pdf (page 8)

[dvfs/simple]
cores_per_socket = 1

[power]
vdd = 1.2 # Volts
technology_node = 45 # nm

# Configuration file for Xeon X5550 Gainestown
# See http://en.wikipedia.org/wiki/Gainestown_(microprocessor)#Gainestown
# and http://ark.intel.com/products/37106

[perf_model/core]
frequency = 2.66

[perf_model/dram_directory]
# total_entries = number of entries per directory controller.
total_entries = 1048576
associativity = 16
directory_type = full_map
home_lookup_param = 7

[perf_model/dram]
# -1 means that we have a number of distributed DRAM controllers (4 in this case)
num_controllers = -1
controllers_interleaving = 4
# DRAM access latency in nanoseconds. Should not include L1-LLC tag access time, directory access time (14 cycles = 5.2 ns),
# or network time [(cache line size + 2*{overhead=40}) / network bandwidth = 18 ns].
# Membench says 175 cycles @ 2.66 GHz = 66 ns total
latency = 45
per_controller_bandwidth = 7.6 # In GB/s, as measured by core_validation-dram
chips_per_dimm = 8
dimms_per_controller = 4

[network]
memory_model_1 = bus
memory_model_2 = bus

[network/bus]
bandwidth = 25.6 # in GB/s. Actually, it's 12.8 GB/s per direction and per connected chip pair
ignore_local_traffic = true # Memory controllers are on-chip, so traffic from core0 to dram0 does not use the QPI links
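
# Sanity check for the [perf_model/dram] latency above (a rough back-of-the-envelope
# calculation using only the figures already quoted in that section; the 2.66 GHz
# clock is taken from [perf_model/core]):
#   175 cycles / 2.66 GHz ~= 66 ns total memory latency (membench)
#   66 ns - 5.2 ns (directory access) - 18 ns (network) ~= 43 ns
# which is consistent, to within rounding, with the configured latency of 45 ns.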