1 /* SPDX-License-Identifier: GPL-2.0-only */
2 
3 #include <assert.h>
4 #include <commonlib/helpers.h>
5 #include <console/console.h>
6 #include <cpu/intel/model_206ax/model_206ax.h>
7 #include <device/mmio.h>
8 #include <device/pci_ops.h>
9 #include <northbridge/intel/sandybridge/chip.h>
10 #include <device/pci_def.h>
11 #include <delay.h>
12 #include <types.h>
13 
14 #include "raminit_common.h"
15 #include "raminit_tables.h"
16 #include "sandybridge.h"
17 
18 /* FIXME: no support for 3-channel chipsets */
19 
/* Execute the x86 `sfence` (store fence) instruction. */
static void sfence(void)
{
	__asm__ __volatile__ ("sfence");
}
24 
25 /* Toggle IO reset bit */
toggle_io_reset(void)26 static void toggle_io_reset(void)
27 {
28 	u32 r32 = mchbar_read32(MC_INIT_STATE_G);
29 	mchbar_write32(MC_INIT_STATE_G, r32 |  (1 << 5));
30 	udelay(1);
31 	mchbar_write32(MC_INIT_STATE_G, r32 & ~(1 << 5));
32 	udelay(1);
33 }
34 
/*
 * Build the clock crossover-enable bits for the GDCRCKPICODE register:
 * the rank map is placed in bits 31:24.
 *
 * @param rankmap  Bitmap of ranks on the channel.
 * @return         The rank map shifted into bits 31:24, all other bits zero.
 */
static u32 get_XOVER_CLK(u8 rankmap)
{
	/*
	 * Widen before shifting: a u8 operand is promoted to signed int, and
	 * shifting a set bit 7 into the sign bit would be undefined behaviour.
	 */
	return (u32)rankmap << 24;
}
39 
/*
 * Build the command/control crossover-enable bits for the GDCRCMDPICODING
 * register.
 *
 * @param rankmap  Bitmap of ranks on the channel.
 * @return         Bit 14 (xover cmd) always set; bit 17 (xover ctl) set when
 *                 rankmap bits 1:0 are non-zero; bit 26 (xover ctl) set when
 *                 rankmap bits 3:2 are non-zero.
 */
static u32 get_XOVER_CMD(u8 rankmap)
{
	const u32 xover_cmd    = 1 << 14;
	const u32 xover_ctl_lo = (rankmap & 0x03) ? (1 << 17) : 0;
	const u32 xover_ctl_hi = (rankmap & 0x0c) ? (1 << 26) : 0;

	return xover_cmd | xover_ctl_lo | xover_ctl_hi;
}
56 
/*
 * Derive timing parameters that every installed DDR3 DIMM can satisfy.
 *
 * For each slot holding a DDR3 DIMM, the set of supported CAS latencies is
 * intersected into ctrl->cas_supported and every timing field in ctrl is
 * raised to at least the DIMM's value. Calls die() when the CAS intersection
 * is empty or when no DDR3 DIMM was found.
 */
void dram_find_common_params(ramctr_timing *ctrl)
{
	dimm_info *dimms = &ctrl->info;
	size_t valid_dimms = 0;
	int channel, slot;

	/* Start with every CAS latency allowed; each DIMM narrows the mask */
	ctrl->cas_supported = (1 << (MAX_CAS - MIN_CAS + 1)) - 1;

/* Raise a ctrl timing field to the DIMM's value if the DIMM needs more */
#define RAISE_TO_DIMM(field)	(ctrl->field = MAX(ctrl->field, dimm->field))

	FOR_ALL_CHANNELS {
		for (slot = 0; slot < 2; slot++) {
			const struct dimm_attr_ddr3_st *dimm = &dimms->dimm[channel][slot];

			/* Only DDR3 DIMMs take part */
			if (dimm->dram_type != SPD_MEMORY_TYPE_SDRAM_DDR3)
				continue;

			valid_dimms++;

			/* Keep only the CAS latencies this DIMM supports as well */
			ctrl->cas_supported &= dimm->cas_supported;

			/* The slowest DIMM dictates each common timing */
			RAISE_TO_DIMM(tCK);
			RAISE_TO_DIMM(tAA);
			RAISE_TO_DIMM(tWR);
			RAISE_TO_DIMM(tRCD);
			RAISE_TO_DIMM(tRRD);
			RAISE_TO_DIMM(tRP);
			RAISE_TO_DIMM(tRAS);
			RAISE_TO_DIMM(tRFC);
			RAISE_TO_DIMM(tWTR);
			RAISE_TO_DIMM(tRTP);
			RAISE_TO_DIMM(tFAW);
			RAISE_TO_DIMM(tCWL);
			RAISE_TO_DIMM(tCMD);
		}
	}

#undef RAISE_TO_DIMM

	if (!ctrl->cas_supported)
		die("Unsupported DIMM combination. DIMMS do not support common CAS latency");

	if (!valid_dimms)
		die("No valid DIMMs found");
}
98 
/*
 * Program the crossover enables on every channel: the clock crossover value
 * goes to GDCRCKPICODE and the command/control crossover value goes to
 * GDCRCMDPICODING. Both are derived from the channel's rank map.
 */
void dram_xover(ramctr_timing *ctrl)
{
	int channel;

	FOR_ALL_CHANNELS {
		const u32 xover_clk = get_XOVER_CLK(ctrl->rankmap[channel]);
		const u32 xover_cmd = get_XOVER_CMD(ctrl->rankmap[channel]);

		/* Enable xover clk */
		printram("XOVER CLK [%x] = %x\n", GDCRCKPICODE_ch(channel), xover_clk);
		mchbar_write32(GDCRCKPICODE_ch(channel), xover_clk);

		/* Enable xover ctl & xover cmd */
		printram("XOVER CMD [%x] = %x\n", GDCRCMDPICODING_ch(channel), xover_cmd);
		mchbar_write32(GDCRCMDPICODING_ch(channel), xover_cmd);
	}
}
116 
/*
 * ODT stretch:
 * Delay the ODT signal by the channel's ref_card_offset value. Useful for
 * multi DIMM setups on the same channel.
 *
 * When both IS_SANDY_CPU() and IS_SANDY_CPU_C() match, the value is written
 * to bits 13:10 of SCHED_SECOND_CBIT (a stretch of 2 is replaced by 3);
 * otherwise it is written to the two odt_delay fields of TC_OTHP.
 */
static void dram_odt_stretch(ramctr_timing *ctrl, int channel)
{
	u32 stretch = ctrl->ref_card_offset[channel];
	u32 addr;

	if (!IS_SANDY_CPU(ctrl->cpu) || !IS_SANDY_CPU_C(ctrl->cpu)) {
		/* Regular path: update both per-DIMM ODT delay fields */
		addr = TC_OTHP_ch(channel);
		union tc_othp_reg tc_othp = {
			.raw = mchbar_read32(addr),
		};
		tc_othp.odt_delay_d0 = stretch;
		tc_othp.odt_delay_d1 = stretch;
		mchbar_write32(addr, tc_othp.raw);
		printk(RAM_DEBUG, "OTHP [%x] = %x\n", addr, mchbar_read32(addr));
		return;
	}

	/* Workaround path: same value in both 2-bit fields at bits 11:10 and 13:12 */
	if (stretch == 2)
		stretch = 3;

	addr = SCHED_SECOND_CBIT_ch(channel);
	mchbar_clrsetbits32(addr, 0xf << 10, stretch << 12 | stretch << 10);
	printk(RAM_DEBUG, "OTHP Workaround [%x] = %x\n", addr, mchbar_read32(addr));
}
144 
/*
 * Program the DRAM timing registers of every channel from the values held
 * in ctrl. The register images are built once and then written to each
 * channel in the order DBP, RAP, OTHP, (DTP on Ivy Bridge), ODT stretch,
 * RFTP, RFP, SRFTP.
 */
void dram_timing_regs(ramctr_timing *ctrl)
{
	int channel;

	/* BIN parameters */
	const union tc_dbp_reg tc_dbp = {
		.tRCD = ctrl->tRCD,
		.tRP  = ctrl->tRP,
		.tAA  = ctrl->CAS,
		.tCWL = ctrl->CWL,
		.tRAS = ctrl->tRAS,
	};

	/* Regular access parameters */
	const union tc_rap_reg tc_rap = {
		.tRRD = ctrl->tRRD,
		.tRTP = ctrl->tRTP,
		.tCKE = ctrl->tCKE,
		.tWTR = ctrl->tWTR,
		.tFAW = ctrl->tFAW,
		.tWR  = ctrl->tWR,
		.tCMD = 3,
	};

	/* Other parameters; tXPDLL and tXP are clamped to 31 and 7 respectively */
	const union tc_othp_reg tc_othp = {
		.tXPDLL  = MIN(ctrl->tXPDLL, 31),
		.tXP     = MIN(ctrl->tXP, 7),
		.tAONPD  = ctrl->tAONPD,
		.tCPDED  = 1,
		.tPRPDEN = 1,
	};

	/*
	 * If tXP and tXPDLL are very high, they no longer fit in the bitfields
	 * of the TC_OTHP register. If so, we set bits in TC_DTP to compensate.
	 * This can only happen on Ivy Bridge, and when overclocking the RAM.
	 */
	const union tc_dtp_reg tc_dtp = {
		.overclock_tXP    = ctrl->tXP >= 8,
		.overclock_tXPDLL = ctrl->tXPDLL >= 32,
	};

	/*
	 * TC-Refresh timing parameters:
	 *   The tREFIx9 field should be programmed to minimum of 8.9 * tREFI (to allow
	 *   for possible delays from ZQ or isoc) and tRASmax (70us) divided by 1024.
	 */
	const u32 refi_x9_limit = MIN((ctrl->tREFI * 89) / 10, (70000 << 8) / ctrl->tCK);

	const union tc_rftp_reg tc_rftp = {
		.tREFI   = ctrl->tREFI,
		.tRFC    = ctrl->tRFC,
		.tREFIx9 = refi_x9_limit / 1024,
	};

	/* Self-refresh timing parameters */
	const union tc_srftp_reg tc_srftp = {
		.tXSDLL     = tDLLK,
		.tXS_offset = ctrl->tXSOffset,
		.tZQOPER    = tDLLK - ctrl->tXSOffset,
		.tMOD       = ctrl->tMOD - 8,
	};

	FOR_ALL_CHANNELS {
		union tc_rfp_reg tc_rfp;

		printram("DBP [%x] = %x\n", TC_DBP_ch(channel), tc_dbp.raw);
		mchbar_write32(TC_DBP_ch(channel), tc_dbp.raw);

		printram("RAP [%x] = %x\n", TC_RAP_ch(channel), tc_rap.raw);
		mchbar_write32(TC_RAP_ch(channel), tc_rap.raw);

		printram("OTHP [%x] = %x\n", TC_OTHP_ch(channel), tc_othp.raw);
		mchbar_write32(TC_OTHP_ch(channel), tc_othp.raw);

		/* Debug parameters - only applies to Ivy Bridge */
		if (IS_IVY_CPU(ctrl->cpu))
			mchbar_write32(TC_DTP_ch(channel), tc_dtp.raw);

		/* Must follow the TC_OTHP write above: it may read-modify-write TC_OTHP */
		dram_odt_stretch(ctrl, channel);

		printram("REFI [%x] = %x\n", TC_RFTP_ch(channel), tc_rftp.raw);
		mchbar_write32(TC_RFTP_ch(channel), tc_rftp.raw);

		/* Only the oref_ri field of TC_RFP changes; the rest is preserved */
		tc_rfp.raw = mchbar_read32(TC_RFP_ch(channel));
		tc_rfp.oref_ri = 0xff;
		mchbar_write32(TC_RFP_ch(channel), tc_rfp.raw);

		printram("SRFTP [%x] = %x\n", TC_SRFTP_ch(channel), tc_srftp.raw);
		mchbar_write32(TC_SRFTP_ch(channel), tc_srftp.raw);
	}
}
239 
/*
 * Compute the MAD_DIMM register image of every channel and store it in
 * ctrl->mad_dimm[]. No hardware register is written here.
 *
 * "DIMM A" is the larger of the two DIMMs on the channel (slot 0 on a tie);
 * bit 16 records that slot 1 was chosen as DIMM A. A channel on which
 * neither DIMM has any rank gets an image of 0.
 */
void dram_dimm_mapping(ramctr_timing *ctrl)
{
	dimm_info *info = &ctrl->info;
	int channel;

	FOR_ALL_CHANNELS {
		struct dimm_attr_ddr3_st *slot0 = &info->dimm[channel][0];
		struct dimm_attr_ddr3_st *slot1 = &info->dimm[channel][1];

		const bool swapped = slot0->size_mb < slot1->size_mb;
		struct dimm_attr_ddr3_st *dimmA = swapped ? slot1 : slot0;
		struct dimm_attr_ddr3_st *dimmB = swapped ? slot0 : slot1;

		const bool a_present = dimmA->ranks > 0;
		const bool b_present = dimmB->ranks > 0;

		if (!a_present && !b_present) {
			ctrl->mad_dimm[channel] = 0;
			continue;
		}

		u32 reg = swapped ? (1 << 16) : 0;

		if (a_present) {
			reg |= (dimmA->size_mb / 256) <<  0;
			reg |= (dimmA->ranks - 1)     << 17;
			reg |= (dimmA->width / 8 - 1) << 19;
		}

		if (b_present) {
			reg |= (dimmB->size_mb / 256) <<  8;
			reg |= (dimmB->ranks - 1)     << 18;
			reg |= (dimmB->width / 8 - 1) << 20;
		}

		/*
		 * Rank interleave: Bit 16 of the physical address space sets
		 * the rank to use in a dual single rank DIMM configuration.
		 * That results in every 64KiB being interleaved between two ranks.
		 */
		reg |= 1 << 21;

		/* Enhanced interleave */
		reg |= 1 << 22;

		ctrl->mad_dimm[channel] = reg;
	}
}
287 
/*
 * Write the precomputed ctrl->mad_dimm[] images to the MAD_DIMM registers,
 * ORing in the ECC bits at 25:24 when ECC is enabled.
 *
 * @param ctrl      RAM controller state.
 * @param training  Non-zero selects ECC value 1 in bits 25:24, zero selects
 *                  value 3. Ignored when ECC is disabled.
 */
void dram_dimm_set_mapping(ramctr_timing *ctrl, int training)
{
	int channel;
	u32 ecc = 0;

	if (ctrl->ecc_enabled) {
		if (training)
			ecc = 1 << 24;
		else
			ecc = 3 << 24;
	}

	FOR_ALL_CHANNELS {
		mchbar_write32(MAD_DIMM(channel), ctrl->mad_dimm[channel] | ecc);
	}

	/* With ECC enabled, wait 10us after updating the registers */
	if (ctrl->ecc_enabled)
		udelay(10);
}
305 
/*
 * Program the MAD_ZR and MAD_CHNL registers from the two channel sizes.
 *
 * The smaller channel size, in units of 256MB, is written to MAD_ZR bits
 * 31:24 and twice that value to bits 23:16. MAD_CHNL is set to 0x24 when
 * channel 0 is at least as large as channel 1, and to 0x21 otherwise.
 *
 * @param ctrl      RAM controller state; channel_size_mb[] is read.
 * @param training  When non-zero, each populated channel is treated as
 *                  256MB instead of its real size.
 */
void dram_zones(ramctr_timing *ctrl, int training)
{
	u32 ch0size, ch1size;

	if (training) {
		ch0size = ctrl->channel_size_mb[0] ? 256 : 0;
		ch1size = ctrl->channel_size_mb[1] ? 256 : 0;
	} else {
		ch0size = ctrl->channel_size_mb[0];
		ch1size = ctrl->channel_size_mb[1];
	}

	const bool ch0_is_larger = ch0size >= ch1size;

	/* Smaller channel in 256MB units; only the low 8 bits are kept */
	const u8 val = (ch0_is_larger ? ch1size : ch0size) / 256;

	/*
	 * Shift as u32: a u8 is promoted to signed int, and shifting a value
	 * of 0x80 or more left by 24 would overflow it (undefined behaviour).
	 */
	u32 reg = mchbar_read32(MAD_ZR);
	reg = (reg & ~0xff000000) | ((u32)val << 24);
	reg = (reg & ~0x00ff0000) | ((u32)(2 * val) << 16);
	mchbar_write32(MAD_ZR, reg);

	mchbar_write32(MAD_CHNL, ch0_is_larger ? 0x24 : 0x21);
}
338 
339 /*
340  * Returns the ECC mode the NB is running at. It takes precedence over ECC capability.
341  * The ME/PCU/.. has the ability to change this.
342  * Return 0: ECC is optional
343  * Return 1: ECC is forced
344  */
get_host_ecc_forced(void)345 bool get_host_ecc_forced(void)
346 {
347 	/* read Capabilities A Register */
348 	const u32 reg32 = pci_read_config32(HOST_BRIDGE, CAPID0_A);
349 	return !!(reg32 & (1 << 24));
350 }
351 
352 /*
353  * Returns the ECC capability.
354  * The ME/PCU/.. has the ability to change this.
355  * Return 0: ECC is disabled
356  * Return 1: ECC is possible
357  */
get_host_ecc_cap(void)358 bool get_host_ecc_cap(void)
359 {
360 	/* read Capabilities A Register */
361 	const u32 reg32 = pci_read_config32(HOST_BRIDGE, CAPID0_A);
362 	return !(reg32 & (1 << 25));
363 }
364 
365 #define DEFAULT_PCI_MMIO_SIZE 2048
366 
dram_memorymap(ramctr_timing * ctrl,int me_uma_size)367 void dram_memorymap(ramctr_timing *ctrl, int me_uma_size)
368 {
369 	u32 reg, val, reclaim, tom, gfxstolen, gttsize;
370 	size_t tsegbase, toludbase, remapbase, gfxstolenbase, mmiosize, gttbase;
371 	size_t tsegsize, touudbase, remaplimit, mestolenbase, tsegbasedelta;
372 	uint16_t ggc;
373 
374 	mmiosize = DEFAULT_PCI_MMIO_SIZE;
375 
376 	ggc = pci_read_config16(HOST_BRIDGE, GGC);
377 	if (!(ggc & 2)) {
378 		gfxstolen = ((ggc >> 3) & 0x1f) * 32;
379 		gttsize   = ((ggc >> 8) & 0x3);
380 	} else {
381 		gfxstolen = 0;
382 		gttsize   = 0;
383 	}
384 
385 	tsegsize = CONFIG_SMM_TSEG_SIZE >> 20;
386 
387 	tom = ctrl->channel_size_mb[0] + ctrl->channel_size_mb[1];
388 
389 	mestolenbase = tom - me_uma_size;
390 
391 	toludbase = MIN(4096 - mmiosize + gfxstolen + gttsize + tsegsize, tom - me_uma_size);
392 
393 	gfxstolenbase = toludbase - gfxstolen;
394 	gttbase = gfxstolenbase - gttsize;
395 
396 	tsegbase = gttbase - tsegsize;
397 
398 	/* Round tsegbase down to nearest address aligned to tsegsize */
399 	tsegbasedelta = tsegbase & (tsegsize - 1);
400 	tsegbase &= ~(tsegsize - 1);
401 
402 	gttbase -= tsegbasedelta;
403 	gfxstolenbase -= tsegbasedelta;
404 	toludbase -= tsegbasedelta;
405 
406 	/* Test if it is possible to reclaim a hole in the RAM addressing */
407 	if (tom - me_uma_size > toludbase) {
408 		/* Reclaim is possible */
409 		reclaim    = 1;
410 		remapbase  = MAX(4096, tom - me_uma_size);
411 		remaplimit = remapbase + MIN(4096, tom - me_uma_size) - toludbase - 1;
412 		touudbase  = remaplimit + 1;
413 	} else {
414 		/* Reclaim not possible */
415 		reclaim   = 0;
416 		touudbase = tom - me_uma_size;
417 	}
418 
419 	/* Update memory map in PCIe configuration space */
420 	printk(BIOS_DEBUG, "Update PCI-E configuration space:\n");
421 
422 	/* TOM (top of memory) */
423 	reg = pci_read_config32(HOST_BRIDGE, TOM);
424 	val = tom & 0xfff;
425 	reg = (reg & ~0xfff00000) | (val << 20);
426 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", TOM, reg);
427 	pci_write_config32(HOST_BRIDGE, TOM, reg);
428 
429 	reg = pci_read_config32(HOST_BRIDGE, TOM + 4);
430 	val = tom & 0xfffff000;
431 	reg = (reg & ~0x000fffff) | (val >> 12);
432 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", TOM + 4, reg);
433 	pci_write_config32(HOST_BRIDGE, TOM + 4, reg);
434 
435 	/* TOLUD (Top Of Low Usable DRAM) */
436 	reg = pci_read_config32(HOST_BRIDGE, TOLUD);
437 	val = toludbase & 0xfff;
438 	reg = (reg & ~0xfff00000) | (val << 20);
439 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", TOLUD, reg);
440 	pci_write_config32(HOST_BRIDGE, TOLUD, reg);
441 
442 	/* TOUUD LSB (Top Of Upper Usable DRAM) */
443 	reg = pci_read_config32(HOST_BRIDGE, TOUUD);
444 	val = touudbase & 0xfff;
445 	reg = (reg & ~0xfff00000) | (val << 20);
446 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", TOUUD, reg);
447 	pci_write_config32(HOST_BRIDGE, TOUUD, reg);
448 
449 	/* TOUUD MSB */
450 	reg = pci_read_config32(HOST_BRIDGE, TOUUD + 4);
451 	val = touudbase & 0xfffff000;
452 	reg = (reg & ~0x000fffff) | (val >> 12);
453 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", TOUUD + 4, reg);
454 	pci_write_config32(HOST_BRIDGE, TOUUD + 4, reg);
455 
456 	if (reclaim) {
457 		/* REMAP BASE */
458 		pci_write_config32(HOST_BRIDGE, REMAPBASE,     remapbase << 20);
459 		pci_write_config32(HOST_BRIDGE, REMAPBASE + 4, remapbase >> 12);
460 
461 		/* REMAP LIMIT */
462 		pci_write_config32(HOST_BRIDGE, REMAPLIMIT,     remaplimit << 20);
463 		pci_write_config32(HOST_BRIDGE, REMAPLIMIT + 4, remaplimit >> 12);
464 	}
465 	/* TSEG */
466 	reg = pci_read_config32(HOST_BRIDGE, TSEGMB);
467 	val = tsegbase & 0xfff;
468 	reg = (reg & ~0xfff00000) | (val << 20);
469 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", TSEGMB, reg);
470 	pci_write_config32(HOST_BRIDGE, TSEGMB, reg);
471 
472 	/* GFX stolen memory */
473 	reg = pci_read_config32(HOST_BRIDGE, BDSM);
474 	val = gfxstolenbase & 0xfff;
475 	reg = (reg & ~0xfff00000) | (val << 20);
476 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", BDSM, reg);
477 	pci_write_config32(HOST_BRIDGE, BDSM, reg);
478 
479 	/* GTT stolen memory */
480 	reg = pci_read_config32(HOST_BRIDGE, BGSM);
481 	val = gttbase & 0xfff;
482 	reg = (reg & ~0xfff00000) | (val << 20);
483 	printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", BGSM, reg);
484 	pci_write_config32(HOST_BRIDGE, BGSM, reg);
485 
486 	if (me_uma_size) {
487 		reg = pci_read_config32(HOST_BRIDGE, MESEG_MASK + 4);
488 		val = (0x80000 - me_uma_size) & 0xfffff000;
489 		reg = (reg & ~0x000fffff) | (val >> 12);
490 		printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", MESEG_MASK + 4, reg);
491 		pci_write_config32(HOST_BRIDGE, MESEG_MASK + 4, reg);
492 
493 		/* ME base */
494 		reg = pci_read_config32(HOST_BRIDGE, MESEG_BASE);
495 		val = mestolenbase & 0xfff;
496 		reg = (reg & ~0xfff00000) | (val << 20);
497 		printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", MESEG_BASE, reg);
498 		pci_write_config32(HOST_BRIDGE, MESEG_BASE, reg);
499 
500 		reg = pci_read_config32(HOST_BRIDGE, MESEG_BASE + 4);
501 		val = mestolenbase & 0xfffff000;
502 		reg = (reg & ~0x000fffff) | (val >> 12);
503 		printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", MESEG_BASE + 4, reg);
504 		pci_write_config32(HOST_BRIDGE, MESEG_BASE + 4, reg);
505 
506 		/* ME mask */
507 		reg = pci_read_config32(HOST_BRIDGE, MESEG_MASK);
508 		val = (0x80000 - me_uma_size) & 0xfff;
509 		reg = (reg & ~0xfff00000) | (val << 20);
510 		reg = reg | ME_STLEN_EN;	/* Set ME memory enable */
511 		reg = reg | MELCK;		/* Set lock bit on ME mem */
512 		printk(BIOS_DEBUG, "PCI(0, 0, 0)[%x] = %x\n", MESEG_MASK, reg);
513 		pci_write_config32(HOST_BRIDGE, MESEG_MASK, reg);
514 	}
515 }
516 
/*
 * Run a single ZQCS sequence on one populated rank, waiting for the IOSAV
 * engine before and after. Channel 0 is used if it has any rank, otherwise
 * channel 1; within the channel, rank 0 is used if present, otherwise rank 2.
 */
static void write_reset(ramctr_timing *ctrl)
{
	int channel, slotrank;

	/* Choose a populated channel */
	if (ctrl->rankmap[0])
		channel = 0;
	else
		channel = 1;

	wait_for_iosav(channel);

	/* Choose a populated rank */
	if (ctrl->rankmap[channel] & 1)
		slotrank = 0;
	else
		slotrank = 2;

	iosav_write_zqcs_sequence(channel, slotrank, 3, 8, 0);

	/* This is actually using the IOSAV state machine as a timer */
	iosav_run_queue(channel, 1, 1);

	wait_for_iosav(channel);
}
536 
/*
 * Perform the DIMM reset sequence: wait for the hardware to be ready, pulse
 * the DIMM reset signal (200us asserted, 500us after release), enable DCLK,
 * then program the rank bits of every channel and issue a write reset.
 * Both initial polls spin without a timeout.
 */
void dram_jedecreset(ramctr_timing *ctrl)
{
	u32 reg;
	int channel;

	/* Spin until bit 16 of RCOMP_TIMER is set */
	do {
		reg = mchbar_read32(RCOMP_TIMER);
	} while (!(reg & (1 << 16)));

	/* Spin until bit 2 or bit 4 of the channel 0 IOSAV status is set */
	while ((mchbar_read32(IOSAV_STATUS_ch(0)) & 0x14) == 0)
		;

	/*
	 * Set state of memory controller. 0x112 already has bit 1 (DDR reset)
	 * set, so the second MC_INIT_STATE_G write carries the same value.
	 */
	mchbar_write32(MC_INIT_STATE_G, 0x112);
	mchbar_write32(MC_INIT_STATE, 0);
	mchbar_write32(MC_INIT_STATE_G, 0x112 | 2);

	/* Assert DIMM reset signal */
	mchbar_clrbits32(MC_INIT_STATE_G, 1 << 1);

	/* Wait 200us */
	udelay(200);

	/* Deassert DIMM reset signal */
	mchbar_setbits32(MC_INIT_STATE_G, 1 << 1);

	/* Wait 500us */
	udelay(500);

	/* Enable DCLK */
	mchbar_setbits32(MC_INIT_STATE_G, 1 << 2);

	/* XXX Wait 20ns */
	udelay(1);

	FOR_ALL_CHANNELS {
		const u32 ranks = ctrl->rankmap[channel];

		/* Set valid rank CKE */
		mchbar_write32(MC_INIT_STATE_ch(channel), ranks);

		/* Wait 10ns for ranks to settle (no explicit delay is performed) */

		/* Second write: the rank map is additionally placed in bits 7:4 */
		mchbar_write32(MC_INIT_STATE_ch(channel), (ranks & ~0xf0) | (ranks << 4));

		/* Write reset using a NOP */
		write_reset(ctrl);
	}
}
588 
/*
 * DDR3 Rank1 Address mirror swap the following pins:
 * A3<->A4, A5<->A6, A7<->A8, BA0<->BA1
 *
 * @param bank  In/out: bank address; bits 0 and 1 are exchanged and all
 *              higher bits are cleared.
 * @param addr  In/out: address; the pairs of bits 3/4, 5/6 and 7/8 are
 *              exchanged, every other bit is kept.
 */
static void ddr3_mirror_mrreg(int *bank, u32 *addr)
{
	const int ba = *bank;
	const u32 a = *addr;

	const u32 lower_bits = a & 0x0a8;	/* A3, A5, A7 */
	const u32 upper_bits = a & 0x150;	/* A4, A6, A8 */

	*bank = ((ba & 2) >> 1) | ((ba & 1) << 1);
	*addr = (a & ~0x1f8) | (upper_bits >> 1) | (lower_bits << 1);
}
598 
/*
 * Write a DDR3 mode register on one rank using the IOSAV engine.
 *
 * The MRS command is issued three times with the same bank/address: once
 * plain, once with ranksel_ap = 1, and once plain again followed by a wait
 * of tMOD. If the rank is marked as mirrored in ctrl->rank_mirror, the bank
 * and address bits are swapped first.
 *
 * @param ctrl      RAM controller state (rank_mirror and tMOD are read).
 * @param channel   Channel to issue the commands on.
 * @param slotrank  Target rank.
 * @param reg       Mode register number; sent as the bank address.
 * @param val       Mode register value; sent as the address.
 */
static void write_mrreg(ramctr_timing *ctrl, int channel, int slotrank, int reg, u32 val)
{
	wait_for_iosav(channel);

	/* Mirrored ranks need the bank and address bits swapped */
	if (ctrl->rank_mirror[channel][slotrank])
		ddr3_mirror_mrreg(&reg, &val);

	const struct iosav_ssq sequence[] = {
		/* DRAM command MRS */
		[0] = {
			.sp_cmd_ctrl = {
				.command = IOSAV_MRS,
			},
			.subseq_ctrl = {
				.cmd_executions = 1,
				.cmd_delay_gap  = 4,
				.post_ssq_wait  = 4,
				.data_direction = SSQ_NA,
			},
			.sp_cmd_addr = {
				.address = val,
				.rowbits = 6,
				.bank    = reg,
				.rank    = slotrank,
			},
		},
		/* DRAM command MRS */
		[1] = {
			.sp_cmd_ctrl = {
				.command    = IOSAV_MRS,
				.ranksel_ap = 1,
			},
			.subseq_ctrl = {
				.cmd_executions = 1,
				.cmd_delay_gap  = 4,
				.post_ssq_wait  = 4,
				.data_direction = SSQ_NA,
			},
			.sp_cmd_addr = {
				.address = val,
				.rowbits = 6,
				.bank    = reg,
				.rank    = slotrank,
			},
		},
		/* DRAM command MRS, followed by a tMOD wait */
		[2] = {
			.sp_cmd_ctrl = {
				.command = IOSAV_MRS,
			},
			.subseq_ctrl = {
				.cmd_executions = 1,
				.cmd_delay_gap  = 4,
				.post_ssq_wait  = ctrl->tMOD,
				.data_direction = SSQ_NA,
			},
			.sp_cmd_addr = {
				.address = val,
				.rowbits = 6,
				.bank    = reg,
				.rank    = slotrank,
			},
		},
	};
	iosav_write_sequence(channel, sequence, ARRAY_SIZE(sequence));

	iosav_run_once_and_wait(channel);
}
667 
668 /* Obtain optimal power down mode for current configuration */
get_power_down_mode(ramctr_timing * ctrl,int channel)669 static enum power_down_mode get_power_down_mode(ramctr_timing *ctrl, int channel)
670 {
671 	int slotrank;
672 
673 	if (ctrl->tXP > 8)
674 		return PDM_NONE;
675 
676 	if (ctrl->tXPDLL > 32)
677 		return PDM_PPD;
678 
679 	FOR_ALL_POPULATED_RANKS
680 		if (!ctrl->info.dimm[channel][slotrank >> 1].flags.dll_off_mode)
681 			return PDM_APD_PPD;
682 
683 	if (CONFIG(RAMINIT_ALWAYS_ALLOW_DLL_OFF) || get_platform_type() == PLATFORM_MOBILE)
684 		return PDM_DLL_OFF;
685 
686 	return PDM_APD_PPD;
687 }
688 
make_mr0(ramctr_timing * ctrl,int channel,u8 rank)689 static u32 make_mr0(ramctr_timing *ctrl, int channel, u8 rank)
690 {
691 	u16 mr0reg, mch_cas, mch_wr;
692 	static const u8 mch_wr_t[12] = { 1, 2, 3, 4, 0, 5, 0, 6, 0, 7, 0, 0 };
693 
694 	const enum power_down_mode power_down = get_power_down_mode(ctrl, channel);
695 
696 	const bool slow_exit = power_down == PDM_DLL_OFF || power_down == PDM_APD_DLL_OFF;
697 
698 	/* Convert CAS to MCH register friendly */
699 	if (ctrl->CAS < 12) {
700 		mch_cas = (u16)((ctrl->CAS - 4) << 1);
701 	} else {
702 		mch_cas = (u16)(ctrl->CAS - 12);
703 		mch_cas = ((mch_cas << 1) | 0x1);
704 	}
705 
706 	/* Convert tWR to MCH register friendly */
707 	mch_wr = mch_wr_t[ctrl->tWR - 5];
708 
709 	/* DLL Reset - self clearing - set after CLK frequency has been changed */
710 	mr0reg = 1 << 8;
711 
712 	mr0reg |= (mch_cas & 0x1) << 2;
713 	mr0reg |= (mch_cas & 0xe) << 3;
714 	mr0reg |= mch_wr << 9;
715 
716 	/* Precharge PD - Use slow exit when DLL-off is used - mostly power-saving feature */
717 	mr0reg |= !slow_exit << 12;
718 	return mr0reg;
719 }
720 
/* Compute the MR0 value for the channel and write it to the given rank. */
static void dram_mr0(ramctr_timing *ctrl, u8 rank, int channel)
{
	const u32 mr0reg = make_mr0(ctrl, channel, rank);

	write_mrreg(ctrl, channel, rank, 0, mr0reg);
}
725 
get_ODT(ramctr_timing * ctrl,int channel)726 static odtmap get_ODT(ramctr_timing *ctrl, int channel)
727 {
728 	/* Get ODT based on rankmap */
729 	int dimms_per_ch = (ctrl->rankmap[channel] & 1) + ((ctrl->rankmap[channel] >> 2) & 1);
730 
731 	if (dimms_per_ch == 1) {
732 		return (const odtmap){60,  60};
733 	} else {
734 		return (const odtmap){120, 30};
735 	}
736 }
737 
/*
 * Translate an ODT resistance into mode-register bits.
 *
 * @param odt  Resistance value; 30, 60 and 120 are recognised.
 * @return     Bits 9 and 2 for 30, bit 2 for 60, bit 6 for 120,
 *             and 0 for 0 or any other value.
 */
static u32 encode_odt(u32 odt)
{
	if (odt == 30)
		return (1 << 9) | (1 << 2);	/* RZQ/8, RZQ/4 */

	if (odt == 60)
		return (1 << 2);	/* RZQ/4 */

	if (odt == 120)
		return (1 << 6);	/* RZQ/2 */

	/* 0 and every unrecognised value */
	return 0;
}
752 
make_mr1(ramctr_timing * ctrl,u8 rank,int channel)753 static u32 make_mr1(ramctr_timing *ctrl, u8 rank, int channel)
754 {
755 	odtmap odt;
756 	u32 mr1reg;
757 
758 	odt = get_ODT(ctrl, channel);
759 	mr1reg = 2;
760 
761 	mr1reg |= encode_odt(odt.rttnom);
762 
763 	return mr1reg;
764 }
765 
/* Compute the MR1 value for the channel and write it to the given rank. */
static void dram_mr1(ramctr_timing *ctrl, u8 rank, int channel)
{
	/* Only the low 16 bits of the computed value are written */
	const u16 mr1reg = make_mr1(ctrl, rank, channel);

	write_mrreg(ctrl, channel, rank, 1, mr1reg);
}
774 
/*
 * Build and write MR2 for one rank, then update the channel's MR2 shadow
 * register to match.
 *
 * MR2 layout produced: PASR (always 0) in the low bits, CWL - 5 from bit 3,
 * auto self-refresh in bit 6, SRT in bit 7 and rttwr / 60 from bit 9.
 * SRT is only considered on Ivy Bridge with tCK >= TCK_1066MHZ, and is then
 * set when the extended temperature range is used without auto self-refresh.
 */
static void dram_mr2(ramctr_timing *ctrl, u8 rank, int channel)
{
	/* Per-DIMM fields of the shadow register: bits 7:6 and bits 15:14 */
	const u32 shadow_srt_mask    = 3 << 6;
	const u32 shadow_mirror_mask = 3 << 14;

	const odtmap odt = get_ODT(ctrl, channel);
	const u16 pasr = 0;
	const u16 cwl = ctrl->CWL - 5;

	int srt = 0;
	if (IS_IVY_CPU(ctrl->cpu) && ctrl->tCK >= TCK_1066MHZ)
		srt = ctrl->extended_temperature_range && !ctrl->auto_self_refresh;

	const u16 mr2reg = pasr
			 | (cwl << 3)
			 | (ctrl->auto_self_refresh << 6)
			 | (srt << 7)
			 | ((odt.rttwr / 60) << 9);

	write_mrreg(ctrl, channel, rank, 2, mr2reg);

	/* Program MR2 shadow */
	u32 reg32 = mchbar_read32(TC_MR2_SHADOW_ch(channel));

	/* Keep only the per-DIMM fields, then merge in MR2 without its bits 7:6 */
	reg32 &= shadow_mirror_mask | shadow_srt_mask;
	reg32 |= mr2reg & ~shadow_srt_mask;

	/* rank / 2 selects which bit of each per-DIMM field is set */
	if (srt)
		reg32 |= 1 << (rank / 2 + 6);

	if (ctrl->rank_mirror[channel][rank])
		reg32 |= 1 << (rank / 2 + 14);

	mchbar_write32(TC_MR2_SHADOW_ch(channel), reg32);
}
809 
/* Write MR3 of the given rank with a value of 0. */
static void dram_mr3(ramctr_timing *ctrl, u8 rank, int channel)
{
	const u32 mr3reg = 0;

	write_mrreg(ctrl, channel, rank, 3, mr3reg);
}
814 
/*
 * Program the mode registers of every populated rank (in the order MR2, MR3,
 * MR1, MR0), broadcast a NOP + ZQCL sequence to all channels, enable
 * refresh, and finally run a ZQCS sequence on one populated rank of each
 * populated channel.
 */
void dram_mrscommands(ramctr_timing *ctrl)
{
	u8 slotrank;
	int channel;

	FOR_ALL_POPULATED_CHANNELS {
		FOR_ALL_POPULATED_RANKS {
			dram_mr2(ctrl, slotrank, channel);
			dram_mr3(ctrl, slotrank, channel);
			dram_mr1(ctrl, slotrank, channel);
			dram_mr0(ctrl, slotrank, channel);
		}
	}

	const struct iosav_ssq zqcl_sequence[] = {
		/* DRAM command NOP (without ODT nor chip selects) */
		[0] = {
			.sp_cmd_ctrl = {
				.command = IOSAV_NOP & ~(0xff << 8),
			},
			.subseq_ctrl = {
				.cmd_executions = 1,
				.cmd_delay_gap  = 4,
				.post_ssq_wait  = 15,
				.data_direction = SSQ_NA,
			},
			.sp_cmd_addr = {
				.address = 2,
				.rowbits = 6,
				.bank    = 0,
				.rank    = 0,
			},
		},
		/* DRAM command ZQCL */
		[1] = {
			.sp_cmd_ctrl = {
				.command    = IOSAV_ZQCS,
				.ranksel_ap = 1,
			},
			.subseq_ctrl = {
				.cmd_executions = 1,
				.cmd_delay_gap  = 4,
				.post_ssq_wait  = 400,
				.data_direction = SSQ_NA,
			},
			.sp_cmd_addr = {
				.address = 1 << 10,
				.rowbits = 6,
				.bank    = 0,
				.rank    = 0,
			},
			.addr_update = {
				.inc_rank  = 1,
				.addr_wrap = 20,
			},
		},
	};

	/* Queue the sequence on all channels at once, then wait for each one */
	iosav_write_sequence(BROADCAST_CH, zqcl_sequence, ARRAY_SIZE(zqcl_sequence));
	iosav_run_queue(BROADCAST_CH, 4, 0);

	FOR_ALL_CHANNELS {
		wait_for_iosav(channel);
	}

	/* Refresh enable */
	mchbar_setbits32(MC_INIT_STATE_G, 1 << 3);

	FOR_ALL_POPULATED_CHANNELS {
		mchbar_clrbits32(SCHED_CBIT_ch(channel), 1 << 21);

		wait_for_iosav(channel);

		/* Use rank 0 if it is populated, rank 2 otherwise */
		if (ctrl->rankmap[channel] & 1)
			slotrank = 0;
		else
			slotrank = 2;

		wait_for_iosav(channel);

		iosav_write_zqcs_sequence(channel, slotrank, 4, 101, 31);

		iosav_run_once_and_wait(channel);
	}
}
904 
905 static const u32 lane_base[] = {
906 	LANEBASE_B0, LANEBASE_B1, LANEBASE_B2, LANEBASE_B3,
907 	LANEBASE_B4, LANEBASE_B5, LANEBASE_B6, LANEBASE_B7,
908 	LANEBASE_ECC
909 };
910 
911 /* Maximum delay for command, control, clock */
912 #define CCC_MAX_PI	(2 * QCLK_PI - 1)
913 
program_timings(ramctr_timing * ctrl,int channel)914 void program_timings(ramctr_timing *ctrl, int channel)
915 {
916 	u32 reg_roundtrip_latency, reg_io_latency;
917 	int lane;
918 	int slotrank, slot;
919 
920 	u32 ctl_delay[NUM_SLOTS] = { 0 };
921 	int cmd_delay = 0;
922 
923 	/* Enable CLK XOVER */
924 	u32 clk_pi_coding = get_XOVER_CLK(ctrl->rankmap[channel]);
925 	u32 clk_logic_dly = 0;
926 
927 	/*
928 	 * Compute command timing as abs() of the most negative PI code
929 	 * across all ranks. Use zero if none of the values is negative.
930 	 */
931 	FOR_ALL_POPULATED_RANKS {
932 		cmd_delay = MAX(cmd_delay, -ctrl->timings[channel][slotrank].pi_coding);
933 	}
934 	if (cmd_delay > CCC_MAX_PI) {
935 		printk(BIOS_ERR, "C%d command delay overflow: %d\n", channel, cmd_delay);
936 		cmd_delay = CCC_MAX_PI;
937 	}
938 
939 	for (slot = 0; slot < NUM_SLOTS; slot++) {
940 		const int pi_coding_0 = ctrl->timings[channel][2 * slot + 0].pi_coding;
941 		const int pi_coding_1 = ctrl->timings[channel][2 * slot + 1].pi_coding;
942 
943 		const u8 slot_map = (ctrl->rankmap[channel] >> (2 * slot)) & 3;
944 
945 		if (slot_map & 1)
946 			ctl_delay[slot] += pi_coding_0 + cmd_delay;
947 
948 		if (slot_map & 2)
949 			ctl_delay[slot] += pi_coding_1 + cmd_delay;
950 
951 		/* If both ranks in a slot are populated, use the average */
952 		if (slot_map == 3)
953 			ctl_delay[slot] /= 2;
954 
955 		if (ctl_delay[slot] > CCC_MAX_PI) {
956 			printk(BIOS_ERR, "C%dS%d control delay overflow: %d\n",
957 				channel, slot, ctl_delay[slot]);
958 			ctl_delay[slot] = CCC_MAX_PI;
959 		}
960 	}
961 	FOR_ALL_POPULATED_RANKS {
962 		int clk_delay = ctrl->timings[channel][slotrank].pi_coding + cmd_delay;
963 
964 		/*
965 		 * Clock is a differential signal, whereas command and control are not.
966 		 * This affects its timing, and it is also why it needs a magic offset.
967 		 */
968 		clk_delay += ctrl->pi_code_offset;
969 
970 		/* Can never happen with valid values */
971 		if (clk_delay < 0) {
972 			printk(BIOS_ERR, "C%dR%d clock delay underflow: %d\n",
973 				channel, slotrank, clk_delay);
974 			clk_delay = 0;
975 		}
976 
977 		/* Clock can safely wrap around because it is a periodic signal */
978 		clk_delay %= CCC_MAX_PI + 1;
979 
980 		clk_pi_coding |= (clk_delay % QCLK_PI) << (6 * slotrank);
981 		clk_logic_dly |= (clk_delay / QCLK_PI) << slotrank;
982 	}
983 
984 	/* Enable CMD XOVER */
985 	union gdcr_cmd_pi_coding_reg cmd_pi_coding = {
986 		.raw = get_XOVER_CMD(ctrl->rankmap[channel]),
987 	};
988 	cmd_pi_coding.cmd_pi_code = cmd_delay % QCLK_PI;
989 	cmd_pi_coding.cmd_logic_delay = cmd_delay / QCLK_PI;
990 
991 	cmd_pi_coding.ctl_pi_code_d0 = ctl_delay[0] % QCLK_PI;
992 	cmd_pi_coding.ctl_pi_code_d1 = ctl_delay[1] % QCLK_PI;
993 	cmd_pi_coding.ctl_logic_delay_d0 = ctl_delay[0] / QCLK_PI;
994 	cmd_pi_coding.ctl_logic_delay_d1 = ctl_delay[1] / QCLK_PI;
995 
996 	mchbar_write32(GDCRCMDPICODING_ch(channel), cmd_pi_coding.raw);
997 
998 	mchbar_write32(GDCRCKPICODE_ch(channel), clk_pi_coding);
999 	mchbar_write32(GDCRCKLOGICDELAY_ch(channel), clk_logic_dly);
1000 
1001 	reg_io_latency = mchbar_read32(SC_IO_LATENCY_ch(channel));
1002 	reg_io_latency &= ~0xffff;
1003 
1004 	reg_roundtrip_latency = 0;
1005 
1006 	FOR_ALL_POPULATED_RANKS {
1007 		reg_io_latency |= ctrl->timings[channel][slotrank].io_latency << (4 * slotrank);
1008 
1009 		reg_roundtrip_latency |=
1010 		    ctrl->timings[channel][slotrank].roundtrip_latency << (8 * slotrank);
1011 
1012 		FOR_ALL_LANES {
1013 			const u16 rcven = ctrl->timings[channel][slotrank].lanes[lane].rcven;
1014 			const u8 dqs_p = ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_p;
1015 			const u8 dqs_n = ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_n;
1016 			const union gdcr_rx_reg gdcr_rx = {
1017 				.rcven_pi_code     = rcven % QCLK_PI,
1018 				.rx_dqs_p_pi_code  = dqs_p,
1019 				.rcven_logic_delay = rcven / QCLK_PI,
1020 				.rx_dqs_n_pi_code  = dqs_n,
1021 			};
1022 			mchbar_write32(lane_base[lane] + GDCRRX(channel, slotrank),
1023 				gdcr_rx.raw);
1024 
1025 			const u16 tx_dqs = ctrl->timings[channel][slotrank].lanes[lane].tx_dqs;
1026 			const int tx_dq = ctrl->timings[channel][slotrank].lanes[lane].tx_dq;
1027 			const union gdcr_tx_reg gdcr_tx = {
1028 				.tx_dq_pi_code      = tx_dq % QCLK_PI,
1029 				.tx_dqs_pi_code     = tx_dqs % QCLK_PI,
1030 				.tx_dqs_logic_delay = tx_dqs / QCLK_PI,
1031 				.tx_dq_logic_delay  = tx_dq / QCLK_PI,
1032 			};
1033 			mchbar_write32(lane_base[lane] + GDCRTX(channel, slotrank),
1034 				gdcr_tx.raw);
1035 		}
1036 	}
1037 	mchbar_write32(SC_ROUNDT_LAT_ch(channel), reg_roundtrip_latency);
1038 	mchbar_write32(SC_IO_LATENCY_ch(channel), reg_io_latency);
1039 }
1040 
/*
 * Run a single receive-enable test on one rank.
 *
 * Waits for the IOSAV engine of the channel, loads the MPR read sequence and
 * executes it once. Nothing is returned; the outcome is read back separately
 * from the per-lane training result registers (see does_lane_work()).
 */
static void test_rcven(ramctr_timing *ctrl, int channel, int slotrank)
{
	wait_for_iosav(channel);

	/* Send a burst of 16 back-to-back read commands (4 DCLK apart) */
	const int cas_plus_36 = ctrl->CAS + 36;

	iosav_write_read_mpr_sequence(channel, slotrank, ctrl->tMOD, 1, 3, 15, cas_plus_36);

	iosav_run_once_and_wait(channel);
}
1050 
/*
 * Fetch the training result bit that belongs to the lane's current rcven value.
 *
 * Bit 5 of rcven selects which of the two training result registers is read,
 * and the low five bits select the bit inside that register.
 *
 * Returns the selected bit (0 or 1).
 */
static int does_lane_work(ramctr_timing *ctrl, int channel, int slotrank, int lane)
{
	const u32 rcven = ctrl->timings[channel][slotrank].lanes[lane].rcven;
	const u32 result_sel = (rcven / 32) & 1;
	const u32 result_bit = rcven % 32;
	const u32 result = mchbar_read32(lane_base[lane] +
					 GDCRTRAININGRESULT(channel, result_sel));

	return (result >> result_bit) & 1;
}
1058 
/* Description of a run of zero entries, as produced by get_longest_zero_run() */
struct run {
	int middle;	/* Index of the midpoint of the run (rounded down) */
	int end;	/* Index of the last entry of the run */
	int start;	/* Index of the first entry of the run */
	int all;	/* Nonzero when no bounded run was found (see below) */
	int length;	/* Number of entries in the run */
};

/*
 * Locate the longest run of zero entries in seq[0 .. sz - 1]. The array is
 * treated as circular, so a run may wrap from the last index to the first.
 * Among runs of equal length, the one that starts first wins.
 *
 * When no zero run terminated by a nonzero entry exists (this is the case both
 * when every entry is zero and when no entry is zero), the result has `all`
 * set and spans the whole array: start 0, end sz, middle sz / 2, length sz.
 * Otherwise `all` is clear and start, middle and end are indices modulo sz.
 */
static struct run get_longest_zero_run(int *seq, int sz)
{
	struct run result;
	int best_len = 0;
	int best_start = 0;
	int run_start = 0;
	int pos;

	/* Walk the array twice so that a run wrapping around the end is seen whole */
	for (pos = 0; pos < 2 * sz; pos++) {
		if (!seq[pos % sz])
			continue;

		/* A nonzero entry closes the run that began at run_start */
		const int run_len = pos - run_start;

		if (run_len > best_len) {
			best_len = run_len;
			best_start = run_start;
		}
		run_start = pos + 1;
	}

	if (best_len != 0) {
		result.start  = best_start % sz;
		result.end    = (best_start + best_len - 1) % sz;
		result.middle = (best_start + (best_len - 1) / 2) % sz;
		result.length = best_len;
		result.all    = 0;
		return result;
	}

	/* Degenerate case: report the complete array */
	result.middle = sz / 2;
	result.start  = 0;
	result.end    = sz;
	result.length = sz;
	result.all    = 1;

	return result;
}
1099 
1100 #define RCVEN_COARSE_PI_LENGTH	(2 * QCLK_PI)
1101 
find_rcven_pi_coarse(ramctr_timing * ctrl,int channel,int slotrank,int * upperA)1102 static void find_rcven_pi_coarse(ramctr_timing *ctrl, int channel, int slotrank, int *upperA)
1103 {
1104 	int rcven;
1105 	int statistics[NUM_LANES][RCVEN_COARSE_PI_LENGTH];
1106 	int lane;
1107 
1108 	for (rcven = 0; rcven < RCVEN_COARSE_PI_LENGTH; rcven++) {
1109 		FOR_ALL_LANES {
1110 			ctrl->timings[channel][slotrank].lanes[lane].rcven = rcven;
1111 		}
1112 		program_timings(ctrl, channel);
1113 
1114 		test_rcven(ctrl, channel, slotrank);
1115 
1116 		FOR_ALL_LANES {
1117 			statistics[lane][rcven] =
1118 				!does_lane_work(ctrl, channel, slotrank, lane);
1119 		}
1120 	}
1121 	FOR_ALL_LANES {
1122 		struct run rn = get_longest_zero_run(statistics[lane], RCVEN_COARSE_PI_LENGTH);
1123 		ctrl->timings[channel][slotrank].lanes[lane].rcven = rn.middle;
1124 		upperA[lane] = rn.end;
1125 		if (upperA[lane] < rn.middle)
1126 			upperA[lane] += 2 * QCLK_PI;
1127 
1128 		printram("rcven: %d, %d, %d: % 4d-% 4d-% 4d\n",
1129 			 channel, slotrank, lane, rn.start, rn.middle, rn.end);
1130 	}
1131 }
1132 
/*
 * Fine receive-enable search for one rank.
 *
 * For every offset in [-25, +25], rcven of each lane is set to
 * upperA[lane] + offset + QCLK_PI and the test is repeated 100 times while
 * counting how often does_lane_work() returned nonzero. Per lane, two offsets
 * are then extracted from the counters:
 *  - last_zero: the offset just before the first one with a nonzero count
 *  - first_all: the first offset for which all 100 tests returned nonzero
 * The final rcven is upperA[lane] plus the average of the two.
 */
static void fine_tune_rcven_pi(ramctr_timing *ctrl, int channel, int slotrank, int *upperA)
{
	enum {
		FINE_RCVEN_RANGE   = 25,
		FINE_RCVEN_STEPS   = 2 * FINE_RCVEN_RANGE + 1,
		FINE_RCVEN_REPEATS = 100,
	};
	struct ram_rank_timings *const rank = &ctrl->timings[channel][slotrank];
	int statistics[NUM_LANES][FINE_RCVEN_STEPS] = {0};
	int lane, step, attempt;

	for (step = 0; step < FINE_RCVEN_STEPS; step++) {
		const int rcven_delta = step - FINE_RCVEN_RANGE;

		FOR_ALL_LANES {
			rank->lanes[lane].rcven = upperA[lane] + rcven_delta + QCLK_PI;
		}
		program_timings(ctrl, channel);

		for (attempt = 0; attempt < FINE_RCVEN_REPEATS; attempt++) {
			test_rcven(ctrl, channel, slotrank);
			FOR_ALL_LANES {
				statistics[lane][step] +=
					does_lane_work(ctrl, channel, slotrank, lane);
			}
		}
	}

	FOR_ALL_LANES {
		int first_hit = 0;
		int first_full = 0;

		/* Index of the first step with at least one nonzero result */
		while (first_hit < FINE_RCVEN_STEPS && !statistics[lane][first_hit])
			first_hit++;

		/* Index of the first step where every repetition was nonzero */
		while (first_full < FINE_RCVEN_STEPS &&
		       statistics[lane][first_full] != FINE_RCVEN_REPEATS)
			first_full++;

		/* Convert the indices back into offsets */
		const int last_zero = first_hit - FINE_RCVEN_RANGE - 1;
		const int first_all = first_full - FINE_RCVEN_RANGE;

		printram("lane %d: %d, %d\n", lane, last_zero, first_all);

		rank->lanes[lane].rcven = (last_zero + first_all) / 2 + upperA[lane];

		printram("Aval: %d, %d, %d: % 4d\n", channel, slotrank,
			lane, rank->lanes[lane].rcven);
	}
}
1175 
1176 /*
1177  * Once the DQS high phase has been found (for each DRAM) the next stage
1178  * is to find out the round trip latency, by locating the preamble cycle.
1179  * This is achieved by trying smaller and smaller roundtrip values until
1180  * the strobe sampling is done on the preamble cycle.
1181  */
find_roundtrip_latency(ramctr_timing * ctrl,int channel,int slotrank,int * upperA)1182 static int find_roundtrip_latency(ramctr_timing *ctrl, int channel, int slotrank, int *upperA)
1183 {
1184 	int works[NUM_LANES];
1185 	int lane;
1186 
1187 	while (1) {
1188 		int all_works = 1, some_works = 0;
1189 
1190 		program_timings(ctrl, channel);
1191 		test_rcven(ctrl, channel, slotrank);
1192 
1193 		FOR_ALL_LANES {
1194 			works[lane] = !does_lane_work(ctrl, channel, slotrank, lane);
1195 
1196 			if (works[lane])
1197 				some_works = 1;
1198 			else
1199 				all_works = 0;
1200 		}
1201 
1202 		/* If every lane is working, exit */
1203 		if (all_works)
1204 			return 0;
1205 
1206 		/*
1207 		 * If all bits are one (everyone is failing), decrement
1208 		 * the roundtrip value by two, and do another iteration.
1209 		 */
1210 		if (!some_works) {
1211 			/* Guard against roundtrip latency underflow */
1212 			if (ctrl->timings[channel][slotrank].roundtrip_latency < 2) {
1213 				printk(BIOS_EMERG, "Roundtrip latency underflow: %d, %d\n",
1214 				       channel, slotrank);
1215 				return MAKE_ERR;
1216 			}
1217 			ctrl->timings[channel][slotrank].roundtrip_latency -= 2;
1218 			printram("4024 -= 2;\n");
1219 			continue;
1220 		}
1221 
1222 		/*
1223 		 * Else (if some lanes are failing), increase the rank's
1224 		 * I/O latency by 2, and increase rcven logic delay by 2
1225 		 * on the working lanes, then perform another iteration.
1226 		 */
1227 		ctrl->timings[channel][slotrank].io_latency += 2;
1228 		printram("4028 += 2;\n");
1229 
1230 		/* Guard against I/O latency overflow */
1231 		if (ctrl->timings[channel][slotrank].io_latency >= 16) {
1232 			printk(BIOS_EMERG, "I/O latency overflow: %d, %d\n",
1233 			       channel, slotrank);
1234 			return MAKE_ERR;
1235 		}
1236 		FOR_ALL_LANES if (works[lane]) {
1237 			ctrl->timings[channel][slotrank].lanes[lane].rcven += 2 * QCLK_PI;
1238 			upperA[lane] += 2 * QCLK_PI;
1239 			printram("increment %d, %d, %d\n", channel, slotrank, lane);
1240 		}
1241 	}
1242 	return 0;
1243 }
1244 
/*
 * Compute the spread of the rcven logic delay (rcven >> 6) across the lanes
 * of one rank.
 *
 * The minimum starts at 7 and the maximum at 0. If the maximum ends up below
 * the minimum, an emergency message is printed and the assertion fires.
 *
 * Returns the largest logic delay minus the smallest one.
 */
static int get_logic_delay_delta(ramctr_timing *ctrl, int channel, int slotrank)
{
	const struct ram_rank_timings *const rank = &ctrl->timings[channel][slotrank];
	u16 lowest = 7;
	u16 highest = 0;
	int lane;

	FOR_ALL_LANES {
		const u16 logic_delay = rank->lanes[lane].rcven >> 6;

		if (logic_delay < lowest)
			lowest = logic_delay;

		if (logic_delay > highest)
			highest = logic_delay;
	}

	if (highest < lowest) {
		printk(BIOS_EMERG, "Logic delay max < min (%u < %u): %d, %d\n",
		       highest, lowest, channel, slotrank);
	}

	assert(highest >= lowest);

	return highest - lowest;
}
1267 
/*
 * Follow a change of the rank's logic delay spread with the latencies.
 *
 * The current spread is compared against `prev`: if it grew, both the I/O and
 * the roundtrip latency are incremented by one; if it shrank, both are
 * decremented by one; if it is unchanged, they are left alone.
 *
 * Returns the current spread, to be passed as `prev` to the next call.
 */
static int align_rt_io_latency(ramctr_timing *ctrl, int channel, int slotrank, int prev)
{
	struct ram_rank_timings *const rank = &ctrl->timings[channel][slotrank];

	/* Get changed maxima */
	const int post = get_logic_delay_delta(ctrl, channel, slotrank);

	/* Evaluates to +1, -1 or 0 */
	const int latency_offset = (post > prev) - (post < prev);

	rank->io_latency += latency_offset;
	rank->roundtrip_latency += latency_offset;
	printram("4024 += %d;\n", latency_offset);
	printram("4028 += %d;\n", latency_offset);

	return post;
}
1291 
/*
 * Remove the logic delay that all lanes of a rank have in common.
 *
 * The smallest rcven logic delay (rcven >> 6, capped at 7) over all lanes is
 * subtracted from every lane's rcven (shifted back into place) and from the
 * rank's I/O latency. A warning is printed when that minimum is 2 or more.
 */
static void compute_final_logic_delay(ramctr_timing *ctrl, int channel, int slotrank)
{
	struct ram_rank_timings *const rank = &ctrl->timings[channel][slotrank];
	u16 common_delay = 7;
	int lane;

	FOR_ALL_LANES {
		const u16 logic_delay = rank->lanes[lane].rcven >> 6;

		if (logic_delay < common_delay)
			common_delay = logic_delay;
	}

	if (common_delay >= 2) {
		printk(BIOS_WARNING, "Logic delay %u greater than 1: %d %d\n",
			common_delay, channel, slotrank);
	}

	FOR_ALL_LANES {
		rank->lanes[lane].rcven -= common_delay << 6;
	}
	rank->io_latency -= common_delay;
	printram("4028 -= %d;\n", common_delay);
}
1314 
/*
 * Receive enable calibration: determine rcven for every lane, as well as the
 * I/O and roundtrip latencies, of every populated rank on every channel.
 *
 * Per rank, the steps are:
 *  1. Run a precharge-all sequence and switch the training logic into receive
 *     enable mode for that rank.
 *  2. Start from I/O latency 4 and roundtrip latency 55, then do the coarse
 *     rcven search (find_rcven_pi_coarse).
 *  3. Normalize the coarse result against QCLK_PI (see the all_high/some_high
 *     handling below).
 *  4. Find the roundtrip latency, fine-tune rcven and strip the logic delay
 *     common to all lanes, realigning the latencies after each step.
 *  5. Leave training mode and toggle the I/O reset bit.
 *
 * Once all ranks are done, the timings of all populated channels are
 * programmed again.
 *
 * In the debug output, "4024" accompanies roundtrip latency changes and "4028"
 * accompanies I/O latency changes.
 *
 * Returns 0 on success, or the error from find_roundtrip_latency().
 */
int receive_enable_calibration(ramctr_timing *ctrl)
{
	int channel, slotrank, lane;
	int err;

	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
		int all_high, some_high;
		int upperA[NUM_LANES];
		int prev;

		wait_for_iosav(channel);

		iosav_write_prea_sequence(channel, slotrank, ctrl->tRP, 0);

		iosav_run_once_and_wait(channel);

		const union gdcr_training_mod_reg training_mod = {
			.receive_enable_mode = 1,
			.training_rank_sel   = slotrank,
			.odt_always_on       = 1,
		};
		mchbar_write32(GDCRTRAININGMOD, training_mod.raw);

		/* Initial latencies for the search */
		ctrl->timings[channel][slotrank].io_latency = 4;
		ctrl->timings[channel][slotrank].roundtrip_latency = 55;
		program_timings(ctrl, channel);

		find_rcven_pi_coarse(ctrl, channel, slotrank, upperA);

		/* Check which lanes ended up with rcven of QCLK_PI or more */
		all_high = 1;
		some_high = 0;
		FOR_ALL_LANES {
			if (ctrl->timings[channel][slotrank].lanes[lane].rcven >= QCLK_PI)
				some_high = 1;
			else
				all_high = 0;
		}

		if (all_high) {
			/* Move one QCLK_PI from every lane's rcven into the I/O latency */
			ctrl->timings[channel][slotrank].io_latency--;
			printram("4028--;\n");
			FOR_ALL_LANES {
				ctrl->timings[channel][slotrank].lanes[lane].rcven -= QCLK_PI;
				upperA[lane] -= QCLK_PI;
			}
		} else if (some_high) {
			/* Mixed result: bump both latencies, rcven stays as it is */
			ctrl->timings[channel][slotrank].roundtrip_latency++;
			ctrl->timings[channel][slotrank].io_latency++;
			printram("4024++;\n");
			printram("4028++;\n");
		}

		program_timings(ctrl, channel);

		/* Logic delay spread before the following steps modify rcven */
		prev = get_logic_delay_delta(ctrl, channel, slotrank);

		err = find_roundtrip_latency(ctrl, channel, slotrank, upperA);
		if (err)
			return err;

		prev = align_rt_io_latency(ctrl, channel, slotrank, prev);

		fine_tune_rcven_pi(ctrl, channel, slotrank, upperA);

		prev = align_rt_io_latency(ctrl, channel, slotrank, prev);

		compute_final_logic_delay(ctrl, channel, slotrank);

		align_rt_io_latency(ctrl, channel, slotrank, prev);

		printram("4/8: %d, %d, % 4d, % 4d\n", channel, slotrank,
		       ctrl->timings[channel][slotrank].roundtrip_latency,
		       ctrl->timings[channel][slotrank].io_latency);

		printram("final results:\n");
		FOR_ALL_LANES
			printram("Aval: %d, %d, %d: % 4d\n", channel, slotrank, lane,
			    ctrl->timings[channel][slotrank].lanes[lane].rcven);

		/* Leave training mode */
		mchbar_write32(GDCRTRAININGMOD, 0);

		toggle_io_reset();
	}

	FOR_ALL_POPULATED_CHANNELS {
		program_timings(ctrl, channel);
	}

	return 0;
}
1405 
/*
 * Run one write/read-back test on a rank for tx_dq training.
 *
 * The per-lane error counters of the channel are zeroed first; then a write
 * sequence is executed, followed by a precharge/activate/read sequence.
 * Callers pick up the outcome from IOSAV_By_ERROR_COUNT_ch afterwards.
 */
static void test_tx_dq(ramctr_timing *ctrl, int channel, int slotrank)
{
	int lane;

	FOR_ALL_LANES {
		mchbar_write32(IOSAV_By_ERROR_COUNT_ch(channel, lane), 0);

		/* The value read here is deliberately discarded */
		(void)mchbar_read32(IOSAV_By_BW_SERROR_C_ch(channel, lane));
	}

	wait_for_iosav(channel);

	/* Write sequence */
	iosav_write_misc_write_sequence(ctrl, channel, slotrank,
					MAX(ctrl->tRRD, (ctrl->tFAW >> 2) + 1), 4, 4, 500, 18);
	iosav_run_once_and_wait(channel);

	/* Read-back sequence */
	iosav_write_prea_act_read_sequence(ctrl, channel, slotrank);
	iosav_run_once_and_wait(channel);
}
1426 
/*
 * Binarize `count` samples in place around the midpoint of their range.
 *
 * The threshold is min / 2 + max / 2 of the samples. Every sample above the
 * threshold becomes 1, every other sample becomes 0.
 *
 * `data` has to hold at least one element, as data[0] is read unconditionally.
 */
static void tx_dq_threshold_process(int *data, const int count)
{
	int lowest = data[0];
	int highest = data[0];
	int idx;

	for (idx = 1; idx < count; idx++) {
		const int sample = data[idx];

		lowest = (sample < lowest) ? sample : lowest;
		highest = (sample > highest) ? sample : highest;
	}

	const int threshold = lowest / 2 + highest / 2;

	for (idx = 0; idx < count; idx++)
		data[idx] = (data[idx] > threshold) ? 1 : 0;

	printram("threshold=%d min=%d max=%d\n", threshold, lowest, highest);
}
1445 
/*
 * Train tx_dq for every lane of one rank.
 *
 * All tx_dq values in [0, MAX_TX_DQ] are applied to all lanes, and the error
 * count of every lane is recorded for each value. Per lane, tx_dq is then set
 * to the middle of the longest (circular) window of values without errors.
 *
 * If no window of at least 8 values is found, the error counts of that lane
 * are reduced to 0/1 around their midpoint and the search is repeated once.
 *
 * Returns 0 on success, or MAKE_ERR if the retry fails as well.
 */
static int tx_dq_write_leveling(ramctr_timing *ctrl, int channel, int slotrank)
{
	struct ram_rank_timings *const rank = &ctrl->timings[channel][slotrank];
	int stats[NUM_LANES][MAX_TX_DQ + 1];
	int lane;
	int tx_dq;

	wait_for_iosav(channel);
	iosav_write_prea_sequence(channel, slotrank, ctrl->tRP, 18);
	iosav_run_once_and_wait(channel);

	for (tx_dq = 0; tx_dq <= MAX_TX_DQ; tx_dq++) {
		FOR_ALL_LANES {
			rank->lanes[lane].tx_dq = tx_dq;
		}
		program_timings(ctrl, channel);

		test_tx_dq(ctrl, channel, slotrank);

		FOR_ALL_LANES {
			stats[lane][tx_dq] =
				mchbar_read32(IOSAV_By_ERROR_COUNT_ch(channel, lane));
		}
	}

	FOR_ALL_LANES {
		int *const lane_stats = stats[lane];
		const int num_stats = ARRAY_SIZE(stats[lane]);
		struct run rn = get_longest_zero_run(lane_stats, num_stats);

		if (rn.all || rn.length < 8) {
			printk(BIOS_EMERG, "tx_dq write leveling failed: %d, %d, %d\n",
			       channel, slotrank, lane);
			/*
			 * With command training not being done yet, the lane can be erroneous.
			 * Take the average as reference and try again to find a run.
			 */
			tx_dq_threshold_process(lane_stats, num_stats);
			rn = get_longest_zero_run(lane_stats, num_stats);

			if (rn.all || rn.length < 8) {
				printk(BIOS_EMERG, "tx_dq recovery failed\n");
				return MAKE_ERR;
			}
		}

		rank->lanes[lane].tx_dq = rn.middle;
		printram("tx_dq: %d, %d, %d: % 4d-% 4d-% 4d\n",
			channel, slotrank, lane, rn.start, rn.middle, rn.end);
	}

	return 0;
}
1493 
/* Count the populated channels whose index is lower than target_channel */
static int get_precedening_channels(ramctr_timing *ctrl, int target_channel)
{
	int channel;
	int preceding = 0;

	FOR_ALL_POPULATED_CHANNELS {
		if (channel < target_channel)
			preceding++;
	}

	return preceding;
}
1503 
/*
 * Program the WDB pattern length of a channel.
 *
 * Each cacheline is 64 bits long. The value written to the low byte of
 * IOSAV_DATA_CTL is the number of groups of eight cachelines, minus one.
 */
static void program_wdb_pattern_length(int channel, const unsigned int num_cachelines)
{
	const unsigned int encoded_length = num_cachelines / 8 - 1;

	mchbar_write8(IOSAV_DATA_CTL_ch(channel), encoded_length);
}
1509 
/*
 * Write a 16-dword test pattern for one channel and set the WDB pattern
 * length to 8.
 *
 * The dwords alternate in pairs: a, a, b, b, a, a, ... They are stored at
 * 0x04000000 plus 64 bytes for every populated channel preceding this one.
 */
static void fill_pattern0(ramctr_timing *ctrl, int channel, u32 a, u32 b)
{
	const unsigned int channel_offset = get_precedening_channels(ctrl, channel) * 64;
	unsigned int word;

	for (word = 0; word < 16; word++) {
		const uintptr_t addr = 0x04000000 + channel_offset + 4 * word;
		const u32 value = (word & 2) ? b : a;

		write32p(addr, value);
	}

	sfence();

	program_wdb_pattern_length(channel, 8);
}
1525 
num_of_channels(const ramctr_timing * ctrl)1526 static int num_of_channels(const ramctr_timing *ctrl)
1527 {
1528 	int ret = 0;
1529 	int channel;
1530 	FOR_ALL_POPULATED_CHANNELS ret++;
1531 	return ret;
1532 }
1533 
/*
 * Write a two-part test pattern for one channel and set the WDB pattern
 * length to 16.
 *
 * The first part is 16 dwords of all-ones at 0x04000000 plus 64 bytes per
 * preceding populated channel. The second part is 16 dwords of zeroes, located
 * 64 bytes times the number of populated channels after the first part.
 */
static void fill_pattern1(ramctr_timing *ctrl, int channel)
{
	const unsigned int channel_offset = get_precedening_channels(ctrl, channel) * 64;
	const unsigned int channel_step = 64 * num_of_channels(ctrl);
	unsigned int word;

	for (word = 0; word < 16; word++)
		write32p(0x04000000 + channel_offset + word * 4, 0xffffffff);

	for (word = 0; word < 16; word++)
		write32p(0x04000000 + channel_offset + channel_step + word * 4, 0);

	sfence();

	program_wdb_pattern_length(channel, 16);
}
1553 
/* Number of tx_dqs values swept by write_level_rank() */
#define TX_DQS_PI_LENGTH	(2 * QCLK_PI)

/*
 * Run the write leveling sweep on a single rank.
 *
 * The training logic is switched to write leveling mode for this rank and a
 * write leveling sequence with MR1 bit 7 set is loaded into the IOSAV engine.
 * That sequence is then executed once for every tx_dqs value in
 * [0, TX_DQS_PI_LENGTH), and the matching training result bit of each lane is
 * recorded in inverted form. Per lane, tx_dqs is set to the start of the
 * longest (circular) window of values whose result bit was set.
 *
 * Returns 0 on success, or MAKE_ERR if get_longest_zero_run() reported `all`
 * for a lane.
 */
static int write_level_rank(ramctr_timing *ctrl, int channel, int slotrank)
{
	int tx_dqs;
	int statistics[NUM_LANES][TX_DQS_PI_LENGTH];
	int lane;

	const union gdcr_training_mod_reg training_mod = {
		.write_leveling_mode = 1,
		.training_rank_sel   = slotrank,
		.enable_dqs_wl       = 5,
		.odt_always_on       = 1,
		.force_drive_enable  = 1,
	};
	mchbar_write32(GDCRTRAININGMOD, training_mod.raw);

	/* MR1 with bit 7 set; NOTE(review): write leveling enable per DDR3 MR1, confirm */
	u32 mr1reg = make_mr1(ctrl, slotrank, channel) | 1 << 7;
	int bank = 1;

	/* Mirrored ranks need the bank and register value remapped */
	if (ctrl->rank_mirror[channel][slotrank])
		ddr3_mirror_mrreg(&bank, &mr1reg);

	wait_for_iosav(channel);

	/* The sequence is written once and re-run for every tx_dqs value below */
	iosav_write_jedec_write_leveling_sequence(ctrl, channel, slotrank, bank, mr1reg);

	for (tx_dqs = 0; tx_dqs < TX_DQS_PI_LENGTH; tx_dqs++) {
		FOR_ALL_LANES {
			ctrl->timings[channel][slotrank].lanes[lane].tx_dqs = tx_dqs;
		}
		program_timings(ctrl, channel);

		iosav_run_once_and_wait(channel);

		/*
		 * Bit 5 of tx_dqs selects the result register, the low five bits
		 * select the bit within it. The inverted bit is stored, because
		 * the run finder looks for zeroes.
		 */
		FOR_ALL_LANES {
			statistics[lane][tx_dqs] =  !((mchbar_read32(lane_base[lane] +
				GDCRTRAININGRESULT(channel, (tx_dqs / 32) & 1)) >>
				(tx_dqs % 32)) & 1);
		}
	}
	FOR_ALL_LANES {
		struct run rn = get_longest_zero_run(statistics[lane], TX_DQS_PI_LENGTH);
		/*
		 * tx_dq is a direct function of tx_dqs's 6 LSBs. Some tests increment the
		 * value of tx_dqs by a small amount, which might cause the 6-bit value to
		 * overflow if it is close to 0x3f. If the value is likely to overflow,
		 * increment it by a small offset up front, so that it cannot overflow
		 * while running tests and brick the system due to a non-matching tx_dq.
		 *
		 * TODO: find out why some tests (edge write discovery) increment tx_dqs.
		 */
		if ((rn.start & 0x3f) == 0x3e)
			rn.start += 2;
		else if ((rn.start & 0x3f) == 0x3f)
			rn.start += 1;

		/* Note that tx_dqs is stored even when the failure check below triggers */
		ctrl->timings[channel][slotrank].lanes[lane].tx_dqs = rn.start;
		if (rn.all) {
			printk(BIOS_EMERG, "JEDEC write leveling failed: %d, %d, %d\n",
			       channel, slotrank, lane);

			return MAKE_ERR;
		}
		printram("tx_dqs: %d, %d, %d: % 4d-% 4d-% 4d\n",
				 channel, slotrank, lane, rn.start, rn.middle, rn.end);
	}
	return 0;
}
1623 
/*
 * Translate a 64-bit training result into a tx_dqs adjustment.
 *
 * Returns 0 if all bits are set. Otherwise, the value is probed with shift
 * amounts of 60, 52, ..., 4 bits (step index i = 0..7):
 *  - top nibble all ones (DQS is late): the value is shifted left, and the
 *    first step that leaves a nonzero remainder yields -i
 *  - else (DQS is early): the value is shifted right, and the first step that
 *    leaves a nonzero remainder yields +i
 * If no step leaves a nonzero remainder, 8 is returned.
 */
static int get_dqs_flyby_adjust(u64 val)
{
	int i;

	/* DQS is good enough */
	if (val == 0xffffffffffffffffULL)
		return 0;

	/* Late DQS needs a negative adjustment, early DQS a positive one */
	const int dqs_is_late = (val >= 0xf000000000000000ULL);

	for (i = 0; i < 8; i++) {
		const int shift = 8 * (7 - i) + 4;
		const u64 remainder = dqs_is_late ? (val << shift) : (val >> shift);

		if (remainder)
			return dqs_is_late ? -i : i;
	}

	return 8;
}
1643 
/*
 * Measure and correct tx_dqs of every lane on all populated ranks.
 *
 * With the training logic set to report DQ/DQS training results, each rank is
 * written using the pattern set up by fill_pattern1() and then read back with
 * a PREA/ACT/RD sequence. The 64-bit training result of each lane is passed
 * to get_dqs_flyby_adjust(), and the returned adjustment, multiplied by
 * QCLK_PI, is added to the lane's tx_dqs.
 *
 * Only ctrl->timings is updated; this function does not call
 * program_timings() itself.
 */
static void train_write_flyby(ramctr_timing *ctrl)
{
	int channel, slotrank, lane, old;

	const union gdcr_training_mod_reg training_mod = {
		.dq_dqs_training_res = 1,
	};
	mchbar_write32(GDCRTRAININGMOD, training_mod.raw);

	FOR_ALL_POPULATED_CHANNELS {
		fill_pattern1(ctrl, channel);
	}
	FOR_ALL_POPULATED_CHANNELS FOR_ALL_POPULATED_RANKS {
		/* Reset read and write WDB pointers */
		mchbar_write32(IOSAV_DATA_CTL_ch(channel), 0x10001);

		wait_for_iosav(channel);

		/* Write phase */
		iosav_write_misc_write_sequence(ctrl, channel, slotrank, 3, 1, 3, 3, 31);

		iosav_run_once_and_wait(channel);

		/* Read phase: precharge all, activate, then a single read */
		const struct iosav_ssq rd_sequence[] = {
			/* DRAM command PREA */
			[0] = {
				.sp_cmd_ctrl = {
					.command    = IOSAV_PRE,
					.ranksel_ap = 1,
				},
				.subseq_ctrl = {
					.cmd_executions = 1,
					.cmd_delay_gap  = 3,
					.post_ssq_wait  = ctrl->tRP,
					.data_direction = SSQ_NA,
				},
				.sp_cmd_addr = {
					.address = 1 << 10,
					.rowbits = 6,
					.bank    = 0,
					.rank    = slotrank,
				},
				.addr_update = {
					.addr_wrap = 18,
				},
			},
			/* DRAM command ACT */
			[1] = {
				.sp_cmd_ctrl = {
					.command    = IOSAV_ACT,
					.ranksel_ap = 1,
				},
				.subseq_ctrl = {
					.cmd_executions = 1,
					.cmd_delay_gap  = 3,
					.post_ssq_wait  = ctrl->tRCD,
					.data_direction = SSQ_NA,
				},
				.sp_cmd_addr = {
					.address = 0,
					.rowbits = 6,
					.bank    = 0,
					.rank    = slotrank,
				},
			},
			/* DRAM command RDA */
			[2] = {
				.sp_cmd_ctrl = {
					.command    = IOSAV_RD,
					.ranksel_ap = 3,
				},
				.subseq_ctrl = {
					.cmd_executions = 1,
					.cmd_delay_gap  = 3,
					/* Wait for tRP plus both latencies of this rank */
					.post_ssq_wait  = ctrl->tRP +
				ctrl->timings[channel][slotrank].roundtrip_latency +
				ctrl->timings[channel][slotrank].io_latency,
					.data_direction = SSQ_RD,
				},
				.sp_cmd_addr = {
					.address = 8,
					.rowbits = 6,
					.bank    = 0,
					.rank    = slotrank,
				},
			},
		};
		iosav_write_sequence(channel, rd_sequence, ARRAY_SIZE(rd_sequence));

		iosav_run_once_and_wait(channel);

		FOR_ALL_LANES {
			/* Combine the two 32-bit result registers: RESULT1 low, RESULT2 high */
			u64 res = mchbar_read32(lane_base[lane] + GDCRTRAININGRESULT1(channel));
			res |= ((u64)mchbar_read32(lane_base[lane] +
				GDCRTRAININGRESULT2(channel))) << 32;

			/* `old` is only kept for the debug output below */
			old = ctrl->timings[channel][slotrank].lanes[lane].tx_dqs;
			ctrl->timings[channel][slotrank].lanes[lane].tx_dqs +=
				get_dqs_flyby_adjust(res) * QCLK_PI;

			printram("High adjust %d:%016llx\n", lane, res);
			printram("Bval+: %d, %d, %d, % 4d -> % 4d\n", channel, slotrank, lane,
				old, ctrl->timings[channel][slotrank].lanes[lane].tx_dqs);
		}
	}
	/* Leave training mode */
	mchbar_write32(GDCRTRAININGMOD, 0);
}
1750 
/*
 * Turn off DRAM refresh.
 *
 * On every populated channel, a ZQCS sequence is executed on an existing rank
 * and bit 21 of SCHED_CBIT is set. Then bit 3 of MC_INIT_STATE_G (refresh
 * enable) is cleared, and the sequence that is still loaded is executed once
 * more on every populated channel.
 */
static void disable_refresh_machine(ramctr_timing *ctrl)
{
	int channel;

	FOR_ALL_POPULATED_CHANNELS {
		/* choose an existing rank: rank 0 if it is populated, else rank 2 */
		const int slotrank = (ctrl->rankmap[channel] & 1) ? 0 : 2;

		iosav_write_zqcs_sequence(channel, slotrank, 4, 4, 31);
		iosav_run_once_and_wait(channel);

		mchbar_setbits32(SCHED_CBIT_ch(channel), 1 << 21);
	}

	/* Refresh disable */
	mchbar_clrbits32(MC_INIT_STATE_G, 1 << 3);

	/* Execute the same command queue */
	FOR_ALL_POPULATED_CHANNELS {
		iosav_run_once_and_wait(channel);
	}
}
1774 
/*
 * Compensate the skew between CMD/ADDR/CLK and DQ/DQS lanes.
 *
 * Since DDR3 uses a fly-by topology, the data and strobe signals reach the chips at different
 * times with respect to command, address and clock signals. By delaying either all DQ/DQS or
 * all CMD/ADDR/CLK signals, a full phase shift can be introduced. It is assumed that the
 * CLK/ADDR/CMD signals have the same routing delay.
 *
 * To find the required phase shift the DRAM is placed in "write leveling" mode. In this mode,
 * the DRAM-chip samples the CLK on every DQS edge and feeds back the sampled value on the data
 * lanes (DQ).
 *
 * Refresh is disabled for the duration of the procedure and enabled again afterwards.
 *
 * Returns 0 on success, or the error from write_level_rank(). On error the function returns
 * right away, without restoring MR1, the training mode register or refresh.
 */
static int jedec_write_leveling(ramctr_timing *ctrl)
{
	int channel, slotrank;

	disable_refresh_machine(ctrl);

	/*
	 * Enable write leveling on all ranks.
	 * Disable all DQ outputs.
	 * Only NOP is allowed in this mode.
	 */
	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS
		write_mrreg(ctrl, channel, slotrank, 1,
				make_mr1(ctrl, slotrank, channel) | 1 << 12 | 1 << 7);

	/* Needs to be programmed before I/O reset below */
	const union gdcr_training_mod_reg training_mod = {
		.write_leveling_mode = 1,
		.enable_dqs_wl       = 5,
		.odt_always_on       = 1,
		.force_drive_enable  = 1,
	};
	mchbar_write32(GDCRTRAININGMOD, training_mod.raw);

	toggle_io_reset();

	/* Set any valid value for tx_dqs, it gets corrected later */
	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
		const int err = write_level_rank(ctrl, channel, slotrank);
		if (err)
			return err;
	}

	/* Disable write leveling on all ranks */
	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS
		write_mrreg(ctrl, channel, slotrank, 1, make_mr1(ctrl, slotrank, channel));

	/* Leave training mode */
	mchbar_write32(GDCRTRAININGMOD, 0);

	FOR_ALL_POPULATED_CHANNELS
		wait_for_iosav(channel);

	/* Refresh enable */
	mchbar_setbits32(MC_INIT_STATE_G, 1 << 3);

	FOR_ALL_POPULATED_CHANNELS {
		/* Undo the SCHED_CBIT change made by disable_refresh_machine() */
		mchbar_clrbits32(SCHED_CBIT_ch(channel), 1 << 21);
		mchbar_read32(IOSAV_STATUS_ch(channel));
		wait_for_iosav(channel);

		/* NOTE(review): always targets rank 0, even if that rank is not populated */
		iosav_write_zqcs_sequence(channel, 0, 4, 101, 31);

		iosav_run_once_and_wait(channel);
	}

	toggle_io_reset();

	return 0;
}
1844 
/*
 * Write training entry point.
 *
 * Performs, in this order: JEDEC write leveling (tx_dqs), tx_dq training on
 * every populated rank using pattern 0 (0xaaaaaaaa / 0x55555555), and the
 * write fly-by adjustment of tx_dqs. The timings of all populated channels
 * are programmed before and after the fly-by step.
 *
 * Returns 0 on success, or the error of the first step that failed.
 */
int write_training(ramctr_timing *ctrl)
{
	int channel, slotrank;
	int err;

	/*
	 * Set the DEC_WRD bit, required for the write flyby algorithm.
	 * Needs to be done before starting the write training procedure.
	 */
	FOR_ALL_POPULATED_CHANNELS {
		mchbar_setbits32(TC_RWP_ch(channel), 1 << 27);
	}

	printram("CPE\n");

	err = jedec_write_leveling(ctrl);
	if (err != 0)
		return err;

	printram("CPF\n");

	FOR_ALL_POPULATED_CHANNELS {
		fill_pattern0(ctrl, channel, 0xaaaaaaaa, 0x55555555);
	}

	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
		err = tx_dq_write_leveling(ctrl, channel, slotrank);
		if (err != 0)
			return err;
	}

	FOR_ALL_POPULATED_CHANNELS {
		program_timings(ctrl, channel);
	}

	/* measure and adjust tx_dqs timings */
	train_write_flyby(ctrl);

	FOR_ALL_POPULATED_CHANNELS {
		program_timings(ctrl, channel);
	}

	return 0;
}
1886 
/*
 * Test whether one rank works with the command timings currently in effect.
 *
 * The command training sequence is run with tx_dq of all lanes shifted by
 * -5 .. +5 around the trained values. A lane is considered good as soon as
 * one of these runs finishes with an error count of zero for it; the sweep
 * stops early once all lanes are good. The rank's timings in `ctrl` are
 * restored before returning (the hardware is not reprogrammed here).
 *
 * Returns 0 if every lane passed at least once, nonzero otherwise.
 */
static int test_command_training(ramctr_timing *ctrl, int channel, int slotrank)
{
	struct ram_rank_timings *const rank = &ctrl->timings[channel][slotrank];
	const struct ram_rank_timings saved_rt = *rank;
	const int all_lanes_mask = (1 << ctrl->lanes) - 1;
	int lanes_ok = 0;
	int ctr = 0;
	int tx_dq_delta;
	int lane;

	for (tx_dq_delta = -5; tx_dq_delta <= 5; tx_dq_delta++) {
		FOR_ALL_LANES {
			rank->lanes[lane].tx_dq = saved_rt.lanes[lane].tx_dq + tx_dq_delta;
		}
		program_timings(ctrl, channel);

		FOR_ALL_LANES {
			mchbar_write32(IOSAV_By_ERROR_COUNT(lane), 0);
		}

		/* Reset read WDB pointer */
		mchbar_write32(IOSAV_DATA_CTL_ch(channel), 0x1f);

		wait_for_iosav(channel);

		iosav_write_command_training_sequence(ctrl, channel, slotrank, ctr);

		/* Program LFSR for the RD/WR subsequences */
		mchbar_write32(IOSAV_n_ADDRESS_LFSR_ch(channel, 1), 0x389abcd);
		mchbar_write32(IOSAV_n_ADDRESS_LFSR_ch(channel, 2), 0x389abcd);

		iosav_run_once_and_wait(channel);

		FOR_ALL_LANES {
			if (mchbar_read32(IOSAV_By_ERROR_COUNT_ch(channel, lane)) == 0)
				lanes_ok |= 1 << lane;
		}

		ctr++;
		if (lanes_ok == all_lanes_mask)
			break;
	}

	*rank = saved_rt;

	return lanes_ok != all_lanes_mask;
}
1933 
/*
 * Write a 256-cacheline test pattern for one channel and set the WDB pattern
 * length accordingly.
 *
 * The pattern consists of rows of 16 dwords. Rows of the same channel are
 * 64 bytes times the number of populated channels apart, and the first row is
 * located at 1 << 26 plus 64 bytes per preceding populated channel.
 *
 * @param patno  0 selects the fixed `pattern` table. Any other value N builds
 *               32 rows out of a byte with a single bit set
 *               (0x80 >> ((N - 1) % 8), replicated into all four bytes of a
 *               dword): row i uses that dword for dword pair k if bit k of
 *               use_base[N - 1][i] is set, and zero otherwise; the value is
 *               then inverted if bit k of invert[N - 1][i] is set.
 *               Negative values are not supported.
 */
static void fill_pattern5(ramctr_timing *ctrl, int channel, int patno)
{
	unsigned int i, j;
	unsigned int offset = get_precedening_channels(ctrl, channel) * 64;
	unsigned int step = 64 * num_of_channels(ctrl);
	uintptr_t addr;

	if (patno) {
		const u8 base8 = 0x80 >> ((patno - 1) % 8);
		/*
		 * Replicate the byte into all four byte lanes. This is done as an
		 * unsigned multiplication: `base8 << 24` would be evaluated as a
		 * signed int and overflow into the sign bit for base8 == 0x80.
		 */
		const u32 base = base8 * 0x01010101u;

		for (i = 0; i < 32; i++) {
			for (j = 0; j < 16; j++) {
				/* Two consecutive dwords share one bit of the tables */
				const u32 pair_mask = 1u << (j / 2);
				u32 val = (use_base[patno - 1][i] & pair_mask) ? base : 0;

				if (invert[patno - 1][i] & pair_mask)
					val = ~val;

				addr = (1 << 26) + offset + i * step + j * 4;
				write32p(addr, val);
			}
		}
	} else {
		for (i = 0; i < ARRAY_SIZE(pattern); i++) {
			for (j = 0; j < 16; j++) {
				const u32 val = pattern[i][j];
				addr = (1 << 26) + offset + i * step + j * 4;
				write32p(addr, val);
			}
		}
	}

	/*
	 * Fence the pattern stores before programming the pattern length, for
	 * either kind of pattern. fill_pattern0() and fill_pattern1() do the same.
	 */
	sfence();

	program_wdb_pattern_length(channel, 256);
}
1968 
/*
 * Reinitialize the DRAM: disable refresh, perform a JEDEC reset, reissue the
 * MRS commands and toggle the I/O reset bit. try_cmd_stretch() calls this
 * after every change of the command timings.
 */
static void reprogram_320c(ramctr_timing *ctrl)
{
	disable_refresh_machine(ctrl);

	/* JEDEC reset */
	dram_jedecreset(ctrl);

	/* MRS commands */
	dram_mrscommands(ctrl);

	toggle_io_reset();
}
1981 
/* Range of command PI values swept by try_cmd_stretch(): [CT_MIN_PI, CT_MAX_PI) */
#define CT_MIN_PI	(-CCC_MAX_PI)
#define CT_MAX_PI	(+CCC_MAX_PI + 1)
#define CT_PI_LENGTH	(CT_MAX_PI - CT_MIN_PI + 1)

/* Minimum length of the passing window for a command stretch to be accepted */
#define MIN_C320C_LEN 13

/*
 * Try to train the command timings of one channel for a given command stretch.
 *
 * TC_RAP is programmed with the new tCMD value, and the roundtrip latency of
 * every populated rank is lowered by 2 (cmd_stretch == 2) or 4
 * (cmd_stretch == 0). Then pi_coding is swept over [CT_MIN_PI, CT_MAX_PI);
 * for every value, the DRAM is reinitialized and each populated rank is tested.
 * Per rank, pi_coding is set to the middle of the longest (circular) window
 * of passing values.
 *
 * Returns 0 on success. If the window of any rank is shorter than
 * MIN_C320C_LEN (or get_longest_zero_run() reported `all`), the timings of all
 * populated ranks are restored from the copy taken on entry and MAKE_ERR is
 * returned; ctrl->cmd_stretch[channel] is not restored in that case.
 */
static int try_cmd_stretch(ramctr_timing *ctrl, int channel, int cmd_stretch)
{
	struct ram_rank_timings saved_timings[NUM_CHANNELS][NUM_SLOTRANKS];
	int slotrank;
	int command_pi;
	int stat[NUM_SLOTRANKS][CT_PI_LENGTH];
	int delta = 0;

	printram("Trying cmd_stretch %d on channel %d\n", cmd_stretch, channel);

	/* Only the entries of this channel are saved and used */
	FOR_ALL_POPULATED_RANKS {
		saved_timings[channel][slotrank] = ctrl->timings[channel][slotrank];
	}

	ctrl->cmd_stretch[channel] = cmd_stretch;

	const union tc_rap_reg tc_rap = {
		.tRRD    = ctrl->tRRD,
		.tRTP    = ctrl->tRTP,
		.tCKE    = ctrl->tCKE,
		.tWTR    = ctrl->tWTR,
		.tFAW    = ctrl->tFAW,
		.tWR     = ctrl->tWR,
		.tCMD    = ctrl->cmd_stretch[channel],
	};
	mchbar_write32(TC_RAP_ch(channel), tc_rap.raw);

	/* Roundtrip latency correction for the chosen stretch; none for other values */
	if (ctrl->cmd_stretch[channel] == 2)
		delta = 2;
	else if (ctrl->cmd_stretch[channel] == 0)
		delta = 4;

	FOR_ALL_POPULATED_RANKS {
		ctrl->timings[channel][slotrank].roundtrip_latency -= delta;
	}

	for (command_pi = CT_MIN_PI; command_pi < CT_MAX_PI; command_pi++) {
		FOR_ALL_POPULATED_RANKS {
			ctrl->timings[channel][slotrank].pi_coding = command_pi;
		}
		program_timings(ctrl, channel);
		reprogram_320c(ctrl);
		/* test_command_training() returns zero on success, so zero means "pass" */
		FOR_ALL_POPULATED_RANKS {
			stat[slotrank][command_pi - CT_MIN_PI] =
				test_command_training(ctrl, channel, slotrank);
		}
	}
	FOR_ALL_POPULATED_RANKS {
		/* The sweep above fills CT_PI_LENGTH - 1 entries per rank */
		struct run rn = get_longest_zero_run(stat[slotrank], CT_PI_LENGTH - 1);

		ctrl->timings[channel][slotrank].pi_coding = rn.middle + CT_MIN_PI;
		printram("cmd_stretch: %d, %d: % 4d-% 4d-% 4d\n",
				 channel, slotrank, rn.start, rn.middle, rn.end);

		if (rn.all || rn.length < MIN_C320C_LEN) {
			/* This inner loop reuses `slotrank`, which is fine as we return */
			FOR_ALL_POPULATED_RANKS {
				ctrl->timings[channel][slotrank] =
					saved_timings[channel][slotrank];
			}
			return MAKE_ERR;
		}
	}

	return 0;
}
2053 
2054 /*
2055  * Adjust CMD phase shift and try multiple command rates.
2056  * A command rate of 2T doubles the time needed for address and command decode.
2057  */
command_training(ramctr_timing * ctrl)2058 int command_training(ramctr_timing *ctrl)
2059 {
2060 	int channel;
2061 
2062 	FOR_ALL_POPULATED_CHANNELS {
2063 		fill_pattern5(ctrl, channel, 0);
2064 	}
2065 
2066 	FOR_ALL_POPULATED_CHANNELS {
2067 		int cmdrate, err;
2068 
2069 		/*
2070 		 * Dual DIMM per channel:
2071 		 * Issue:
2072 		 * While command training seems to succeed, raminit will fail in write training.
2073 		 *
2074 		 * Workaround:
2075 		 * Skip 1T in dual DIMM mode, that's only supported by a few DIMMs.
2076 		 * Only try 1T mode for XMP DIMMs that request it in dual DIMM mode.
2077 		 *
2078 		 * Single DIMM per channel:
2079 		 * Try command rate 1T and 2T
2080 		 */
2081 		cmdrate = ((ctrl->rankmap[channel] & 0x5) == 0x5);
2082 		if (ctrl->tCMD)
2083 			/* XMP gives the CMD rate in clock ticks, not ns */
2084 			cmdrate = MIN(DIV_ROUND_UP(ctrl->tCMD, 256) - 1, 1);
2085 
2086 		for (; cmdrate < 2; cmdrate++) {
2087 			err = try_cmd_stretch(ctrl, channel, cmdrate << 1);
2088 
2089 			if (!err)
2090 				break;
2091 		}
2092 
2093 		if (err) {
2094 			printk(BIOS_EMERG, "Command training failed: %d\n", channel);
2095 			return err;
2096 		}
2097 
2098 		printram("Using CMD rate %uT on channel %u\n", cmdrate + 1, channel);
2099 	}
2100 
2101 	FOR_ALL_POPULATED_CHANNELS
2102 		program_timings(ctrl, channel);
2103 
2104 	reprogram_320c(ctrl);
2105 	return 0;
2106 }
2107 
find_read_mpr_margin(ramctr_timing * ctrl,int channel,int slotrank,int * edges)2108 static int find_read_mpr_margin(ramctr_timing *ctrl, int channel, int slotrank, int *edges)
2109 {
2110 	int dqs_pi;
2111 	int stats[NUM_LANES][MAX_EDGE_TIMING + 1];
2112 	int lane;
2113 
2114 	for (dqs_pi = 0; dqs_pi <= MAX_EDGE_TIMING; dqs_pi++) {
2115 		FOR_ALL_LANES {
2116 			ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_p = dqs_pi;
2117 			ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_n = dqs_pi;
2118 		}
2119 		program_timings(ctrl, channel);
2120 
2121 		FOR_ALL_LANES {
2122 			mchbar_write32(IOSAV_By_ERROR_COUNT_ch(channel, lane), 0);
2123 			mchbar_read32(IOSAV_By_BW_SERROR_C_ch(channel, lane));
2124 		}
2125 
2126 		wait_for_iosav(channel);
2127 
2128 		iosav_write_read_mpr_sequence(
2129 			channel, slotrank, ctrl->tMOD, 500, 4, 1, ctrl->CAS + 8);
2130 
2131 		iosav_run_once_and_wait(channel);
2132 
2133 		FOR_ALL_LANES {
2134 			stats[lane][dqs_pi] = mchbar_read32(
2135 				IOSAV_By_ERROR_COUNT_ch(channel, lane));
2136 		}
2137 	}
2138 
2139 	FOR_ALL_LANES {
2140 		struct run rn = get_longest_zero_run(stats[lane], MAX_EDGE_TIMING + 1);
2141 		edges[lane] = rn.middle;
2142 
2143 		if (rn.all) {
2144 			printk(BIOS_EMERG, "Read MPR training failed: %d, %d, %d\n", channel,
2145 			       slotrank, lane);
2146 			return MAKE_ERR;
2147 		}
2148 		printram("eval %d, %d, %d: % 4d\n", channel, slotrank, lane, edges[lane]);
2149 	}
2150 	return 0;
2151 }
2152 
find_predefined_pattern(ramctr_timing * ctrl,const int channel)2153 static void find_predefined_pattern(ramctr_timing *ctrl, const int channel)
2154 {
2155 	int slotrank, lane;
2156 
2157 	fill_pattern0(ctrl, channel, 0, 0);
2158 	FOR_ALL_LANES {
2159 		mchbar_write32(IOSAV_By_BW_MASK_ch(channel, lane), 0);
2160 		mchbar_read32(IOSAV_By_BW_SERROR_C_ch(channel, lane));
2161 	}
2162 
2163 	FOR_ALL_POPULATED_RANKS FOR_ALL_LANES {
2164 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_n = 16;
2165 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_p = 16;
2166 	}
2167 
2168 	program_timings(ctrl, channel);
2169 
2170 	FOR_ALL_POPULATED_RANKS {
2171 		wait_for_iosav(channel);
2172 
2173 		iosav_write_read_mpr_sequence(
2174 			channel, slotrank, ctrl->tMOD, 3, 4, 1, ctrl->CAS + 8);
2175 
2176 		iosav_run_once_and_wait(channel);
2177 	}
2178 
2179 	/* XXX: check any measured value ? */
2180 
2181 	FOR_ALL_POPULATED_RANKS FOR_ALL_LANES {
2182 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_n = 48;
2183 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_p = 48;
2184 	}
2185 
2186 	program_timings(ctrl, channel);
2187 
2188 	FOR_ALL_POPULATED_RANKS {
2189 		wait_for_iosav(channel);
2190 
2191 		iosav_write_read_mpr_sequence(
2192 			channel, slotrank, ctrl->tMOD, 3, 4, 1, ctrl->CAS + 8);
2193 
2194 		iosav_run_once_and_wait(channel);
2195 	}
2196 
2197 	/* XXX: check any measured value ? */
2198 
2199 	FOR_ALL_LANES {
2200 		mchbar_write32(IOSAV_By_BW_MASK_ch(channel, lane),
2201 			~mchbar_read32(IOSAV_By_BW_SERROR_ch(channel, lane)) & 0xff);
2202 	}
2203 }
2204 
read_mpr_training(ramctr_timing * ctrl)2205 int read_mpr_training(ramctr_timing *ctrl)
2206 {
2207 	int falling_edges[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
2208 	int rising_edges[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
2209 	int channel, slotrank, lane;
2210 	int err;
2211 
2212 	mchbar_write32(GDCRTRAININGMOD, 0);
2213 
2214 	toggle_io_reset();
2215 
2216 	FOR_ALL_POPULATED_CHANNELS {
2217 		find_predefined_pattern(ctrl, channel);
2218 
2219 		fill_pattern0(ctrl, channel, 0, 0xffffffff);
2220 	}
2221 
2222 	/*
2223 	 * FIXME: Under some conditions, vendor BIOS sets both edges to the same value. It will
2224 	 *        also use a single loop. It would seem that it is a debugging configuration.
2225 	 */
2226 	mchbar_write32(IOSAV_DC_MASK, 3 << 8);
2227 	printram("discover falling edges:\n[%x] = %x\n", IOSAV_DC_MASK, 3 << 8);
2228 
2229 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
2230 		err = find_read_mpr_margin(ctrl, channel, slotrank,
2231 			falling_edges[channel][slotrank]);
2232 		if (err)
2233 			return err;
2234 	}
2235 
2236 	mchbar_write32(IOSAV_DC_MASK, 2 << 8);
2237 	printram("discover rising edges:\n[%x] = %x\n", IOSAV_DC_MASK, 2 << 8);
2238 
2239 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
2240 		err = find_read_mpr_margin(ctrl, channel, slotrank,
2241 				    rising_edges[channel][slotrank]);
2242 		if (err)
2243 			return err;
2244 	}
2245 
2246 	mchbar_write32(IOSAV_DC_MASK, 0);
2247 
2248 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS FOR_ALL_LANES {
2249 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_n =
2250 		    falling_edges[channel][slotrank][lane];
2251 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_p =
2252 		    rising_edges[channel][slotrank][lane];
2253 	}
2254 
2255 	FOR_ALL_POPULATED_CHANNELS {
2256 		program_timings(ctrl, channel);
2257 	}
2258 
2259 	FOR_ALL_POPULATED_CHANNELS FOR_ALL_LANES {
2260 		mchbar_write32(IOSAV_By_BW_MASK_ch(channel, lane), 0);
2261 	}
2262 	return 0;
2263 }
2264 
find_agrsv_read_margin(ramctr_timing * ctrl,int channel,int slotrank,int * edges)2265 static int find_agrsv_read_margin(ramctr_timing *ctrl, int channel, int slotrank, int *edges)
2266 {
2267 	const int rd_vref_offsets[] = { 0, 0xc, 0x2c };
2268 
2269 	u32 raw_stats[MAX_EDGE_TIMING + 1];
2270 	int lower[NUM_LANES];
2271 	int upper[NUM_LANES];
2272 	int lane, i, read_pi, pat;
2273 
2274 	FOR_ALL_LANES {
2275 		lower[lane] = 0;
2276 		upper[lane] = MAX_EDGE_TIMING;
2277 	}
2278 
2279 	for (i = 0; i < ARRAY_SIZE(rd_vref_offsets); i++) {
2280 		const union gdcr_training_mod_reg training_mod = {
2281 			.vref_gen_ctl = rd_vref_offsets[i],
2282 		};
2283 		mchbar_write32(GDCRTRAININGMOD_ch(channel), training_mod.raw);
2284 		printram("[%x] = 0x%08x\n", GDCRTRAININGMOD_ch(channel), training_mod.raw);
2285 
2286 		for (pat = 0; pat < NUM_PATTERNS; pat++) {
2287 			fill_pattern5(ctrl, channel, pat);
2288 			printram("using pattern %d\n", pat);
2289 
2290 			for (read_pi = 0; read_pi <= MAX_EDGE_TIMING; read_pi++) {
2291 				FOR_ALL_LANES {
2292 					ctrl->timings[channel][slotrank].lanes[lane]
2293 						.rx_dqs_p = read_pi;
2294 					ctrl->timings[channel][slotrank].lanes[lane]
2295 						.rx_dqs_n = read_pi;
2296 				}
2297 				program_timings(ctrl, channel);
2298 
2299 				FOR_ALL_LANES {
2300 					mchbar_write32(IOSAV_By_ERROR_COUNT_ch(channel, lane),
2301 							0);
2302 					mchbar_read32(IOSAV_By_BW_SERROR_C_ch(channel, lane));
2303 				}
2304 				wait_for_iosav(channel);
2305 
2306 				iosav_write_data_write_sequence(ctrl, channel, slotrank);
2307 
2308 				iosav_run_once_and_wait(channel);
2309 
2310 				FOR_ALL_LANES {
2311 					mchbar_read32(IOSAV_By_ERROR_COUNT_ch(channel, lane));
2312 				}
2313 
2314 				/* FIXME: This register only exists on Ivy Bridge */
2315 				raw_stats[read_pi] = mchbar_read32(
2316 					IOSAV_BYTE_SERROR_C_ch(channel));
2317 			}
2318 
2319 			FOR_ALL_LANES {
2320 				int stats[MAX_EDGE_TIMING + 1];
2321 				struct run rn;
2322 
2323 				for (read_pi = 0; read_pi <= MAX_EDGE_TIMING; read_pi++)
2324 					stats[read_pi] = !!(raw_stats[read_pi] & (1 << lane));
2325 
2326 				rn = get_longest_zero_run(stats, MAX_EDGE_TIMING + 1);
2327 
2328 				printram("edges: %d, %d, %d: % 4d-% 4d-% 4d, "
2329 					 "% 4d-% 4d\n", channel, slotrank, i, rn.start,
2330 					 rn.middle, rn.end, rn.start + ctrl->edge_offset[i],
2331 					 rn.end - ctrl->edge_offset[i]);
2332 
2333 				lower[lane] = MAX(rn.start + ctrl->edge_offset[i], lower[lane]);
2334 				upper[lane] = MIN(rn.end   - ctrl->edge_offset[i], upper[lane]);
2335 
2336 				edges[lane] = (lower[lane] + upper[lane]) / 2;
2337 				if (rn.all || (lower[lane] > upper[lane])) {
2338 					printk(BIOS_EMERG, "Aggressive read training failed: "
2339 						"%d, %d, %d\n", channel, slotrank, lane);
2340 
2341 					return MAKE_ERR;
2342 				}
2343 			}
2344 		}
2345 	}
2346 
2347 	/* Restore nominal Vref after training */
2348 	mchbar_write32(GDCRTRAININGMOD_ch(channel), 0);
2349 	printram("CPA\n");
2350 	return 0;
2351 }
2352 
aggressive_read_training(ramctr_timing * ctrl)2353 int aggressive_read_training(ramctr_timing *ctrl)
2354 {
2355 	int falling_edges[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
2356 	int  rising_edges[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
2357 	int channel, slotrank, lane, err;
2358 
2359 	/*
2360 	 * FIXME: Under some conditions, vendor BIOS sets both edges to the same value. It will
2361 	 *        also use a single loop. It would seem that it is a debugging configuration.
2362 	 */
2363 	mchbar_write32(IOSAV_DC_MASK, 3 << 8);
2364 	printram("discover falling edges aggressive:\n[%x] = %x\n", IOSAV_DC_MASK, 3 << 8);
2365 
2366 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
2367 		err = find_agrsv_read_margin(ctrl, channel, slotrank,
2368 					falling_edges[channel][slotrank]);
2369 		if (err)
2370 			return err;
2371 	}
2372 
2373 	mchbar_write32(IOSAV_DC_MASK, 2 << 8);
2374 	printram("discover rising edges aggressive:\n[%x] = %x\n", IOSAV_DC_MASK, 2 << 8);
2375 
2376 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
2377 		err = find_agrsv_read_margin(ctrl, channel, slotrank,
2378 					 rising_edges[channel][slotrank]);
2379 		if (err)
2380 			return err;
2381 	}
2382 
2383 	mchbar_write32(IOSAV_DC_MASK, 0);
2384 
2385 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS FOR_ALL_LANES {
2386 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_n =
2387 				falling_edges[channel][slotrank][lane];
2388 
2389 		ctrl->timings[channel][slotrank].lanes[lane].rx_dqs_p =
2390 				rising_edges[channel][slotrank][lane];
2391 	}
2392 
2393 	FOR_ALL_POPULATED_CHANNELS
2394 		program_timings(ctrl, channel);
2395 
2396 	return 0;
2397 }
2398 
test_aggressive_write(ramctr_timing * ctrl,int channel,int slotrank)2399 static void test_aggressive_write(ramctr_timing *ctrl, int channel, int slotrank)
2400 {
2401 	wait_for_iosav(channel);
2402 
2403 	iosav_write_aggressive_write_read_sequence(ctrl, channel, slotrank);
2404 
2405 	iosav_run_once_and_wait(channel);
2406 }
2407 
set_write_vref(const int channel,const u8 wr_vref)2408 static void set_write_vref(const int channel, const u8 wr_vref)
2409 {
2410 	mchbar_clrsetbits32(GDCRCMDDEBUGMUXCFG_Cz_S(channel), 0x3f << 24, wr_vref << 24);
2411 	udelay(2);
2412 }
2413 
aggressive_write_training(ramctr_timing * ctrl)2414 int aggressive_write_training(ramctr_timing *ctrl)
2415 {
2416 	const u8 wr_vref_offsets[3] = { 0, 0x0f, 0x2f };
2417 	int i, pat;
2418 
2419 	int lower[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
2420 	int upper[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
2421 	int channel, slotrank, lane;
2422 
2423 	/* Changing the write Vref is only supported on some Ivy Bridge SKUs */
2424 	if (!IS_IVY_CPU(ctrl->cpu))
2425 		return 0;
2426 
2427 	if (!(pci_read_config32(HOST_BRIDGE, CAPID0_A) & CAPID_WRTVREF))
2428 		return 0;
2429 
2430 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS FOR_ALL_LANES {
2431 		lower[channel][slotrank][lane] = 0;
2432 		upper[channel][slotrank][lane] = MAX_TX_DQ;
2433 	}
2434 
2435 	/* Only enable IOSAV_n_SPECIAL_COMMAND_ADDR optimization on later steppings */
2436 	const bool enable_iosav_opt = IS_IVY_CPU_D(ctrl->cpu) || IS_IVY_CPU_E(ctrl->cpu);
2437 
2438 	if (enable_iosav_opt)
2439 		mchbar_write32(MCMNTS_SPARE, 1);
2440 
2441 	printram("Aggressive write training:\n");
2442 
2443 	for (i = 0; i < ARRAY_SIZE(wr_vref_offsets); i++) {
2444 		FOR_ALL_POPULATED_CHANNELS {
2445 			set_write_vref(channel, wr_vref_offsets[i]);
2446 
2447 			for (pat = 0; pat < NUM_PATTERNS; pat++) {
2448 				FOR_ALL_POPULATED_RANKS {
2449 					int tx_dq;
2450 					u32 raw_stats[MAX_TX_DQ + 1];
2451 					int stats[MAX_TX_DQ + 1];
2452 
2453 					/* Make sure rn.start < rn.end */
2454 					stats[MAX_TX_DQ] = 1;
2455 
2456 					fill_pattern5(ctrl, channel, pat);
2457 
2458 					for (tx_dq = 0; tx_dq < MAX_TX_DQ; tx_dq++) {
2459 						FOR_ALL_LANES {
2460 							ctrl->timings[channel][slotrank]
2461 								.lanes[lane].tx_dq = tx_dq;
2462 						}
2463 						program_timings(ctrl, channel);
2464 
2465 						test_aggressive_write(ctrl, channel, slotrank);
2466 
2467 						raw_stats[tx_dq] = mchbar_read32(
2468 							IOSAV_BYTE_SERROR_C_ch(channel));
2469 					}
2470 					FOR_ALL_LANES {
2471 						struct run rn;
2472 						for (tx_dq = 0; tx_dq < MAX_TX_DQ; tx_dq++) {
2473 							stats[tx_dq] = !!(raw_stats[tx_dq]
2474 									& (1 << lane));
2475 						}
2476 
2477 						rn = get_longest_zero_run(stats, MAX_TX_DQ + 1);
2478 						if (rn.all) {
2479 							printk(BIOS_EMERG, "Aggressive "
2480 								"write training failed: "
2481 								"%d, %d, %d\n", channel,
2482 								slotrank, lane);
2483 
2484 							return MAKE_ERR;
2485 						}
2486 						printram("tx_dq: %d, %d, %d: "
2487 							 "% 4d-% 4d-% 4d, "
2488 							 "% 4d-% 4d\n", channel, slotrank,
2489 							 i, rn.start, rn.middle, rn.end,
2490 							 rn.start + ctrl->tx_dq_offset[i],
2491 							 rn.end   - ctrl->tx_dq_offset[i]);
2492 
2493 						lower[channel][slotrank][lane] =
2494 							MAX(rn.start + ctrl->tx_dq_offset[i],
2495 							    lower[channel][slotrank][lane]);
2496 
2497 						upper[channel][slotrank][lane] =
2498 							MIN(rn.end - ctrl->tx_dq_offset[i],
2499 							    upper[channel][slotrank][lane]);
2500 					}
2501 				}
2502 			}
2503 		}
2504 	}
2505 
2506 	FOR_ALL_CHANNELS {
2507 		/* Restore nominal write Vref after training */
2508 		set_write_vref(channel, 0);
2509 	}
2510 
2511 	/* Disable IOSAV_n_SPECIAL_COMMAND_ADDR optimization */
2512 	if (enable_iosav_opt)
2513 		mchbar_write32(MCMNTS_SPARE, 0);
2514 
2515 	printram("CPB\n");
2516 
2517 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS FOR_ALL_LANES {
2518 		printram("tx_dq %d, %d, %d: % 4d\n", channel, slotrank, lane,
2519 		       (lower[channel][slotrank][lane] +
2520 			upper[channel][slotrank][lane]) / 2);
2521 
2522 		ctrl->timings[channel][slotrank].lanes[lane].tx_dq =
2523 		    (lower[channel][slotrank][lane] +
2524 		     upper[channel][slotrank][lane]) / 2;
2525 	}
2526 	FOR_ALL_POPULATED_CHANNELS {
2527 		program_timings(ctrl, channel);
2528 	}
2529 	return 0;
2530 }
2531 
normalize_training(ramctr_timing * ctrl)2532 void normalize_training(ramctr_timing *ctrl)
2533 {
2534 	int channel, slotrank, lane;
2535 	int mat;
2536 
2537 	FOR_ALL_CHANNELS FOR_ALL_POPULATED_RANKS {
2538 		int delta;
2539 		mat = 0;
2540 		FOR_ALL_LANES mat =
2541 		    MAX(ctrl->timings[channel][slotrank].lanes[lane].rcven, mat);
2542 		printram("normalize %d, %d, %d: mat %d\n",
2543 		    channel, slotrank, lane, mat);
2544 
2545 		delta = (mat >> 6) - ctrl->timings[channel][slotrank].io_latency;
2546 		printram("normalize %d, %d, %d: delta %d\n",
2547 		    channel, slotrank, lane, delta);
2548 
2549 		ctrl->timings[channel][slotrank].roundtrip_latency += delta;
2550 		ctrl->timings[channel][slotrank].io_latency += delta;
2551 	}
2552 
2553 	FOR_ALL_POPULATED_CHANNELS {
2554 		program_timings(ctrl, channel);
2555 	}
2556 }
2557 
channel_test(ramctr_timing * ctrl)2558 int channel_test(ramctr_timing *ctrl)
2559 {
2560 	int channel, slotrank, lane;
2561 
2562 	slotrank = 0;
2563 	FOR_ALL_POPULATED_CHANNELS
2564 		if (mchbar_read32(MC_INIT_STATE_ch(channel)) & 0xa000) {
2565 			printk(BIOS_EMERG, "Mini channel test failed (1): %d\n", channel);
2566 			return MAKE_ERR;
2567 		}
2568 	FOR_ALL_POPULATED_CHANNELS {
2569 		fill_pattern0(ctrl, channel, 0x12345678, 0x98765432);
2570 	}
2571 
2572 	for (slotrank = 0; slotrank < 4; slotrank++)
2573 		FOR_ALL_CHANNELS
2574 			if (ctrl->rankmap[channel] & (1 << slotrank)) {
2575 		FOR_ALL_LANES {
2576 			mchbar_write32(IOSAV_By_ERROR_COUNT(lane), 0);
2577 			mchbar_write32(IOSAV_By_BW_SERROR_C(lane), 0);
2578 		}
2579 		wait_for_iosav(channel);
2580 
2581 		iosav_write_memory_test_sequence(ctrl, channel, slotrank);
2582 
2583 		iosav_run_once_and_wait(channel);
2584 
2585 		FOR_ALL_LANES
2586 			if (mchbar_read32(IOSAV_By_ERROR_COUNT_ch(channel, lane))) {
2587 				printk(BIOS_EMERG, "Mini channel test failed (2): %d, %d, %d\n",
2588 				       channel, slotrank, lane);
2589 				return MAKE_ERR;
2590 			}
2591 	}
2592 	return 0;
2593 }
2594 
channel_scrub(ramctr_timing * ctrl)2595 void channel_scrub(ramctr_timing *ctrl)
2596 {
2597 	int channel, slotrank, row, rowsize;
2598 	u8 bank;
2599 
2600 	FOR_ALL_POPULATED_CHANNELS {
2601 		wait_for_iosav(channel);
2602 		fill_pattern0(ctrl, channel, 0, 0);
2603 	}
2604 
2605 	/*
2606 	 * During runtime the "scrubber" will periodically scan through the memory in the
2607 	 * physical address space, to identify and fix CRC errors.
2608 	 * The following loops writes to every DRAM address, setting the ECC bits to the
2609 	 * correct value. A read from this location will no longer return a CRC error,
2610 	 * except when a bit has toggled due to external events.
2611 	 * The same could be achieved by writing to the physical memory map, but it's
2612 	 * much more difficult due to SMM remapping, ME stolen memory, GFX stolen memory,
2613 	 * and firmware running in x86_32.
2614 	 */
2615 	FOR_ALL_POPULATED_CHANNELS FOR_ALL_POPULATED_RANKS {
2616 		rowsize = 1 << ctrl->info.dimm[channel][slotrank >> 1].row_bits;
2617 		for (bank = 0; bank < 8; bank++) {
2618 			for (row = 0; row < rowsize; row += 16) {
2619 				u8 gap = MAX((ctrl->tFAW >> 2) + 1, ctrl->tRRD);
2620 				const struct iosav_ssq sequence[] = {
2621 					/*
2622 					 * DRAM command ACT
2623 					 *  Opens the row for writing.
2624 					 */
2625 					[0] = {
2626 						.sp_cmd_ctrl = {
2627 							.command    = IOSAV_ACT,
2628 							.ranksel_ap = 1,
2629 						},
2630 						.subseq_ctrl = {
2631 							.cmd_executions = 1,
2632 							.cmd_delay_gap  = gap,
2633 							.post_ssq_wait  = ctrl->tRCD,
2634 							.data_direction = SSQ_NA,
2635 						},
2636 						.sp_cmd_addr = {
2637 							.address = row,
2638 							.rowbits = 6,
2639 							.bank    = bank,
2640 							.rank    = slotrank,
2641 						},
2642 						.addr_update = {
2643 							.inc_addr_1 = 1,
2644 							.addr_wrap  = 18,
2645 						},
2646 					},
2647 					/*
2648 					 * DRAM command WR
2649 					 *  Writes (128 + 1) * 8 (burst length) * 8 (bus width)
2650 					 *  bytes.
2651 					 */
2652 					[1] = {
2653 						.sp_cmd_ctrl = {
2654 							.command    = IOSAV_WR,
2655 							.ranksel_ap = 1,
2656 						},
2657 						.subseq_ctrl = {
2658 							.cmd_executions = 129,
2659 							.cmd_delay_gap  = 4,
2660 							.post_ssq_wait  = ctrl->tWTR +
2661 									  ctrl->CWL + 8,
2662 							.data_direction = SSQ_WR,
2663 						},
2664 						.sp_cmd_addr = {
2665 							.address = row,
2666 							.rowbits = 0,
2667 							.bank    = bank,
2668 							.rank    = slotrank,
2669 						},
2670 						.addr_update = {
2671 							.inc_addr_8 = 1,
2672 							.addr_wrap  = 9,
2673 						},
2674 					},
2675 					/*
2676 					 * DRAM command PRE
2677 					 *  Closes the row.
2678 					 */
2679 					[2] = {
2680 						.sp_cmd_ctrl = {
2681 							.command    = IOSAV_PRE,
2682 							.ranksel_ap = 1,
2683 						},
2684 						.subseq_ctrl = {
2685 							.cmd_executions = 1,
2686 							.cmd_delay_gap  = 4,
2687 							.post_ssq_wait  = ctrl->tRP,
2688 							.data_direction = SSQ_NA,
2689 						},
2690 						.sp_cmd_addr = {
2691 							.address = 0,
2692 							.rowbits = 6,
2693 							.bank    = bank,
2694 							.rank    = slotrank,
2695 						},
2696 						.addr_update = {
2697 							.addr_wrap = 18,
2698 						},
2699 					},
2700 				};
2701 				iosav_write_sequence(channel, sequence, ARRAY_SIZE(sequence));
2702 
2703 				iosav_run_queue(channel, 16, 0);
2704 
2705 				wait_for_iosav(channel);
2706 			}
2707 		}
2708 	}
2709 }
2710 
set_scrambling_seed(ramctr_timing * ctrl)2711 void set_scrambling_seed(ramctr_timing *ctrl)
2712 {
2713 	int channel;
2714 
2715 	/* FIXME: we hardcode seeds. Do we need to use some PRNG for them? I don't think so. */
2716 	static u32 seeds[NUM_CHANNELS][3] = {
2717 		{0x00009a36, 0xbafcfdcf, 0x46d1ab68},
2718 		{0x00028bfa, 0x53fe4b49, 0x19ed5483}
2719 	};
2720 	FOR_ALL_POPULATED_CHANNELS {
2721 		mchbar_clrbits32(SCHED_CBIT_ch(channel), 1 << 28);
2722 		mchbar_write32(SCRAMBLING_SEED_1_ch(channel),    seeds[channel][0]);
2723 		mchbar_write32(SCRAMBLING_SEED_2_HI_ch(channel), seeds[channel][1]);
2724 		mchbar_write32(SCRAMBLING_SEED_2_LO_ch(channel), seeds[channel][2]);
2725 	}
2726 }
2727 
set_wmm_behavior(const u32 cpu)2728 void set_wmm_behavior(const u32 cpu)
2729 {
2730 	if (IS_SANDY_CPU(cpu) && (IS_SANDY_CPU_D0(cpu) || IS_SANDY_CPU_D1(cpu))) {
2731 		mchbar_write32(SC_WDBWM, 0x141d1519);
2732 	} else {
2733 		mchbar_write32(SC_WDBWM, 0x551d1519);
2734 	}
2735 }
2736 
prepare_training(ramctr_timing * ctrl)2737 void prepare_training(ramctr_timing *ctrl)
2738 {
2739 	int channel;
2740 
2741 	FOR_ALL_POPULATED_CHANNELS {
2742 		/* Always drive command bus */
2743 		mchbar_setbits32(TC_RAP_ch(channel), 1 << 29);
2744 	}
2745 
2746 	udelay(1);
2747 
2748 	FOR_ALL_POPULATED_CHANNELS {
2749 		wait_for_iosav(channel);
2750 	}
2751 }
2752 
set_read_write_timings(ramctr_timing * ctrl)2753 void set_read_write_timings(ramctr_timing *ctrl)
2754 {
2755 	/* Use a larger delay when running fast to improve stability */
2756 	const u32 tRWDRDD_inc = ctrl->tCK <= TCK_1066MHZ ? 4 : 2;
2757 
2758 	int channel, slotrank;
2759 
2760 	FOR_ALL_POPULATED_CHANNELS {
2761 		int min_pi = 10000;
2762 		int max_pi = -10000;
2763 
2764 		FOR_ALL_POPULATED_RANKS {
2765 			max_pi = MAX(ctrl->timings[channel][slotrank].pi_coding, max_pi);
2766 			min_pi = MIN(ctrl->timings[channel][slotrank].pi_coding, min_pi);
2767 		}
2768 
2769 		const u32 tWRDRDD = (max_pi - min_pi > 51) ? 0 : ctrl->ref_card_offset[channel];
2770 
2771 		const u32 val = (ctrl->pi_coding_threshold < max_pi - min_pi) ? 3 : 2;
2772 
2773 		dram_odt_stretch(ctrl, channel);
2774 
2775 		const union tc_rwp_reg tc_rwp = {
2776 			.tRRDR   = 0,
2777 			.tRRDD   = val,
2778 			.tWWDR   = val,
2779 			.tWWDD   = val,
2780 			.tRWDRDD = ctrl->ref_card_offset[channel] + tRWDRDD_inc,
2781 			.tWRDRDD = tWRDRDD,
2782 			.tRWSR   = 2,
2783 			.dec_wrd = 1,
2784 		};
2785 		mchbar_write32(TC_RWP_ch(channel), tc_rwp.raw);
2786 	}
2787 }
2788 
set_normal_operation(ramctr_timing * ctrl)2789 void set_normal_operation(ramctr_timing *ctrl)
2790 {
2791 	int channel;
2792 	FOR_ALL_POPULATED_CHANNELS {
2793 		mchbar_write32(MC_INIT_STATE_ch(channel), 1 << 12 | ctrl->rankmap[channel]);
2794 		mchbar_clrbits32(TC_RAP_ch(channel), 1 << 29);
2795 	}
2796 }
2797 
/*
 * Encode a watermark latency for graphics drivers consumption: the latency in
 * nanoseconds is converted to units of 500ns, rounding up.
 */
static int encode_wm(int ns)
{
	const int unit_ns = 500;
	const int biased_ns = ns + (unit_ns - 1);

	return biased_ns / unit_ns;
}
2803 
2804 /* FIXME: values in this function should be hardware revision-dependent */
final_registers(ramctr_timing * ctrl)2805 void final_registers(ramctr_timing *ctrl)
2806 {
2807 	int channel;
2808 	int t1_cycles = 0, t1_ns = 0, t2_ns;
2809 	int t3_ns;
2810 	u32 r32;
2811 
2812 	if (IS_IVY_CPU(ctrl->cpu))
2813 		mchbar_write32(WMM_READ_CONFIG, 0x46);
2814 
2815 	FOR_ALL_CHANNELS {
2816 		union tc_othp_reg tc_othp = {
2817 			.raw = mchbar_read32(TC_OTHP_ch(channel)),
2818 		};
2819 		if (IS_SANDY_CPU(ctrl->cpu) && (ctrl->cpu & 0xf) < SNB_STEP_D0)
2820 			tc_othp.tCPDED = 2;
2821 		else
2822 			tc_othp.tCPDED = 1;
2823 		mchbar_write32(TC_OTHP_ch(channel), tc_othp.raw);
2824 
2825 		/* 64 DCLKs until idle, decision per rank */
2826 		r32 = get_power_down_mode(ctrl, channel) << 8 | 64;
2827 		mchbar_write32(PM_PDWN_CONFIG_ch(channel), r32);
2828 
2829 		mchbar_write32(PM_TRML_M_CONFIG_ch(channel), 0x00000aaa);
2830 	}
2831 
2832 	mchbar_write32(PM_BW_LIMIT_CONFIG, 0x5f7003ff);
2833 	if (IS_SANDY_CPU(ctrl->cpu))
2834 		mchbar_write32(PM_DLL_CONFIG, 0x000330f0);
2835 	else
2836 		mchbar_write32(PM_DLL_CONFIG, 0x00073000 | ctrl->mdll_wake_delay);
2837 
2838 	FOR_ALL_CHANNELS {
2839 		switch (ctrl->rankmap[channel]) {
2840 			/* Unpopulated channel */
2841 		case 0:
2842 			mchbar_write32(PM_CMD_PWR_ch(channel), 0);
2843 			break;
2844 			/* Only single-ranked dimms */
2845 		case 1:
2846 		case 4:
2847 		case 5:
2848 			mchbar_write32(PM_CMD_PWR_ch(channel), 0x00373131);
2849 			break;
2850 			/* Dual-ranked dimms present */
2851 		default:
2852 			mchbar_write32(PM_CMD_PWR_ch(channel), 0x009b6ea1);
2853 			break;
2854 		}
2855 	}
2856 
2857 	mchbar_write32(MEM_TRML_ESTIMATION_CONFIG, 0xca9171e5);
2858 	mchbar_clrsetbits32(MEM_TRML_THRESHOLDS_CONFIG, 0x00ffffff, 0x00e4d5d0);
2859 	mchbar_clrbits32(MEM_TRML_INTERRUPT, 0x1f);
2860 
2861 	FOR_ALL_CHANNELS {
2862 		union tc_rfp_reg tc_rfp = {
2863 			.raw = mchbar_read32(TC_RFP_ch(channel)),
2864 		};
2865 		tc_rfp.refresh_2x_control = 1;
2866 		mchbar_write32(TC_RFP_ch(channel), tc_rfp.raw);
2867 	}
2868 
2869 	mchbar_setbits32(MC_INIT_STATE_G, 1 << 0);
2870 	mchbar_setbits32(MC_INIT_STATE_G, 1 << 7);
2871 
2872 	/* Find a populated channel */
2873 	FOR_ALL_POPULATED_CHANNELS
2874 		break;
2875 
2876 	t1_cycles = (mchbar_read32(TC_ZQCAL_ch(channel)) >> 8) & 0xff;
2877 	r32 = mchbar_read32(PM_DLL_CONFIG);
2878 	if (r32 & (1 << 17))
2879 		t1_cycles += (r32 & 0xfff);
2880 	t1_cycles += mchbar_read32(TC_SRFTP_ch(channel)) & 0xfff;
2881 	t1_ns = t1_cycles * ctrl->tCK / 256 + 544;
2882 	if (!(r32 & (1 << 17)))
2883 		t1_ns += 500;
2884 
2885 	t2_ns = 10 * ((mchbar_read32(SAPMTIMERS) >> 8) & 0xfff);
2886 	if (mchbar_read32(SAPMCTL) & 8) {
2887 		t3_ns  = 10 * ((mchbar_read32(BANDTIMERS_IVB) >> 8) & 0xfff);
2888 		t3_ns += 10 * (mchbar_read32(SAPMTIMERS2_IVB) & 0xff);
2889 	} else {
2890 		t3_ns = 500;
2891 	}
2892 
2893 	/* The graphics driver will use these watermark values */
2894 	printk(BIOS_DEBUG, "t123: %d, %d, %d\n", t1_ns, t2_ns, t3_ns);
2895 	mchbar_clrsetbits32(SSKPD, 0x3f3f3f3f,
2896 		((encode_wm(t1_ns) + encode_wm(t2_ns)) << 16) | (encode_wm(t1_ns) << 8) |
2897 		((encode_wm(t3_ns) + encode_wm(t2_ns) + encode_wm(t1_ns)) << 24) | 0x0c);
2898 }
2899 
restore_timings(ramctr_timing * ctrl)2900 void restore_timings(ramctr_timing *ctrl)
2901 {
2902 	int channel;
2903 
2904 	FOR_ALL_POPULATED_CHANNELS {
2905 		const union tc_rap_reg tc_rap = {
2906 			.tRRD    = ctrl->tRRD,
2907 			.tRTP    = ctrl->tRTP,
2908 			.tCKE    = ctrl->tCKE,
2909 			.tWTR    = ctrl->tWTR,
2910 			.tFAW    = ctrl->tFAW,
2911 			.tWR     = ctrl->tWR,
2912 			.tCMD    = ctrl->cmd_stretch[channel],
2913 		};
2914 		mchbar_write32(TC_RAP_ch(channel), tc_rap.raw);
2915 	}
2916 
2917 	udelay(1);
2918 
2919 	FOR_ALL_POPULATED_CHANNELS {
2920 		wait_for_iosav(channel);
2921 	}
2922 
2923 	FOR_ALL_POPULATED_CHANNELS
2924 		mchbar_setbits32(TC_RWP_ch(channel), 1 << 27);
2925 
2926 	FOR_ALL_POPULATED_CHANNELS {
2927 		udelay(1);
2928 		mchbar_setbits32(SCHED_CBIT_ch(channel), 1 << 21);
2929 	}
2930 
2931 	printram("CPE\n");
2932 
2933 	mchbar_write32(GDCRTRAININGMOD, 0);
2934 	mchbar_write32(IOSAV_DC_MASK, 0);
2935 
2936 	printram("CP5b\n");
2937 
2938 	FOR_ALL_POPULATED_CHANNELS {
2939 		program_timings(ctrl, channel);
2940 	}
2941 
2942 	u32 reg, addr;
2943 
2944 	/* Poll for RCOMP */
2945 	while (!(mchbar_read32(RCOMP_TIMER) & (1 << 16)))
2946 		;
2947 
2948 	do {
2949 		reg = mchbar_read32(IOSAV_STATUS_ch(0));
2950 	} while ((reg & 0x14) == 0);
2951 
2952 	/* Set state of memory controller */
2953 	mchbar_write32(MC_INIT_STATE_G, 0x116);
2954 	mchbar_write32(MC_INIT_STATE, 0);
2955 
2956 	/* Wait 500us */
2957 	udelay(500);
2958 
2959 	FOR_ALL_CHANNELS {
2960 		/* Set valid rank CKE */
2961 		reg = 0;
2962 		reg = (reg & ~0x0f) | ctrl->rankmap[channel];
2963 		addr = MC_INIT_STATE_ch(channel);
2964 		mchbar_write32(addr, reg);
2965 
2966 		/* Wait 10ns for ranks to settle */
2967 		// udelay(0.01);
2968 
2969 		reg = (reg & ~0xf0) | (ctrl->rankmap[channel] << 4);
2970 		mchbar_write32(addr, reg);
2971 
2972 		/* Write reset using a NOP */
2973 		write_reset(ctrl);
2974 	}
2975 
2976 	/* MRS commands */
2977 	dram_mrscommands(ctrl);
2978 
2979 	printram("CP5c\n");
2980 
2981 	mchbar_write32(GDCRTRAININGMOD_ch(0), 0);
2982 
2983 	FOR_ALL_CHANNELS {
2984 		mchbar_clrbits32(GDCRCMDDEBUGMUXCFG_Cz_S(channel), 0x3f << 24);
2985 		udelay(2);
2986 	}
2987 }
2988