diff -u /sys/netinet/tcp_input.c ./netinet_eifel/tcp_input.c
--- /sys/netinet/tcp_input.c	Tue Apr 20 12:09:15 1999
+++ ./netinet_eifel/tcp_input.c	Thu Apr 27 12:38:37 2000
@@ -97,7 +97,12 @@
 static void	 tcp_pulloutofband __P((struct socket *,
	     struct tcpiphdr *, struct mbuf *));
 static int	 tcp_reass __P((struct tcpcb *, struct tcpiphdr *,
	     struct mbuf *));
+
+#ifdef EIFEL_RTO
+static void	 tcp_xmit_timer __P((struct tcpcb *, short, short));
+#else
 static void	 tcp_xmit_timer __P((struct tcpcb *, int));
+#endif
 
 /*
@@ -557,11 +562,23 @@
 				 */
 				++tcpstat.tcps_predack;
 				if ((to.to_flag & TOF_TS) != 0)
+#ifdef EIFEL_RTO
+					tcp_xmit_timer(tp,
+					    tcp_now - to.to_tsecr + 1,
+					    TCP_RTT_SAMPL_EVERY);
+#else
 					tcp_xmit_timer(tp,
 					    tcp_now - to.to_tsecr + 1);
+#endif
 				else if (tp->t_rtt &&
					    SEQ_GT(ti->ti_ack, tp->t_rtseq))
+#ifdef EIFEL_RTO
+					tcp_xmit_timer(tp,
+					    tp->t_rtt,
+					    TCP_RTT_SAMPL_ONCE);
+#else
 					tcp_xmit_timer(tp, tp->t_rtt);
+#endif
 				acked = ti->ti_ack - tp->snd_una;
 				tcpstat.tcps_rcvackpack++;
 				tcpstat.tcps_rcvackbyte += acked;
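The fast-path hunk above shows the two RTT sampling modes of the EIFEL_RTO patch: with the timestamp option, every ACK yields a sample computed as tcp_now - to.to_tsecr + 1 (TCP_RTT_SAMPL_EVERY); without timestamps, only one timed segment per flight yields a sample via t_rtt (TCP_RTT_SAMPL_ONCE). A minimal user-space sketch of the timestamp-based sample, with illustrative names that are not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/*
 * One RTT sample per ACK, derived from the echoed timestamp.
 * Mirrors "tcp_now - to.to_tsecr + 1" in the hunk above; the "+ 1"
 * keeps a same-tick echo from producing a zero sample, and the
 * unsigned subtraction handles timestamp wrap-around.
 */
static uint32_t
rtt_sample_from_tsecr(uint32_t tcp_now, uint32_t tsecr)
{
	return tcp_now - tsecr + 1;	/* in slow-timer ticks */
}

int
main(void)
{
	/* segment sent at tick 1000, its echo returns at tick 1003 */
	printf("sample = %u ticks\n", rtt_sample_from_tsecr(1003, 1000));
	return 0;
}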
@@ -1351,6 +1368,71 @@
 		    ti->ti_ack != tp->snd_una)
 			tp->t_dupacks = 0;
 		else if (++tp->t_dupacks == tcprexmtthresh) {
+#ifdef EIFEL_ALG
+			if (tp->t_rxtshift == 0) {
+				/*
+				 * Count this as the first retrans-
+				 * mission.
+				 * This ensures that old_snd_cwnd and
+				 * old_snd_ssthresh won't get over-
+				 * written in tcp_output() should
+				 * another retransmission (caused by
+				 * timeout) occur for this segment.
+				 * Also this provides for the code
+				 * that handles the response to a
+				 * detected spurious timeout to be
+				 * identical to the code that handles
+				 * the response to a detected spurious
+				 * fast retransmit (caused by packet
+				 * re-ordering). I.e., both cases are
+				 * detected and responded to in the
+				 * same way.
+				 */
+				tp->t_rxtshift++;
+
+				tp->old_snd_cwnd =
+					tp->snd_cwnd;
+				tp->old_snd_ssthresh =
+					tp->snd_ssthresh;
+
+				tp->t_timer[TCPT_REXMT] = 0;
+				tp->t_rtt = 0;
+
+				/* ti->ti_ack == tp->snd_una */
+				tp->snd_nxt = ti->ti_ack;
+				tp->snd_cwnd = tp->t_maxseg;
+				(void) tcp_output(tp);
+			} else {
+				/*
+				 * At least one timeout has
+				 * occurred before we got the 3rd
+				 * DUPACK. Thus, cwnd and ssthresh have
+				 * already been saved (we don't have
+				 * to save them again) but also
+				 * modified (slow start). Now that we
+				 * got the 3rd DUPACK we know better.
+				 * We can do congestion avoidance
+				 * instead. Therefore, we need to
+				 * first restore cwnd and ssthresh.
+				 */
+				tp->snd_cwnd =
+					tp->old_snd_cwnd;
+				tp->snd_ssthresh =
+					tp->old_snd_ssthresh;
+			}
+
+			tp->snd_nxt = tp->snd_max;
+			{
+				u_int win =
+				    min(tp->snd_wnd, tp->snd_cwnd)
+				    / 2 / tp->t_maxseg;
+				if (win < 2)
+					win = 2;
+				tp->snd_ssthresh = win * tp->t_maxseg;
+				tp->snd_cwnd = tp->snd_ssthresh +
+				    tp->t_maxseg * tp->t_dupacks;
+			}
+#else
 			tcp_seq onxt = tp->snd_nxt;
 			u_int win =
 			    min(tp->snd_wnd, tp->snd_cwnd) / 2 /
@@ -1368,6 +1450,8 @@
 			    tp->t_maxseg * tp->t_dupacks;
 			if (SEQ_GT(onxt, tp->snd_nxt))
 				tp->snd_nxt = onxt;
+#endif /* EIFEL_ALG */
+
 			goto drop;
 		} else if (tp->t_dupacks > tcprexmtthresh) {
 			tp->snd_cwnd += tp->t_maxseg;
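The EIFEL_ALG hunk above introduces the save-and-restore bookkeeping around the congestion-control state: old_snd_cwnd and old_snd_ssthresh are written exactly once, on the first retransmission of a segment, and restored wholesale if that retransmission later proves spurious. A compilable sketch of just that bookkeeping; the struct and function names are stand-ins, not the patch's:

#include <stdint.h>

/* Illustrative stand-in for the few tcpcb fields involved. */
struct cb {
	uint32_t snd_cwnd, snd_ssthresh;
	uint32_t old_snd_cwnd, old_snd_ssthresh;
	int	 t_rxtshift;
};

/* On the FIRST retransmission only: remember the clamp values. */
static void
save_clamp(struct cb *tp)
{
	if (tp->t_rxtshift == 1) {
		tp->old_snd_cwnd = tp->snd_cwnd;
		tp->old_snd_ssthresh = tp->snd_ssthresh;
	}
}

/* If that retransmission turns out spurious, undo the reduction. */
static void
restore_clamp(struct cb *tp)
{
	tp->snd_cwnd = tp->old_snd_cwnd;
	tp->snd_ssthresh = tp->old_snd_ssthresh;
}

int
main(void)
{
	struct cb tp = { 10240, 65535, 0, 0, 0 };

	tp.t_rxtshift = 1;		/* first retransmission */
	save_clamp(&tp);
	tp.snd_cwnd = 512;		/* timeout response: slow start */
	tp.snd_ssthresh = 5120;
	restore_clamp(&tp);		/* spurious: undo the reduction */
	return tp.snd_cwnd == 10240 ? 0 : 1;
}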
@@ -1413,6 +1497,77 @@
 	}
 
 process_ACK:
+
+#ifdef EIFEL_REXMT
+		/*
+		 * We don't have to remember the timestamps of acked
+		 * segments anymore. However, segments can be acked
+		 * partially so we have to account for that case, too.
+		 */
+		if (tp->ts_list) {
+			while ((tp->ts_list + tp->ts_snd_una)->len &&
+			    SEQ_GEQ(ti->ti_ack,
+			    ((tp->ts_list + tp->ts_snd_una)->seq +
+			    (tp->ts_list + tp->ts_snd_una)->len))) {
+				(tp->ts_list + tp->ts_snd_una)->len = 0;
+				tp->ts_snd_una++;
+				if (tp->ts_snd_una >= tp->ts_list_max)
+					tp->ts_snd_una = 0;
+			}
+		}
+		if (tp->ts_list && !(tp->ts_list + tp->ts_snd_max)->len) {
+			bzero((caddr_t)tp->ts_list, tp->ts_list_max
+			    * sizeof(struct ts_list_entry));
+			tp->ts_snd_una = tp->ts_snd_max = 0;
+		}
+#endif
+
+#ifdef EIFEL_ALG
+		/*
+		 * If timestamps are used and we are waiting for
+		 * an ACK after a retransmission and we detect
+		 * that the retransmission was spurious (caused
+		 * either by a spurious timeout or a spurious
+		 * fast retransmit) then we resume transmission
+		 * off the top.
+		 * In addition, if only a single spurious
+		 * retransmission occurred we restore ssthresh
+		 * and cwnd to their original values. If two
+		 * spurious timeouts occurred we cut cwnd in half,
+		 * and if more than two spurious timeouts occurred
+		 * we do nothing (cwnd will remain set to
+		 * tp->t_maxseg). I.e., the more spurious timeouts
+		 * occurred for that packet, the more trouble the
+		 * connection is in and the more conservative we
+		 * need to be.
+		 *
+		 * Because timestamps can wrap, we use the macro
+		 * that is used for sequence number comparisons.
+		 */
+		if (to.to_flag & TOF_TS &&
+		    SEQ_LT(tp->snd_nxt, tp->snd_max) &&
+		    SEQ_GT(tp->ts_first_rexmit, to.to_tsecr)) {
+
+			tp->snd_nxt = tp->snd_max;
+
+			if (tp->t_rxtshift == 1) {
+				tp->snd_cwnd = tp->old_snd_cwnd;
+				tp->snd_ssthresh = tp->old_snd_ssthresh;
+			} else if (tp->t_rxtshift == 2) {
+				u_int win = min(tp->snd_wnd,
+				    tp->old_snd_cwnd)
+				    / 2 / tp->t_maxseg;
+				if (win < 2) {
+					tp->snd_cwnd = tp->t_maxseg;
+					tp->snd_ssthresh = 2 * tp->t_maxseg;
+				} else {
+					tp->snd_cwnd = tp->snd_ssthresh =
+					    win * tp->t_maxseg;
+				}
+			}
+		}
+#endif /* EIFEL_ALG */
+
 		acked = ti->ti_ack - tp->snd_una;
 		tcpstat.tcps_rcvackpack++;
 		tcpstat.tcps_rcvackbyte += acked;
@@ -1427,9 +1582,19 @@
 		 * Recompute the initial retransmit timer.
 		 */
 		if (to.to_flag & TOF_TS)
+#ifdef EIFEL_RTO
+			tcp_xmit_timer(tp,
+			    tcp_now - to.to_tsecr + 1,
+			    TCP_RTT_SAMPL_EVERY);
+#else
 			tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1);
+#endif
 		else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
+#ifdef EIFEL_RTO
+			tcp_xmit_timer(tp, tp->t_rtt, TCP_RTT_SAMPL_ONCE);
+#else
 			tcp_xmit_timer(tp, tp->t_rtt);
+#endif
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
@@ -1441,7 +1606,26 @@
 			tp->t_timer[TCPT_REXMT] = 0;
 			needoutput = 1;
 		} else if (tp->t_timer[TCPT_PERSIST] == 0)
+#ifdef EIFEL_REXMT
+		{
+			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+
+			/*
+			 * Restart the REXMT with RTO minus the age of
+			 * the oldest unacked sequence number.
+			 */
+			if (tp->ts_list &&
+			    (tp->ts_list + tp->ts_snd_una)->len &&
+			    (tcp_now - (tp->ts_list + tp->ts_snd_una)->ts > 0))
+				tp->t_timer[TCPT_REXMT] -=
+				    (tcp_now - (tp->ts_list + tp->ts_snd_una)->ts);
+
+			if (tp->t_timer[TCPT_REXMT] < 1)
+				tp->t_timer[TCPT_REXMT] = 1;
+		}
+#else
 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+#endif
 
 		/*
 		 * If no data (only SYN) was ACK'd,
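The test in the EIFEL_ALG hunk above is the heart of the algorithm: if an ACK arrives while snd_nxt is still below snd_max (i.e., during a retransmission episode) and it echoes a timestamp older than the one sent with the first retransmission, the ACK must have been triggered by the original transmission, so the retransmission was spurious. A self-contained user-space rendering of the wrap-safe predicate; the function name and driver values are illustrative:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a > b" for 32-bit timestamps, like the SEQ_GT macro. */
#define TS_GT(a, b)	((int32_t)((a) - (b)) > 0)

/*
 * The Eifel test: an ACK that echoes a timestamp OLDER than the
 * timestamp of the first retransmission was sent in response to
 * the original transmission; the retransmission was spurious.
 */
static int
retransmit_was_spurious(uint32_t ts_first_rexmit, uint32_t tsecr)
{
	return TS_GT(ts_first_rexmit, tsecr);
}

int
main(void)
{
	/* original sent at tick 100, retransmitted at tick 160 */
	printf("%d\n", retransmit_was_spurious(160, 100)); /* 1: spurious */
	printf("%d\n", retransmit_was_spurious(160, 160)); /* 0: genuine  */
	return 0;
}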
@@ -1950,6 +2134,244 @@
 /*
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  */
+#ifdef EIFEL_RTO
+void
+tcp_xmit_timer(tp, rtt, sample)
+	register struct tcpcb *tp;
+	short rtt, sample;
+{
+	/*
+	 * We want to calculate as precisely as possible and
+	 * are using large scales. Thus, we need the shifter in
+	 * case t_srtt and t_rttvar are only 16-bit values.
+	 */
+	register int32_t shifter;
+	register u_short smooth_gain;
+	register u_short rttvar_weight = 3;
+
+#if (PR_SLOWHZ > 2)
+	register int32_t delta;
+#else
+	register int delta;
+#endif
+
+	tcpstat.tcps_rttupdated++;
+
+	/*
+	 * In the following we explain the new features of the
+	 * Eifel-Xmit-Timer:
+	 *
+	 * 1. We don't use the magic numbers (g = 1/8 and h = 1/4) as GAINs.
+	 * These constants cause SRTT and RTTVAR to scale poorly and cause
+	 * the RTO predictor to "fall into" the RTT quickly when the pipe
+	 * capacity gets large. Instead we use 1/(SSTHRESH + 1) (SSTHRESH
+	 * in multiples of the maximum segment size, and "+1" because the
+	 * first flight of packets in congestion avoidance is not SSTHRESH
+	 * but (SSTHRESH + 1)) as the GAIN for both moving averages as
+	 * long as we are network-limited. The motivation for this is that
+	 * we want to weigh the new DELTA proportionally to the number of
+	 * RTT samples we get per flight. Thus, (SSTHRESH + 1) is a good
+	 * (conservative) approximation for that (actually, (SSTHRESH + 1)
+	 * can be seen as a smoothed-cwnd signal). We do not want to use
+	 * the exact number of packets (samples) in flight, which is
+	 * (snd_max - snd_una). That signal is too noisy. So far we have
+	 * assumed that every packet is timed. If delayed ACKs are used we
+	 * multiply the GAIN by 2. However, at this point we have
+	 * not implemented a "delayed-ACK detection mechanism". Thus,
+	 * even if we receive delayed ACKs we do not multiply the GAIN
+	 * by 2 as described above. That is, however, how it should
+	 * eventually be done.
+	 * If we only get one sample per flight, we use the magic GAIN
+	 * of 1/3.
+	 *
+	 * For the same reasons given for the choice of GAIN, the factor
+	 * with which RTTVAR is weighed in the RTO calculation should be
+	 * the inverse of that GAIN.
+	 *
+	 * 2. We filter out negative DELTAs when calculating RTTVAR. The
+	 * original calculation uses the absolute value of DELTA, which
+	 * causes the RTO predictor to fire up when the signal (RTT) goes
+	 * down. This is not what we want and it contradicts basic control
+	 * theory. Instead, we are still conservative and assume a constant
+	 * RTTVAR as long as SRTT is above the RTT.
+	 *
+	 * 3. We put in "shock absorbers" to ensure that the RTO reacts
+	 * quickly to RTT increases but slowly when the RTT decreases. In
+	 * the latter case we use the square of the gain instead of just
+	 * the gain. The motivation for squaring is again that we want to
+	 * slow the RTO decrease with respect to the number of packets in
+	 * flight.
+	 *
+	 * 4. We always restart REXMT correctly, i.e., with RTO minus the
+	 * "age" of the oldest outstanding packet. Without that feature
+	 * the REXMT is always off by roughly one RTT, depending on various
+	 * factors (e.g., delayed ACKs, and interactive vs. bulk traffic).
+	 * We have implemented this feature as an independent patch called
+	 * EIFEL_REXMT.
+	 *
+	 * INTEGER ARITHMETIC AND SCALES:
+	 * In all our calculations we assume we are running on a machine
+	 * with a 10 ms timer granularity. In that case, however, t_srtt
+	 * and t_rttvar need to be at least 32 bits long. With a 500 ms
+	 * granularity 16 bits are sufficient because we only consider RTTs
+	 * of at most 64 seconds, which is 128 x 500 ms ticks.
+	 * We want the GAIN for RTTVAR and SRTT to scale to an SSTHRESH of
+	 * about 50 x MSS, i.e., more than 100 packets in flight. Beyond
+	 * that the RTO predictor will get more aggressive. Because we are
+	 * using the square of the gain at one point in the formula, we
+	 * need a GAIN_SCALE of 2^16. That way (1/50)^2 is still different
+	 * from (1/49)^2 in the integer arithmetic we are using below.
+	 *
+	 * More detail can be found in the Eifel-Xmit-Timer paper.
+	 */
+
+	/*
+	 * RTT == (rtt - 1):
+	 * There is no point in considering RTTs beyond TCPTV_REXMTMAX.
+	 * Such RTTs are possible! E.g., on wireless links that implement
+	 * very persistent link layer ARQ ;-)
+	 */
+	if ((rtt - 1) > TCPTV_REXMTMAX)
+		rtt = TCPTV_REXMTMAX + 1;
+
+	if ((sample == TCP_RTT_SAMPL_EVERY) ||
+	    (sample == TCP_RTT_SAMPL_EVERY_2)) {
+		/*
+		 * As long as snd_ssthresh has not been touched from its
+		 * initial value, we use snd_cwnd instead. The same is true
+		 * when cwnd grows beyond 2 * ssthresh.
+		 */
+		if (tp->snd_ssthresh == (TCP_MAXWIN << TCP_MAX_WINSHIFT))
+			rttvar_weight = min(tp->snd_wnd, tp->snd_cwnd)
+			    / 2 / tp->t_maxseg + 1;
+		else
+			rttvar_weight = max(min(tp->snd_wnd, tp->snd_cwnd)
+			    / 2 / tp->t_maxseg + 1,
+			    min(tp->snd_wnd, tp->snd_ssthresh)
+			    / tp->t_maxseg + 1);
+
+		/*
+		 * We have to smooth changes of t_wght. Otherwise it
+		 * can happen that the RTO skyrockets when t_wght
+		 * suddenly increases a lot. The idea here is to change
+		 * t_wght by one (unscaled) per RTT sample we get.
+		 * Actually, this should be made dependent on the RTT
+		 * sampling rate (once per flight, every packet, or
+		 * every 2nd packet) but ... it shouldn't matter too
+		 * much.
+		 */
+		if ((tp->t_wght >> TCP_RTT_SHIFT) < rttvar_weight)
+			tp->t_wght += TCP_RTT_SCALE;
+		else if ((tp->t_wght >> TCP_RTT_SHIFT) > rttvar_weight)
+			tp->t_wght -= TCP_RTT_SCALE;
+		rttvar_weight = tp->t_wght >> TCP_RTT_SHIFT;
+
+		/*
+		 * This has never been tested because we never
+		 * implemented a sender-side delayed-ACK
+		 * detection mechanism.
+		 */
+		/* if (sample == TCP_RTT_SAMPL_EVERY_2)
+			rttvar_weight >>= 1; */
+	}
+
+	/*
+	 * We also need magic numbers. "3" turns out to be a good
+	 * value with which we are safe against spurious timeouts.
+	 */
+	if ((rttvar_weight < 3) || (sample == TCP_RTT_SAMPL_ONCE))
+		rttvar_weight = 3;
+
+	/*
+	 * With a TCP_GAIN_SCALE of 2^16 and a minimum rttvar_weight
+	 * of 3, the maximum smooth_gain is 21845. This is
+	 * important to remember to make sure that we don't get
+	 * overflows.
+	 */
+	smooth_gain = TCP_GAIN_SCALE / rttvar_weight;
+
+	if (tp->t_srtt != 0) {
+		/*
+		 * DELTA = RTT - SRTT
+		 */
+		delta = ((rtt - 1) << TCP_RTT_SHIFT) - tp->t_srtt;
+
+		/*
+		 * SRTT += GAIN * DELTA
+		 */
+		shifter = smooth_gain * delta;
+		tp->t_srtt += shifter >> TCP_GAIN_SHIFT;
+
+		/*
+		 * t_srtt cannot be 0 because that is used to indicate
+		 * that no rtt measurement is available yet.
+		 */
+		if (tp->t_srtt <= 0)
+			tp->t_srtt = 1;
+
+		/*
+		 * RTTVAR += GAIN * (DELTA - RTTVAR)
+		 *
+		 * BUT:
+		 * A. Filter out negative DELTAs, leaving RTTVAR constant
+		 *    in that case.
+		 * B. Make the RTO come down slowly
+		 *    (the "shock absorber").
+		 */
+		if (delta >= 0) {
+			if (delta - tp->t_rttvar < 0)
+				smooth_gain = TCP_GAIN_SCALE
+				    / (rttvar_weight * rttvar_weight);
+			shifter = smooth_gain * (delta - tp->t_rttvar);
+			tp->t_rttvar += shifter >> TCP_GAIN_SHIFT;
+		}
+
+		if (tp->t_rttvar < 0)
+			tp->t_rttvar = 0;	/* no minimum */
+
+		/*
+		 * RTO = SRTT + 1/GAIN * RTTVAR
+		 */
+		shifter = (tp->t_srtt + (rttvar_weight * tp->t_rttvar))
+		    >> TCP_RTT_SHIFT;
+		tp->t_rto = shifter;
+	} else {
+		/*
+		 * Let's start conservatively. The first RTT we get
+		 * is most likely the SYN's RTT. On bandwidth-dominated
+		 * paths that RTT can be VERY different from that of
+		 * a data segment!
+		 * SRTT = RTT and RTTVAR = 4 * max(RTT, 1 tick)
+		 */
+		tp->t_srtt = (rtt - 1) << TCP_RTT_SHIFT;
+		tp->t_rttvar = max((rtt - 1), 1) << (TCP_RTTVAR_SHIFT + 2);
+	}
+	tp->t_rtt = 0;
+	tp->t_rxtshift = 0;
+
+	/*
+	 * The minimum for RTO should be RTT + 2 ticks.
+	 * If the RTT was zero (which it often is with
+	 * a timer resolution of 500 ms) we then get
+	 * a minimum of 500 to 1000 ms because of the
+	 * heartbeat timer which expires somewhere
+	 * between 0 and 500 ms.
+	 *
+	 * Remember: RTT = (rtt - 1)
+	 */
+	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
+	    (rtt + 1), TCPTV_REXMTMAX);
+
+	/*
+	 * We received an ack for a packet that wasn't retransmitted;
+	 * it is probably safe to discard any error indications we've
+	 * received recently. This isn't quite right, but close enough
+	 * for now (a route might have failed after we sent a segment,
+	 * and the return path might not be symmetrical).
+	 */
+	tp->t_softerror = 0;
+}
+
+#else /* EIFEL_RTO */
+
 static void
 tcp_xmit_timer(tp, rtt)
 	register struct tcpcb *tp;
@@ -2024,6 +2446,8 @@
 	tp->t_softerror = 0;
 }
 
+#endif /* EIFEL_RTO */
+
 /*
  * Determine a reasonable value for maxseg size.
  * If the route is known, check route for mtu.
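The GAIN_SCALE choice motivated in the comment above can be checked with plain integer arithmetic: with TCP_GAIN_SCALE = 2^16, the squared gains for neighboring weights such as 1/50 and 1/49 still map to distinct integers, which is exactly what the "shock absorber" needs. A quick stand-alone check, using only the patch's own constant:

#include <stdio.h>

#define TCP_GAIN_SCALE	65536	/* 2^16, as in the patch */

int
main(void)
{
	/* first-order gains 1/50 and 1/49, scaled */
	printf("gain(50)   = %d\n", TCP_GAIN_SCALE / 50);	 /* 1310 */
	printf("gain(49)   = %d\n", TCP_GAIN_SCALE / 49);	 /* 1337 */
	/* squared ("shock absorber") gains 1/50^2 and 1/49^2 */
	printf("gain(50^2) = %d\n", TCP_GAIN_SCALE / (50 * 50)); /* 26 */
	printf("gain(49^2) = %d\n", TCP_GAIN_SCALE / (49 * 49)); /* 27 */
	return 0;
}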
@@ -2186,6 +2610,26 @@
 	}
 	tp->t_maxseg = mss;
 
+#ifdef EIFEL_REXMT
+	/*
+	 * This array holds the timestamps of every unacked segment.
+	 */
+	tp->ts_list_max = so->so_snd.sb_hiwat / tp->t_maxseg;
+	if (tp->ts_list)
+		free((void *)tp->ts_list, M_TEMP);
+	tp->ts_list =
+	    (struct ts_list_entry *)malloc(tp->ts_list_max
+	    * sizeof(struct ts_list_entry),
+	    M_TEMP,
+	    M_DONTWAIT);
+	if (tp->ts_list)
+		bzero((caddr_t)tp->ts_list, tp->ts_list_max
+		    * sizeof(struct ts_list_entry));
+	else
+		tp->ts_list_max = 0;
+	tp->ts_snd_una = tp->ts_snd_max = 0;
+#endif
+
 #ifdef RTV_RPIPE
 	if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
 #endif
@@ -2229,3 +2673,4 @@
 
 	return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr);
 }
+
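The allocation above sizes one ts_list entry per potentially outstanding segment (send-buffer size divided by MSS). The list is used as a ring: ts_snd_una and ts_snd_max index the oldest and newest live entries, and len == 0 marks a free slot. Below is a user-space model of the ring's ACK-driven advance, mirroring the process_ACK loop earlier in this file; the driver values are made up:

#include <stdint.h>
#include <stdio.h>

struct ts_list_entry {		/* as declared in tcp_var.h below */
	uint32_t seq;		/* first sequence number of the segment */
	long	 len;		/* segment length; 0 == entry unused */
	uint32_t ts;		/* send time in ticks */
};

#define LIST_MAX 4

/* Drop entries fully covered by a cumulative ACK (cf. process_ACK). */
static unsigned
advance_una(struct ts_list_entry *list, unsigned una, uint32_t ack)
{
	while (list[una].len &&
	    (int32_t)(ack - (list[una].seq + list[una].len)) >= 0) {
		list[una].len = 0;	/* mark the entry free */
		una = (una + 1) % LIST_MAX;
	}
	return una;
}

int
main(void)
{
	struct ts_list_entry list[LIST_MAX] = {
		{ 1000, 512, 10 }, { 1512, 512, 11 }, { 2024, 512, 12 },
	};
	unsigned una = advance_una(list, 0, 2024);	/* acks first two */

	printf("ts_snd_una -> %u, oldest ts = %u\n", una, list[una].ts);
	return 0;
}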
diff -u /sys/netinet/tcp_output.c ./netinet_eifel/tcp_output.c
--- /sys/netinet/tcp_output.c	Wed Apr  7 15:25:52 1999
+++ ./netinet_eifel/tcp_output.c	Wed Oct 13 06:28:22 1999
@@ -62,6 +62,9 @@
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
+#ifdef EIFEL_REXMT
+#include <sys/malloc.h>
+#endif
 
 #ifdef notyet
 extern struct mbuf *m_copypack();
@@ -356,6 +359,17 @@
 	    (tp->t_flags & TF_RCVD_TSTMP))) {
 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
 
+#ifdef EIFEL_ALG
+		/*
+		 * Remember the timestamp of the FIRST retransmission.
+		 * This allows us to detect spurious retransmissions
+		 * later.
+		 */
+		if (SEQ_LT(tp->snd_nxt, tp->snd_max) && tp->t_rxtshift == 1) {
+			tp->ts_first_rexmit = tcp_now;
+		}
+#endif
+
 		/* Form timestamp option as shown in appendix A of RFC 1323. */
 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
 		*lp++ = htonl(tcp_now);
@@ -567,6 +581,143 @@
 		ti->ti_seq = htonl(tp->snd_nxt);
 	else
 		ti->ti_seq = htonl(tp->snd_max);
+
+#ifdef EIFEL_REXMT
+	/*
+	 * Remember the exact time at which each segment is sent.
+	 * This is needed when we *re-* start the REXMT timer for
+	 * every ACK for new data in tcp_input().
+	 *
+	 * We do this remembering in an array. If this is a
+	 * retransmission we do not need to create a new entry
+	 * but have to update the appropriate list entry. If we
+	 * don't find that entry, something weird happened
+	 * (repacketization?) and we mark the entire ts_list
+	 * as invalid. It will then be rebuilt with the next
+	 * new segments. Until then we have to do without the
+	 * feature of restarting REXMT precisely, i.e., REXMT
+	 * will be more conservative, which is not so bad. The same
+	 * happens if we don't get memory.
+	 */
+	if (tp->ts_list && len) {
+		/*
+		 * The last malloc() for tp->ts_list succeeded, and
+		 * the segment carries data, i.e., it is a segment
+		 * that might need to be retransmitted.
+		 */
+		if (SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
+			/*
+			 * This is not a retransmission.
+			 */
+			if (((tp->ts_snd_max + 1) % tp->ts_list_max) ==
+			    tp->ts_snd_una) {
+				/*
+				 * tp->ts_list is full. So make it twice as
+				 * large and keep the timestamps of those
+				 * packets that are currently unacked.
+				 */
+				struct ts_list_entry *ts_new_list, *tle;
+				size_t size = 2 * tp->ts_list_max * sizeof(*tle);
+
+				tle = ts_new_list = (struct ts_list_entry *)
+				    malloc(size, M_TEMP, M_DONTWAIT);
+				if (ts_new_list == NULL) {
+					free(tp->ts_list, M_TEMP);
+					tp->ts_list = NULL;
+					tp->ts_list_max = 0;
+					tp->ts_snd_una = tp->ts_snd_max = 0;
+				} else {
+					bzero((caddr_t)ts_new_list, size);
+					while (tp->ts_snd_una !=
+					    ((tp->ts_snd_max + 1) % tp->ts_list_max)) {
+						*tle++ = tp->ts_list[tp->ts_snd_una++];
+						if (tp->ts_snd_una >= tp->ts_list_max)
+							tp->ts_snd_una = 0;
+					}
+					free(tp->ts_list, M_TEMP);
+					tp->ts_list = ts_new_list;
+					tp->ts_snd_una = 0;
+					tp->ts_snd_max = tp->ts_list_max - 1;
+					tp->ts_list_max = 2 * tp->ts_list_max;
+				}
+			}
+
+			if (tp->ts_list) {
+				/*
+				 * Doubling the size of tp->ts_list went OK
+				 * and/or we have a free entry above
+				 * ts_snd_max.
+				 */
+				if ((tp->ts_list + tp->ts_snd_max)->len) {
+					/*
+					 * tp->ts_list is not empty.
+					 */
+					tp->ts_snd_max++;
+					if (tp->ts_snd_max >= tp->ts_list_max)
+						tp->ts_snd_max = 0;
+				} else
+					/*
+					 * This is the first valid entry in
+					 * tp->ts_list.
+					 */
+					tp->ts_snd_una = tp->ts_snd_max = 0;
+				(tp->ts_list + tp->ts_snd_max)->seq =
+				    ntohl(ti->ti_seq);
+				(tp->ts_list + tp->ts_snd_max)->len = len;
+				(tp->ts_list + tp->ts_snd_max)->ts = tcp_now;
+			}
+		} else {
+			/*
+			 * This is a retransmission and tp->ts_list is not
+			 * NULL. Find the entry that holds the timestamp of
+			 * the segment we are about to retransmit. Usually,
+			 * this is *(tp->ts_list + tp->ts_snd_una) but TCP
+			 * can do a go-back-N (e.g., after a spurious timeout
+			 * when the Eifel Algorithm is not used) so in that
+			 * case we have to look for the right entry and
+			 * update it.
+			 */
+			if ((tp->ts_list + tp->ts_snd_una)->len) {
+				/*
+				 * Do this only when tp->ts_list was not
+				 * marked as invalid on a pass through this
+				 * branch before.
+				 */
+				u_short idx = tp->ts_snd_una;
+				while ((idx != ((tp->ts_snd_max + 1)
+				    % tp->ts_list_max)) &&
+				    SEQ_LT((tp->ts_list + idx)->seq,
+				    tp->snd_nxt)) {
+					idx++;
+					if (idx >= tp->ts_list_max)
+						idx = 0;
+				}
+
+				if ((idx == ((tp->ts_snd_max + 1)
+				    % tp->ts_list_max)) ||
+				    (((tp->ts_list + idx)->seq !=
+				    tp->snd_nxt) ||
+				    ((tp->ts_list + idx)->len != len))) {
+					/*
+					 * If the entry was not found or does
+					 * not match the sequence number and
+					 * length of the segment we are going
+					 * to retransmit, something weird
+					 * happened (repacketization?) and we
+					 * mark tp->ts_list as invalid. It
+					 * will get rebuilt with the next new
+					 * packets.
+					 */
+					bzero((caddr_t)tp->ts_list,
+					    tp->ts_list_max
+					    * sizeof(struct ts_list_entry));
+					tp->ts_snd_una = tp->ts_snd_max = 0;
+				} else if ((tp->ts_list + idx)->len)
+					(tp->ts_list + idx)->ts = tcp_now;
+			}
+		}
+	}
+#endif
+
 	ti->ti_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, ti + 1, optlen);
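When the ring fills, the hunk above doubles it and compacts the live entries to the front, so ts_snd_una restarts at 0; if the allocation fails, the whole list is dropped and the precise-restart feature simply switches itself off. A user-space sketch of that grow step under the same rules, with calloc in place of the kernel malloc and stand-in names:

#include <stdlib.h>
#include <stdint.h>

struct ts_list_entry { uint32_t seq; long len; uint32_t ts; };

/*
 * Grow a full timestamp ring to twice its size, copying the live
 * entries (una..max, in ring order) to the front of the new array.
 * Returns the new array, or NULL when memory is short (the caller
 * then runs without the ring), and rewrites *unap, *maxp, *nmaxp,
 * just like the tcp_output() hunk above.
 */
static struct ts_list_entry *
grow_ring(struct ts_list_entry *old, unsigned *unap, unsigned *maxp,
    unsigned *nmaxp)
{
	unsigned n = *nmaxp, una = *unap, i = 0;
	struct ts_list_entry *new = calloc(2 * n, sizeof(*new));

	if (new == NULL) {
		free(old);
		*unap = *maxp = *nmaxp = 0;
		return NULL;
	}
	while (una != (*maxp + 1) % n) {	/* copy the live entries */
		new[i++] = old[una];
		una = (una + 1) % n;
	}
	free(old);
	*unap = 0;
	*maxp = n - 1;		/* n live entries occupy slots 0..n-1 */
	*nmaxp = 2 * n;
	return new;
}

int
main(void)
{
	unsigned una = 2, max = 1, nmax = 4;	/* full: (1 + 1) % 4 == 2 */
	struct ts_list_entry *l = calloc(nmax, sizeof(*l));
	unsigned i;

	if (l == NULL)
		return 1;
	for (i = 0; i < nmax; i++)
		l[(una + i) % nmax] =
		    (struct ts_list_entry){ 1000 + 512 * i, 512, i };
	l = grow_ring(l, &una, &max, &nmax);
	return (l && nmax == 8 && una == 0 && max == 3) ? 0 : 1;
}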
@@ -754,4 +905,9 @@
 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
+#ifdef EIFEL_FOREVER
+	if (tp->t_rxtshift == TCP_MAXRXTSHIFT)
+		tp->t_rxtshift--;
+#endif
 }
+
diff -u /sys/netinet/tcp_subr.c ./netinet_eifel/tcp_subr.c
--- /sys/netinet/tcp_subr.c	Wed Feb  3 22:40:28 1999
+++ ./netinet_eifel/tcp_subr.c	Wed Oct 13 05:58:36 1999
@@ -311,6 +311,12 @@
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
+
+#ifdef EIFEL_RTO
+	tp->t_wght = 3 << TCP_RTT_SHIFT;
+	tp->t_rto = TCPTV_RTOBASE;
+#endif
+
 	tp->t_rttmin = TCPTV_MIN;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
@@ -361,6 +367,11 @@
 	register struct rtentry *rt;
 	int dosavessthresh;
 
+#ifdef EIFEL_REXMT
+	if (tp->ts_list)
+		free(tp->ts_list, M_TEMP);
+#endif
+
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
@@ -759,3 +770,4 @@
 tcp_cleartaocache()
 {
 }
+
diff -u /sys/netinet/tcp_timer.c ./netinet_eifel/tcp_timer.c
--- /sys/netinet/tcp_timer.c	Fri Apr 24 02:25:35 1998
+++ ./netinet_eifel/tcp_timer.c	Wed Oct 13 06:14:35 1999
@@ -226,6 +226,10 @@
 	 * to a longer retransmit interval and retransmit one segment.
 	 */
 	case TCPT_REXMT:
+#ifdef EIFEL_FOREVER
+		if (tp->t_rxtshift == TCP_MAXRXTSHIFT)
+			tp->t_rxtshift--;
+#endif
 		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 			tp->t_rxtshift = TCP_MAXRXTSHIFT;
 			tcpstat.tcps_timeoutdrop++;
@@ -251,6 +255,12 @@
 			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 			tp->t_srtt = 0;
 		}
+#ifdef EIFEL_ALG
+		if (tp->t_rxtshift == 1) {
+			tp->old_snd_cwnd = tp->snd_cwnd;
+			tp->old_snd_ssthresh = tp->snd_ssthresh;
+		}
+#endif
 		tp->snd_nxt = tp->snd_una;
 		/*
 		 * Force a segment to be sent.
@@ -308,6 +318,10 @@
 	 * (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
+#ifdef EIFEL_FOREVER
+	if (tp->t_rxtshift == TCP_MAXRXTSHIFT)
+		tp->t_rxtshift--;
+#endif
 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (tp->t_idle >= tcp_maxpersistidle ||
 	    tp->t_idle >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
@@ -369,3 +383,4 @@
 	}
 	return (tp);
 }
+
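EIFEL_FOREVER, used in tcp_setpersist() above and twice in the timer code, keeps t_rxtshift pinned one below TCP_MAXRXTSHIFT so that neither the retransmit nor the persist path ever reaches the give-up threshold: the connection backs off to the maximum interval but is never dropped. A toy rendering of the retransmit-timer step; the constant value and function shape are illustrative:

#include <stdio.h>

#define TCP_MAXRXTSHIFT 12

/* Backoff step as in the REXMT timer, with the EIFEL_FOREVER clamp. */
static int
backoff_step(int t_rxtshift, int eifel_forever)
{
	if (eifel_forever && t_rxtshift == TCP_MAXRXTSHIFT)
		t_rxtshift--;	/* never reach the drop threshold */
	if (++t_rxtshift > TCP_MAXRXTSHIFT)
		t_rxtshift = TCP_MAXRXTSHIFT;	/* would drop conn here */
	return t_rxtshift;
}

int
main(void)
{
	int s = 11, i;

	for (i = 0; i < 3; i++) {
		s = backoff_step(s, 1);
		printf("t_rxtshift = %d (connection kept alive)\n", s);
	}
	return 0;
}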
diff -u /sys/netinet/tcp_var.h ./netinet_eifel/tcp_var.h
--- /sys/netinet/tcp_var.h	Wed Jan 20 09:32:00 1999
+++ ./netinet_eifel/tcp_var.h	Wed Oct 13 05:58:36 1999
@@ -40,7 +40,15 @@
  * Kernel variables for tcp.
  */
 
-/*
+#ifdef EIFEL_REXMT
+struct ts_list_entry {
+	tcp_seq	seq;
+	long	len;
+	u_long	ts;
+};
+#endif
+
+/*
  * Tcp control block, one per tcp; fields:
  * Organized for 16 byte cacheline efficiency.
  */
@@ -96,6 +104,12 @@
 					 * for slow start exponential to
 					 * linear switch
 					 */
+#ifdef EIFEL_ALG
+	u_long	old_snd_cwnd;
+	u_long	old_snd_ssthresh;
+	u_long	ts_first_rexmit;
+#endif
+
 	u_int	t_maxopd;		/* mss plus options */
 	u_int	t_idle;			/* inactivity time */
@@ -105,8 +119,19 @@
 	int	t_rxtcur;		/* current retransmit value */
 	u_int	t_maxseg;		/* maximum segment size */
-	int	t_srtt;			/* smoothed round-trip time */
+
+#ifdef EIFEL_RTO
+#if (PR_SLOWHZ > 2)
+	int32_t	t_srtt;			/* smoothed round-trip time */
+	int32_t	t_rttvar;		/* variance in round-trip time */
+#else
+	int	t_srtt;			/* smoothed round-trip time */
 	int	t_rttvar;		/* variance in round-trip time */
+#endif
+#else
+	int	t_srtt;			/* smoothed round-trip time */
+	int	t_rttvar;		/* variance in round-trip time */
+#endif
 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
 	u_int	t_rttmin;		/* minimum rtt allowed */
@@ -114,6 +139,32 @@
 	u_long	max_sndwnd;		/* largest window peer has offered */
 
 	int	t_softerror;		/* possible error not yet reported */
+
+#ifdef EIFEL_REXMT
+	struct ts_list_entry *ts_list;
+	u_short	ts_snd_una;
+	u_short	ts_snd_max;
+	u_short	ts_list_max;
+#endif
+
+#ifdef EIFEL_RTO
+/*
+ * We remember the RTO of the connection even though that is redundant
+ * info, as it can be recomputed from t_srtt, t_rttvar, and t_wght.
+ * However, it makes the code more readable!
+ */
+	short	t_rto;
+
+/*
+ * The weight factor for RTTVAR in the calculation of RTO:
+ */
+#if (PR_SLOWHZ > 2)
+	int32_t	t_wght;
+#else
+	short	t_wght;
+#endif
+#endif
+
 /* out-of-band data */
 	char	t_oobflags;		/* have some */
 	char	t_iobc;			/* input character */
@@ -171,6 +222,23 @@
 #define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
 #define	sototcpcb(so)	(intotcpcb(sotoinpcb(so)))
 
+#ifdef EIFEL_RTO
+
+#define	TCP_RTT_SAMPL_ONCE	0	/* RTT is measured once per flight */
+#define	TCP_RTT_SAMPL_EVERY	1	/* RTT is measured for every packet */
+#define	TCP_RTT_SAMPL_EVERY_2	2	/* with delayed ACKs */
+
+#define	TCP_RTT_SCALE		64
+#define	TCP_RTT_SHIFT		6
+#define	TCP_RTTVAR_SCALE	64
+#define	TCP_RTTVAR_SHIFT	6
+#define	TCP_GAIN_SCALE		65536
+#define	TCP_GAIN_SHIFT		16
+
+#define	TCP_REXMTVAL(tp)	(tp)->t_rto
+
+#else
+
 /*
  * The smoothed round-trip time and estimated variance
  * are stored as fixed point numbers scaled by the values below.
@@ -206,6 +274,8 @@
 	max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \
 	    + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
 
+#endif /* EIFEL_RTO */
+
 /*
  * TCP statistics.
  * Many of these should be kept per connection,
@@ -369,3 +439,4 @@
 #endif /* KERNEL */
 
 #endif /* _NETINET_TCP_VAR_H_ */
+
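With the scales defined above (TCP_RTT_SHIFT = 6, so t_srtt and t_rttvar are kept multiplied by 64), the Eifel RTO computed in tcp_xmit_timer() is RTO = (SRTT + weight * RTTVAR) >> TCP_RTT_SHIFT, and TCP_REXMTVAL(tp) simply reads the cached t_rto. A worked example with made-up tick values:

#include <stdio.h>
#include <stdint.h>

#define TCP_RTT_SHIFT	6	/* SRTT/RTTVAR scaled by 64 */

int
main(void)
{
	/* say SRTT = 8 ticks and RTTVAR = 2 ticks, stored scaled */
	int32_t t_srtt = 8 << TCP_RTT_SHIFT;	/* 512 */
	int32_t t_rttvar = 2 << TCP_RTT_SHIFT;	/* 128 */
	int rttvar_weight = 3;			/* the floor used above */

	/* RTO = SRTT + weight * RTTVAR, descaled (cf. tcp_xmit_timer) */
	int t_rto = (t_srtt + rttvar_weight * t_rttvar) >> TCP_RTT_SHIFT;
	printf("t_rto = %d ticks\n", t_rto);	/* 8 + 3*2 = 14 */
	return 0;
}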