aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorosdl.org!shemminger <osdl.org!shemminger>2004-04-15 20:56:59 +0000
committerosdl.org!shemminger <osdl.org!shemminger>2004-04-15 20:56:59 +0000
commitaba5acdfdb347d2c21fc67d613d83d4430ca3937 (patch)
tree20a89d844444d062bac7e2a945251068f8e39d18
parent86fdf0e47be697587efcf9602cd1f952a1d73170 (diff)
downloadplatform_external_iproute2-aba5acdfdb347d2c21fc67d613d83d4430ca3937.tar.gz
platform_external_iproute2-aba5acdfdb347d2c21fc67d613d83d4430ca3937.tar.bz2
platform_external_iproute2-aba5acdfdb347d2c21fc67d613d83d4430ca3937.zip
(Logical change 1.3)
-rw-r--r--Config2
-rw-r--r--Makefile77
-rw-r--r--Modules/Catalogue7
-rw-r--r--Modules/tcp_diag.c623
-rw-r--r--Patches/Catalogue46
-rw-r--r--Patches/af_unix.dif401
-rw-r--r--Patches/pidentd-3.0.12.dif270
-rw-r--r--Patches/rt_cache_stat.dif230
-rw-r--r--Patches/symbol_exports.dif56
-rw-r--r--README65
-rw-r--r--README.decnet41
-rw-r--r--README.iproute2+tc119
-rw-r--r--RELNOTES168
-rw-r--r--doc/Makefile57
-rw-r--r--doc/Plan16
-rw-r--r--doc/SNAPSHOT.tex1
-rw-r--r--doc/api-ip6-flowlabels.tex429
-rw-r--r--doc/arpd.sgml130
-rw-r--r--doc/do-psnup16
-rw-r--r--doc/ip-cref.tex3316
-rw-r--r--doc/ip-tunnels.tex469
-rw-r--r--doc/nstat.sgml110
-rw-r--r--doc/preamble.tex26
-rw-r--r--doc/rtstat.sgml52
-rw-r--r--doc/ss.sgml525
-rw-r--r--etc/iproute2/rt_dsfield13
-rw-r--r--etc/iproute2/rt_protos25
-rw-r--r--etc/iproute2/rt_realms13
-rw-r--r--etc/iproute2/rt_scopes11
-rw-r--r--etc/iproute2/rt_tables11
-rw-r--r--examples/SYN-DoS.rate.limit49
-rw-r--r--examples/cbqinit.eth176
-rw-r--r--examples/dhcp-client-script446
-rw-r--r--examples/diffserv/Edge168
-rw-r--r--examples/diffserv/Edge287
-rw-r--r--examples/diffserv/Edge31-ca-u32170
-rw-r--r--examples/diffserv/Edge31-cb-chains132
-rw-r--r--examples/diffserv/Edge32-ca-u32198
-rw-r--r--examples/diffserv/Edge32-cb-chains144
-rw-r--r--examples/diffserv/Edge32-cb-u32145
-rw-r--r--examples/diffserv/README98
-rw-r--r--examples/diffserv/afcbq105
-rw-r--r--examples/diffserv/ef-prio25
-rw-r--r--examples/diffserv/efcbq31
-rw-r--r--examples/diffserv/regression-testing125
-rw-r--r--include-glibc/bits/sockunion.h25
-rw-r--r--include-glibc/db.h10
-rw-r--r--include-glibc/glibc-bugs.h20
-rw-r--r--include-glibc/netinet/in.h11
-rw-r--r--include-glibc/netinet/ip.h9
-rw-r--r--include-glibc/socketbits.h270
-rw-r--r--include/SNAPSHOT.h1
-rw-r--r--include/libnetlink.h46
-rw-r--r--include/ll_map.h12
-rw-r--r--include/rt_names.h28
-rw-r--r--include/rtm_map.h10
-rw-r--r--include/tcp_diag.h119
-rw-r--r--include/utils.h104
-rw-r--r--ip/Makefile22
-rw-r--r--ip/ifcfg145
-rw-r--r--ip/ip.c167
-rw-r--r--ip/ip_common.h20
-rw-r--r--ip/ipaddress.c898
-rw-r--r--ip/iplink.c397
-rw-r--r--ip/ipmaddr.c342
-rw-r--r--ip/ipmonitor.c152
-rw-r--r--ip/ipmroute.c204
-rw-r--r--ip/ipneigh.c484
-rw-r--r--ip/iproute.c1410
-rw-r--r--ip/iprule.c323
-rw-r--r--ip/iptunnel.c581
-rw-r--r--ip/routef3
-rw-r--r--ip/routel60
-rw-r--r--ip/rtm_map.c116
-rw-r--r--ip/rtmon.c177
-rw-r--r--ip/rtpr4
-rw-r--r--lib/Makefile18
-rw-r--r--lib/dnet_ntop.c98
-rw-r--r--lib/dnet_pton.c71
-rw-r--r--lib/inet_ntop.c199
-rw-r--r--lib/inet_proto.c70
-rw-r--r--lib/inet_pton.c217
-rw-r--r--lib/ipx_ntop.c71
-rw-r--r--lib/ipx_pton.c107
-rw-r--r--lib/libnetlink.c521
-rw-r--r--lib/ll_addr.c91
-rw-r--r--lib/ll_map.c169
-rw-r--r--lib/ll_proto.c127
-rw-r--r--lib/ll_types.c128
-rw-r--r--lib/rt_names.c388
-rw-r--r--lib/utils.c528
-rw-r--r--misc/Makefile37
-rw-r--r--misc/arpd.c846
-rw-r--r--misc/ifstat.c729
-rw-r--r--misc/netbug53
-rw-r--r--misc/nstat.c614
-rw-r--r--misc/rtacct.c625
-rw-r--r--misc/rtstat.c172
-rw-r--r--misc/ss.c2672
-rw-r--r--misc/ssfilter.h21
-rw-r--r--misc/ssfilter.y274
-rw-r--r--tc/Makefile54
-rw-r--r--tc/README.last47
-rw-r--r--tc/f_fw.c116
-rw-r--r--tc/f_route.c175
-rw-r--r--tc/f_rsvp.c408
-rw-r--r--tc/f_tcindex.c186
-rw-r--r--tc/f_u32.c977
-rw-r--r--tc/m_estimator.c64
-rw-r--r--tc/m_police.c328
-rw-r--r--tc/q_atm.c268
-rw-r--r--tc/q_cbq.c555
-rw-r--r--tc/q_csz.c61
-rw-r--r--tc/q_dsmark.c186
-rw-r--r--tc/q_fifo.c101
-rw-r--r--tc/q_gred.c345
-rw-r--r--tc/q_hfsc.c61
-rw-r--r--tc/q_hpfq.c61
-rw-r--r--tc/q_ingress.c76
-rw-r--r--tc/q_prio.c127
-rw-r--r--tc/q_red.c222
-rw-r--r--tc/q_sfq.c115
-rw-r--r--tc/q_tbf.c272
-rw-r--r--tc/tc.c306
-rw-r--r--tc/tc_cbq.c57
-rw-r--r--tc/tc_cbq.h9
-rw-r--r--tc/tc_class.c361
-rw-r--r--tc/tc_common.h5
-rw-r--r--tc/tc_core.c85
-rw-r--r--tc/tc_core.h16
-rw-r--r--tc/tc_estimator.c44
-rw-r--r--tc/tc_filter.c388
-rw-r--r--tc/tc_qdisc.c353
-rw-r--r--tc/tc_red.c97
-rw-r--r--tc/tc_red.h8
-rw-r--r--tc/tc_util.c313
-rw-r--r--tc/tc_util.h57
137 files changed, 31144 insertions, 0 deletions
diff --git a/Config b/Config
index e69de29b..ca6cdcea 100644
--- a/Config
+++ b/Config
@@ -0,0 +1,2 @@
+TC_CONFIG_DIFFSERV=n
+TC_CONFIG_ATM=n
diff --git a/Makefile b/Makefile
index e69de29b..05063e77 100644
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,77 @@
+# Path to parent kernel include files directory
+DESTDIR=
+SBINDIR=/sbin
+CONFDIR=/etc/iproute2
+DOCDIR=/usr/doc/iproute2
+
+KERNEL_INCLUDE=/usr/src/linux/include
+LIBC_INCLUDE=/usr/include
+
+DEFINES= -DRESOLVE_HOSTNAMES
+
+#options if you have a bind>=4.9.4 libresolv (or, maybe, glibc)
+LDLIBS=-lresolv
+ADDLIB=
+
+#options if you compile with libc5, and without a bind>=4.9.4 libresolv
+#LDLIBS=
+#ADDLIB=inet_ntop.o inet_pton.o
+
+#options for decnet
+ADDLIB+=dnet_ntop.o dnet_pton.o
+
+#options for ipx
+ADDLIB+=ipx_ntop.o ipx_pton.o
+
+ifeq ($(LIBC_INCLUDE)/socketbits.h,$(wildcard $(LIBC_INCLUDE)/socketbits.h))
+ ifeq ($(LIBC_INCLUDE)/net/if_packet.h,$(wildcard $(LIBC_INCLUDE)/net/if_packet.h))
+ GLIBCFIX=-I../include-glibc -include ../include-glibc/glibc-bugs.h
+ endif
+endif
+ifeq ($(LIBC_INCLUDE)/bits/socket.h,$(wildcard $(LIBC_INCLUDE)/bits/socket.h))
+ GLIBCFIX=-I../include-glibc -I/usr/include/db3 -include ../include-glibc/glibc-bugs.h
+endif
+
+
+CC = gcc
+CCOPTS = -D_GNU_SOURCE -O2 -Wstrict-prototypes -Wall -g
+CFLAGS = $(CCOPTS) $(GLIBCFIX) -I$(KERNEL_INCLUDE) -I../include $(DEFINES)
+
+LDLIBS += -L../lib -lnetlink -lutil
+
+SUBDIRS=lib ip tc misc
+
+LIBNETLINK=../lib/libnetlink.a ../lib/libutil.a
+
+all: check-kernel
+ @set -e; \
+ for i in $(SUBDIRS); \
+ do $(MAKE) -C $$i; done
+
+check-kernel:
+ifeq ($(KERNEL_INCLUDE),)
+ @echo "Please, set correct KERNEL_INCLUDE"; false
+else
+ @set -e; \
+ if [ ! -r $(KERNEL_INCLUDE)/linux/autoconf.h ]; then \
+ echo "Please, compile the kernel first"; false; fi
+endif
+
+install: all
+ install -m 0755 -d $(DESTDIR)$(SBINDIR)
+ install -m 0755 -d $(DESTDIR)$(CONFDIR)
+ install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples
+ install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples/diffserv
+ install -m 0644 README.iproute2+tc $(shell find examples -type f -maxdepth 1) $(DESTDIR)$(DOCDIR)/examples
+ install -m 0644 $(shell echo examples/diffserv/*) $(DESTDIR)$(DOCDIR)/examples/diffserv
+ @for i in $(SUBDIRS) doc; do $(MAKE) -C $$i install; done
+ @cd etc/iproute2; for i in *; do \
+ if [ ! -e $(DESTDIR)$(CONFDIR)/$$i ]; then \
+ echo install -m 0644 $$i $(DESTDIR)$(CONFDIR); \
+ install -m 0644 $$i $(DESTDIR)$(CONFDIR); fi; done
+
+clean:
+ for i in $(SUBDIRS) doc; \
+ do $(MAKE) -C $$i clean; done
+
+.EXPORT_ALL_VARIABLES:
diff --git a/Modules/Catalogue b/Modules/Catalogue
index e69de29b..e5d2d0f2 100644
--- a/Modules/Catalogue
+++ b/Modules/Catalogue
@@ -0,0 +1,7 @@
+File: tcp_diag.c
+Status: desired for kernels < 2.4.17
+ not needed for kernels >= 2.4.17
+Description: adds tcpdiag facility to kernel to accelerate ss utility
+ and pidentd
+Side effects: none
+ \ No newline at end of file
diff --git a/Modules/tcp_diag.c b/Modules/tcp_diag.c
index e69de29b..e11e221d 100644
--- a/Modules/tcp_diag.c
+++ b/Modules/tcp_diag.c
@@ -0,0 +1,623 @@
+/*
+ * tcp_diag.c Module for monitoring TCP sockets.
+ *
+ * Version: $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include "tcp_diag.h"
+
+static struct sock *tcpnl;
+
+/* TCPDIAG_PUT: append an rtattr of type attrtype with attrlen payload bytes to skb and evaluate to a pointer to its payload; jumps to the enclosing function's nlmsg_failure label when skb lacks tailroom. */
+#define TCPDIAG_PUT(skb, attrtype, attrlen) \
+({ int rtalen = RTA_LENGTH(attrlen); \
+ struct rtattr *rta; \
+ if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
+ rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
+ rta->rta_type = attrtype; \
+ rta->rta_len = rtalen; \
+ RTA_DATA(rta); })
+/* Append one TCPDIAG_GETSOCK message describing sk to skb, plus the optional attributes selected by ext. */
+static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
+ int ext, u32 pid, u32 seq) /* ext: bit (TCPDIAG_MEMINFO-1) / (TCPDIAG_INFO-1) requests that attribute */
+{ /* Returns skb->len on success; on nlmsg_failure trims skb back to its length at entry and returns -1. */
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct tcpdiagmsg *r;
+ struct nlmsghdr *nlh;
+ struct tcp_info *info = NULL;
+ struct tcpdiag_meminfo *minfo = NULL;
+ unsigned char *b = skb->tail; /* start of this message; used for nlmsg_len and for rollback */
+
+ nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); /* NOTE(review): NLMSG_PUT presumably also jumps to nlmsg_failure when out of room - confirm */
+ r = NLMSG_DATA(nlh);
+ if (sk->state != TCP_TIME_WAIT) { /* optional attributes are only reserved for non-TIME_WAIT sockets */
+ if (ext & (1<<(TCPDIAG_MEMINFO-1)))
+ minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
+ if (ext & (1<<(TCPDIAG_INFO-1)))
+ info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
+ }
+ r->tcpdiag_family = sk->family;
+ r->tcpdiag_state = sk->state;
+ r->tcpdiag_timer = 0;
+ r->tcpdiag_retrans = 0;
+
+ r->id.tcpdiag_sport = sk->sport;
+ r->id.tcpdiag_dport = sk->dport;
+ r->id.tcpdiag_src[0] = sk->rcv_saddr;
+ r->id.tcpdiag_dst[0] = sk->daddr;
+ r->id.tcpdiag_if = sk->bound_dev_if;
+ *((struct sock **)&r->id.tcpdiag_cookie) = sk; /* the sock pointer itself is stored as the cookie */
+
+ if (r->tcpdiag_state == TCP_TIME_WAIT) {
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; /* sk is reinterpreted as a time-wait bucket */
+ long tmo = tw->ttd - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->tcpdiag_state = tw->substate;
+ r->tcpdiag_timer = 3; /* timer code 3 is used for time-wait entries */
+ r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; /* remaining jiffies converted to milliseconds, rounded up */
+ r->tcpdiag_rqueue = 0;
+ r->tcpdiag_wqueue = 0;
+ r->tcpdiag_uid = 0;
+ r->tcpdiag_inode = 0;
+#ifdef CONFIG_IPV6
+ if (r->tcpdiag_family == AF_INET6) {
+ memcpy(r->id.tcpdiag_src, &tw->v6_rcv_saddr, 16);
+ memcpy(r->id.tcpdiag_dst, &tw->v6_daddr, 16);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len; /* time-wait entries are complete at this point */
+ }
+
+#ifdef CONFIG_IPV6
+ if (r->tcpdiag_family == AF_INET6) {
+ memcpy(r->id.tcpdiag_src, &sk->net_pinfo.af_inet6.rcv_saddr, 16);
+ memcpy(r->id.tcpdiag_dst, &sk->net_pinfo.af_inet6.daddr, 16);
+ }
+#endif
+
+#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
+
+ if (tp->pending == TCP_TIME_RETRANS) {
+ r->tcpdiag_timer = 1; /* timer code 1: retransmit timer pending */
+ r->tcpdiag_retrans = tp->retransmits;
+ r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
+ } else if (tp->pending == TCP_TIME_PROBE0) {
+ r->tcpdiag_timer = 4; /* timer code 4: probe0 timer pending */
+ r->tcpdiag_retrans = tp->probes_out;
+ r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
+ } else if (timer_pending(&sk->timer)) {
+ r->tcpdiag_timer = 2; /* timer code 2: sk->timer pending */
+ r->tcpdiag_retrans = tp->probes_out;
+ r->tcpdiag_expires = EXPIRES_IN_MS(sk->timer.expires);
+ } else {
+ r->tcpdiag_timer = 0; /* timer code 0: no timer reported */
+ r->tcpdiag_expires = 0;
+ }
+#undef EXPIRES_IN_MS
+
+ r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+ r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
+ r->tcpdiag_uid = sock_i_uid(sk);
+ r->tcpdiag_inode = sock_i_ino(sk);
+
+ if (minfo) { /* only set when the MEMINFO attribute was reserved above */
+ minfo->tcpdiag_rmem = atomic_read(&sk->rmem_alloc);
+ minfo->tcpdiag_wmem = sk->wmem_queued;
+ minfo->tcpdiag_fmem = sk->forward_alloc;
+ minfo->tcpdiag_tmem = atomic_read(&sk->wmem_alloc);
+ }
+
+ if (info) { /* only set when the INFO attribute was reserved above */
+ u32 now = tcp_time_stamp;
+
+ info->tcpi_state = sk->state;
+ info->tcpi_ca_state = tp->ca_state;
+ info->tcpi_retransmits = tp->retransmits;
+ info->tcpi_probes = tp->probes_out;
+ info->tcpi_backoff = tp->backoff;
+ info->tcpi_options = 0;
+ if (tp->tstamp_ok)
+ info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ if (tp->sack_ok)
+ info->tcpi_options |= TCPI_OPT_SACK;
+ if (tp->wscale_ok) {
+ info->tcpi_options |= TCPI_OPT_WSCALE;
+ info->tcpi_snd_wscale = tp->snd_wscale;
+ info->tcpi_rcv_wscale = tp->rcv_wscale;
+ } else {
+ info->tcpi_snd_wscale = 0;
+ info->tcpi_rcv_wscale = 0;
+ }
+#ifdef CONFIG_INET_ECN
+ if (tp->ecn_flags&TCP_ECN_OK)
+ info->tcpi_options |= TCPI_OPT_ECN;
+#endif
+
+ info->tcpi_rto = (1000000*tp->rto)/HZ; /* jiffies scaled to microseconds */
+ info->tcpi_ato = (1000000*tp->ack.ato)/HZ;
+ info->tcpi_snd_mss = tp->mss_cache;
+ info->tcpi_rcv_mss = tp->ack.rcv_mss;
+
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+ info->tcpi_lost = tp->lost_out;
+ info->tcpi_retrans = tp->retrans_out;
+ info->tcpi_fackets = tp->fackets_out;
+
+ info->tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ; /* elapsed jiffies scaled to milliseconds */
+ info->tcpi_last_ack_sent = 0; /* always reported as 0 here */
+ info->tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
+ info->tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
+
+ info->tcpi_pmtu = tp->pmtu_cookie;
+ info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
+ info->tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3; /* NOTE(review): the >>3 presumably undoes a fixed-point scaling of srtt - confirm */
+ info->tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2; /* NOTE(review): likewise >>2 for mdev - confirm */
+ info->tcpi_snd_ssthresh = tp->snd_ssthresh;
+ info->tcpi_snd_cwnd = tp->snd_cwnd;
+ info->tcpi_advmss = tp->advmss;
+ info->tcpi_reordering = tp->reordering;
+ }
+
+ nlh->nlmsg_len = skb->tail - b; /* message length covers the header plus any attributes appended after it */
+ return skb->len;
+
+nlmsg_failure:
+ skb_trim(skb, b - skb->data); /* discard the partially built message */
+ return -1;
+}
+
+extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
+#ifdef CONFIG_IPV6
+extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
+ struct in6_addr *daddr, u16 dport,
+ int dif);
+#endif
+/* Handle a non-dump request: look up the single socket identified by the request and unicast its description back to the requester. */
+static int tcpdiag_get_exact(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+{ /* Returns 0 on success, -EINVAL for an unsupported family, -ENOENT if no socket matches, -ESTALE on cookie mismatch, -ENOMEM if the reply cannot be allocated, else the non-positive netlink_unicast() result. */
+ int err;
+ struct sock *sk;
+ struct tcpdiagreq *req = NLMSG_DATA(nlh);
+ struct sk_buff *rep;
+
+ if (req->tcpdiag_family == AF_INET) {
+ sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, /* the request's dst/dport go in the lookup's saddr/sport positions */
+ req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
+ req->id.tcpdiag_if);
+ }
+#ifdef CONFIG_IPV6
+ else if (req->tcpdiag_family == AF_INET6) {
+ sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
+ (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
+ req->id.tcpdiag_if);
+ }
+#endif
+ else {
+ return -EINVAL;
+ }
+
+ if (sk == NULL)
+ return -ENOENT;
+
+ err = -ESTALE;
+ if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || /* a cookie other than NOCOOKIE/NOCOOKIE must equal the socket pointer */
+ req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
+ sk != *((struct sock **)&req->id.tcpdiag_cookie[0]))
+ goto out;
+
+ err = -ENOMEM;
+ rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ /* room for the message, both optional attributes, and 64 spare bytes */
+ sizeof(struct tcpdiag_meminfo)+
+ sizeof(struct tcp_info)+64), GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
+ NETLINK_CB(in_skb).pid,
+ nlh->nlmsg_seq) <= 0)
+ BUG(); /* a fill failure into the freshly sized reply buffer is treated as a kernel bug */
+
+ err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+ if (err > 0) /* positive results are collapsed to 0 (success) */
+ err = 0;
+
+out:
+ if (sk) { /* NOTE(review): presumably drops the reference taken by the lookup - confirm */
+ if (sk->state == TCP_TIME_WAIT)
+ tcp_tw_put((struct tcp_tw_bucket*)sk);
+ else
+ sock_put(sk);
+ }
+ return err;
+}
+/* Compare the leading `bits` bits of the u32 arrays a1 and a2 (the partial-word mask is built in network byte order); returns 1 on match, 0 otherwise. */
+int bitstring_match(u32 *a1, u32 *a2, int bits)
+{
+ int words = bits >> 5; /* number of whole 32-bit words to compare */
+
+ bits &= 0x1f; /* bits left over in the final, partial word */
+
+ if (words) {
+ if (memcmp(a1, a2, words << 2)) /* words << 2 == byte count of the whole words */
+ return 0;
+ }
+ if (bits) {
+ __u32 w1, w2;
+ __u32 mask;
+
+ w1 = a1[words];
+ w2 = a2[words];
+
+ mask = htonl((0xffffffff) << (32 - bits)); /* top `bits` bits set, converted to network byte order */
+
+ if ((w1 ^ w2) & mask) /* any differing bit under the mask is a mismatch */
+ return 0;
+ }
+
+ return 1;
+}
+
+/* Run the filter bytecode bc (len bytes) against sk; returns 1 only if execution ends with exactly zero bytes left, 0 otherwise. */
+int tcpdiag_bc_run(char *bc, int len, struct sock *sk)
+{
+ while (len > 0) {
+ int yes = 1; /* result of the current op; selects whether to advance by op->yes or op->no */
+ struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
+
+ switch (op->code) {
+ case TCPDIAG_BC_NOP:
+ break;
+ case TCPDIAG_BC_JMP:
+ yes = 0; /* unconditional: always advance by op->no */
+ break;
+ case TCPDIAG_BC_S_GE:
+ yes = (sk->num >= op[1].no); /* the port operand is read from the `no` field of the following op slot */
+ break;
+ case TCPDIAG_BC_S_LE:
+ yes = (sk->num <= op[1].no);
+ break;
+ case TCPDIAG_BC_D_GE:
+ yes = (ntohs(sk->dport) >= op[1].no);
+ break;
+ case TCPDIAG_BC_D_LE:
+ yes = (ntohs(sk->dport) <= op[1].no);
+ break;
+ case TCPDIAG_BC_AUTO:
+ yes = !(sk->userlocks&SOCK_BINDPORT_LOCK);
+ break;
+ case TCPDIAG_BC_S_COND:
+ case TCPDIAG_BC_D_COND:
+ {
+ struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); /* host condition follows the op header */
+ u32 *addr;
+
+ if (cond->port != -1 && /* a port of -1 disables the port comparison */
+ cond->port != (op->code == TCPDIAG_BC_S_COND ? sk->num : ntohs(sk->dport))) {
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0) /* zero-length prefix: no address comparison, condition holds */
+ break;
+
+ if (sk->family == AF_INET6) {
+ if (op->code == TCPDIAG_BC_S_COND)
+ addr = (u32*)&sk->net_pinfo.af_inet6.rcv_saddr;
+ else
+ addr = (u32*)&sk->net_pinfo.af_inet6.daddr;
+ } else {
+ if (op->code == TCPDIAG_BC_S_COND)
+ addr = &sk->rcv_saddr;
+ else
+ addr = &sk->daddr;
+ }
+
+ if (bitstring_match(addr, cond->addr, cond->prefix_len))
+ break;
+ if (sk->family == AF_INET6 && cond->family == AF_INET) { /* let an IPv4 condition match an address of the form 0:0:0:0:0:ffff:a.b.c.d */
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == __constant_htonl(0xffff) &&
+ bitstring_match(addr+3, cond->addr, cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+ }
+
+ if (yes) {
+ len -= op->yes;
+ bc += op->yes;
+ } else {
+ len -= op->no;
+ bc += op->no;
+ }
+ }
+ return (len == 0); /* a jump past the end leaves len negative, which counts as "no match" */
+}
+/* Check that a position with cc bytes remaining is an op boundary: walk the program by its `yes` offsets and return 1 if some op starts with exactly cc bytes left, else 0. */
+int valid_cc(char *bc, int len, int cc)
+{
+ while (len >= 0) {
+ struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
+
+ if (cc > len) /* walked past the target without hitting it */
+ return 0;
+ if (cc == len)
+ return 1;
+ if (op->yes < 4) /* an offset below 4 is rejected */
+ return 0;
+ len -= op->yes;
+ bc += op->yes;
+ }
+ return 0;
+}
+/* Validate filter bytecode before it is run: every op code must be known, every `yes` offset (and every `no` offset of jump/conditional ops) must lie in [4, len+4], and an in-program `no` target must land on an op boundary. Returns 0 if valid, -EINVAL otherwise. */
+int tcpdiag_bc_audit(char *bytecode, int bytecode_len)
+{
+ char *bc = bytecode;
+ int len = bytecode_len;
+
+ while (len > 0) {
+ struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
+
+//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
+ switch (op->code) {
+ case TCPDIAG_BC_AUTO:
+ case TCPDIAG_BC_S_COND:
+ case TCPDIAG_BC_D_COND:
+ case TCPDIAG_BC_S_GE:
+ case TCPDIAG_BC_S_LE:
+ case TCPDIAG_BC_D_GE:
+ case TCPDIAG_BC_D_LE:
+ case TCPDIAG_BC_JMP: /* JMP shares the `yes` check: the loop below advances by op->yes for every op, so an unchecked yes == 0 would spin forever */
+ if (op->yes < 4 || op->yes > len+4)
+ return -EINVAL;
+ if (op->no < 4 || op->no > len+4)
+ return -EINVAL;
+ if (op->no < len && /* a `no` target inside the program must be the start of an op */
+ !valid_cc(bytecode, bytecode_len, len-op->no))
+ return -EINVAL;
+ break;
+ case TCPDIAG_BC_NOP:
+ if (op->yes < 4 || op->yes > len+4)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ bc += op->yes; /* the audit scans linearly along the `yes` offsets */
+ len -= op->yes;
+ }
+ return len == 0 ? 0 : -EINVAL; /* the scan must end exactly at the end of the program */
+}
+
+/* Netlink dump callback: append a message to skb for every socket matching the request, resuming from the position saved in cb->args; always returns skb->len. */
+int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{ /* cb->args[0]: 0 until the listening table has been handled; cb->args[1]/args[2]: bucket index and in-bucket position to resume from. */
+ int i, num;
+ int s_i, s_num;
+ struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
+ struct rtattr *bc = NULL;
+
+ if (cb->nlh->nlmsg_len > 4+NLMSG_SPACE(sizeof(struct tcpdiagreq))) /* data beyond the fixed request is treated as the bytecode attribute */
+ bc = (struct rtattr*)(r+1);
+
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
+ goto skip_listen_ht;
+ tcp_listen_lock();
+ for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
+ struct sock *sk = tcp_listening_hash[i];
+
+ if (i > s_i) /* the saved in-bucket position applies only to the first bucket visited */
+ s_num = 0;
+
+ for (sk = tcp_listening_hash[i], num = 0;
+ sk != NULL;
+ sk = sk->next, num++) {
+ if (num < s_num) /* skip entries before the resume position */
+ continue;
+ if (!(r->tcpdiag_states&TCPF_LISTEN) || /* listeners are skipped unless LISTEN is requested and no dport filter is set */
+ r->id.tcpdiag_dport)
+ continue;
+ if (r->id.tcpdiag_sport != sk->sport && r->id.tcpdiag_sport)
+ continue;
+ if (bc && !tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), sk))
+ continue;
+ if (tcpdiag_fill(skb, sk, r->tcpdiag_ext,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq) <= 0) { /* fill failed: stop and record the position at `done` */
+ tcp_listen_unlock();
+ goto done;
+ }
+ }
+ }
+ tcp_listen_unlock();
+skip_listen_ht:
+ cb->args[0] = 1; /* listening table finished; continue with the ehash table from its start */
+ s_i = num = s_num = 0;
+ }
+
+ if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) /* no other states requested: nothing more to dump */
+ return skb->len;
+
+ for (i = s_i; i < tcp_ehash_size; i++) {
+ struct tcp_ehash_bucket *head = &tcp_ehash[i];
+ struct sock *sk;
+
+ if (i > s_i)
+ s_num = 0;
+
+ read_lock_bh(&head->lock);
+
+ for (sk = head->chain, num = 0;
+ sk != NULL;
+ sk = sk->next, num++) {
+ if (num < s_num)
+ continue;
+ if (!(r->tcpdiag_states&(1<<sk->state)))
+ continue;
+ if (r->id.tcpdiag_sport != sk->sport && r->id.tcpdiag_sport)
+ continue;
+ if (r->id.tcpdiag_dport != sk->dport && r->id.tcpdiag_dport)
+ continue;
+ if (bc && !tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), sk))
+ continue;
+ if (tcpdiag_fill(skb, sk, r->tcpdiag_ext,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq) <= 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+ }
+
+ if (r->tcpdiag_states&TCPF_TIME_WAIT) {
+ for (sk = tcp_ehash[i+tcp_ehash_size].chain; /* second chain lives at index i+tcp_ehash_size; num keeps counting on from the first chain */
+ sk != NULL;
+ sk = sk->next, num++) {
+ if (num < s_num)
+ continue;
+ if (!(r->tcpdiag_states&(1<<sk->zapped))) /* NOTE(review): sk->zapped is used as the state here; presumably it overlays the time-wait substate - confirm */
+ continue;
+ if (r->id.tcpdiag_sport != sk->sport && r->id.tcpdiag_sport)
+ continue;
+ if (r->id.tcpdiag_dport != sk->dport && r->id.tcpdiag_dport)
+ continue;
+ if (bc && !tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), sk))
+ continue;
+ if (tcpdiag_fill(skb, sk, r->tcpdiag_ext,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq) <= 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+ }
+ }
+ read_unlock_bh(&head->lock);
+ }
+
+done:
+ cb->args[1] = i; /* save the resume position for the next invocation */
+ cb->args[2] = num;
+ return skb->len;
+}
+/* Dump completion callback handed to netlink_dump_start(); it does nothing and returns 0. */
+static int tcpdiag_dump_done(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+/* Validate and dispatch one request: NLM_F_DUMP requests start a netlink dump, all others do an exact lookup. Returns 0 for non-request messages, -EINVAL for malformed ones, else the handler's result. */
+static __inline__ int
+tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) /* messages without NLM_F_REQUEST are ignored */
+ return 0;
+
+ if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
+ goto err_inval;
+
+ if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) /* too short to hold a full request */
+ goto err_inval;
+
+ if (nlh->nlmsg_flags&NLM_F_DUMP) {
+ if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { /* trailing data must be a well-formed bytecode attribute */
+ struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
+ if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
+ rta->rta_len < 8 ||
+ rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
+ goto err_inval;
+ if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) /* reject bytecode that fails the audit before any dump runs it */
+ goto err_inval;
+ }
+ return netlink_dump_start(tcpnl, skb, nlh,
+ tcpdiag_dump,
+ tcpdiag_dump_done);
+ } else {
+ return tcpdiag_get_exact(skb, nlh);
+ }
+
+err_inval:
+ return -EINVAL;
+}
+
+/* Process the netlink message at the start of skb; skbs with a truncated or inconsistent header are dropped silently, and an ack is sent only when the handler returns non-zero. */
+static __inline__ void tcpdiag_rcv_skb(struct sk_buff *skb) /* static, not extern: gnu89 "extern inline" emits no out-of-line body, so a non-inlined call would fail to link */
+{
+ int err;
+ struct nlmsghdr * nlh;
+
+ if (skb->len >= NLMSG_SPACE(0)) { /* need at least a full netlink header */
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) /* claimed length must fit the header and the skb */
+ return;
+ err = tcpdiag_rcv_msg(skb, nlh);
+ if (err)
+ netlink_ack(skb, nlh, err);
+ }
+}
+/* Input callback given to netlink_kernel_create(): drain sk's receive queue, processing and then freeing each skb. */
+static void tcpdiag_rcv(struct sock *sk, int len)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
+ tcpdiag_rcv_skb(skb);
+ kfree_skb(skb);
+ }
+}
+/* Module init: create the NETLINK_TCPDIAG kernel socket with tcpdiag_rcv as its input handler; returns 0, or -EBUSY if creation fails. */
+static int __init tcpdiag_init(void)
+{
+ tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
+ if (tcpnl == NULL)
+ return -EBUSY;
+ return 0;
+}
+/* Module exit: log a caution message, then release the kernel netlink socket if one was created. */
+static void __exit tcpdiag_exit(void)
+{
+ printk(KERN_INFO "Caution: unloading tcp_diag is not very well supported. Nothing to worry, but yet.\n");
+ if (tcpnl)
+ sock_release(tcpnl->socket);
+}
+
+module_init(tcpdiag_init);
+module_exit(tcpdiag_exit);
+
+/*
+ * Local variables:
+ * compile-command: "gcc -DMOPS -DMODULE -D__KERNEL__ -I../include -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -c tcp_diag.c"
+ * End:
+ */
diff --git a/Patches/Catalogue b/Patches/Catalogue
index e69de29b..8e192791 100644
--- a/Patches/Catalogue
+++ b/Patches/Catalogue
@@ -0,0 +1,46 @@
+File: rt_cache_stat.dif
+Apply to: kernel < 2.4.7
+Status: recommended for kernels < 2.4.7.
+ already present in >= 2.4.7
+Description: tracing efficiency of routing cache
+Side effects: none
+
+File: pidentd-3.0.12.dif
+Apply to: pidentd-3.0.12 tree, e.g. from a Red Hat RPM
+Status: highly recommended
+Description: Patch to pidentd allowing to use tcpdiag facility and fixing
+ some bugs in original pident.
+Side effects: none. Does not break anything not depending on kernel version,
+ even if tcpdiag is absent.
+Advice: not related to this patch, but it should be said anyway.
+ Do NOT configure pidentd to use threads! Use option
+ "--without-threads" when doing "configure".
+ pidentd is typical example of application where
+ threading results in nothing but collapse of performance.
+ Apparently author learned thread programming and decided
+ to apply new knowledge to the first victim.
+
+File: symbol_exports.dif
+Apply to: kernel < 2.4.17
+Status: desired for kernels < 2.4.17
+ not needed for kernels >= 2.4.17
+Description: exports symbols required to load tcpdiag module
+ tcpdiag is builtin since 2.4.17, hence the exports
+ are redundant.
+Side effects: none
+
+File: af_unix.dif
+Apply to: kernel
+Status: recommended
+Description: implements fragmented skb for unix sockets reducing
+ vm pressure for datagram sockets and adds to /proc/net/unix
+ columns allowing to monitor recv/send memory and identify
+ peer of connected sockets.
+Side effects: "lsof" complains about unix sockets.
+ Not a big loss, lsof is not able to tell anything more
+ clever than "can't identify protocol" for sockets anyway.
+Note: the patch touches an area where one or two lines changed
+ several times during 2.4. It does not depend on them,
+ but unfortunately it may be rejected. It applies cleanly to
+ 2.4.17.
+
diff --git a/Patches/af_unix.dif b/Patches/af_unix.dif
index e69de29b..0e48a172 100644
--- a/Patches/af_unix.dif
+++ b/Patches/af_unix.dif
@@ -0,0 +1,401 @@
+diff -ur ../vger3-011229/linux/net/unix/af_unix.c linux/net/unix/af_unix.c
+--- ../vger3-011229/linux/net/unix/af_unix.c Mon Dec 3 20:24:03 2001
++++ linux/net/unix/af_unix.c Sat Jan 5 04:30:19 2002
+@@ -112,6 +112,7 @@
+ #include <asm/checksum.h>
+
+ int sysctl_unix_max_dgram_qlen = 10;
++int sysctl_unix_stream_pages = MAX_SKB_FRAGS;
+
+ unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
+ rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
+@@ -1123,9 +1124,6 @@
+ struct scm_cookie scm;
+ memset(&scm, 0, sizeof(scm));
+ unix_detach_fds(&scm, skb);
+-
+- /* Alas, it calls VFS */
+- /* So fscking what? fput() had been SMP-safe since the last Summer */
+ scm_destroy(&scm);
+ sock_wfree(skb);
+ }
+@@ -1140,6 +1138,67 @@
+ scm->fp = NULL;
+ }
+
++int datagram_copy_fromiovec(struct iovec *iov, struct sk_buff *skb, int size)
++{
++ struct sock *sk;
++ struct sk_buff **tail, *skb1;
++ int copy = min_t(int, size, skb_tailroom(skb));
++
++ if (memcpy_fromiovec(skb_put(skb, copy), iov, copy))
++ goto do_fault;
++
++ if ((size -= copy) == 0)
++ return 0;
++
++ sk = skb->sk;
++ skb1 = skb;
++ tail = &skb_shinfo(skb)->frag_list;
++
++ do {
++ struct page *page;
++ int i = skb_shinfo(skb1)->nr_frags;
++
++ if (i == MAX_SKB_FRAGS) {
++ skb1 = alloc_skb(0, sk->allocation);
++ if (skb1 == NULL)
++ goto do_oom;
++ *tail = skb1;
++ tail = &skb1->next;
++ i = 0;
++ skb->truesize += skb1->truesize;
++ atomic_add(skb1->truesize, &sk->wmem_alloc);
++ }
++
++ page = alloc_pages(sk->allocation, 0);
++ if (page == NULL)
++ goto do_oom;
++
++ copy = min_t(int, size, PAGE_SIZE);
++ skb_shinfo(skb1)->nr_frags=i+1;
++ skb_shinfo(skb1)->frags[i].page = page;
++ skb_shinfo(skb1)->frags[i].page_offset = 0;
++ skb_shinfo(skb1)->frags[i].size = copy;
++
++ skb1->len += copy;
++ skb1->data_len += copy;
++ if (skb != skb1) {
++ skb->len += copy;
++ skb->data_len += copy;
++ }
++ skb->truesize += PAGE_SIZE;
++ atomic_add(PAGE_SIZE, &sk->wmem_alloc);
++ if (memcpy_fromiovec(page_address(page), iov, copy))
++ goto do_fault;
++ } while ((size -= copy) > 0);
++ return 0;
++
++do_oom:
++ return -ENOMEM;
++
++do_fault:
++ return -EFAULT;
++}
++
+ /*
+ * Send AF_UNIX data.
+ */
+@@ -1155,6 +1214,7 @@
+ unsigned hash;
+ struct sk_buff *skb;
+ long timeo;
++ int alloc;
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags&MSG_OOB)
+@@ -1178,10 +1238,14 @@
+ goto out;
+
+ err = -EMSGSIZE;
+- if ((unsigned)len > sk->sndbuf - 32)
++ if ((unsigned)len > sk->sndbuf)
+ goto out;
+
+- skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
++ alloc = len;
++ if (alloc > SKB_MAX_HEAD(0))
++ alloc = SKB_MAX_HEAD(0);
++
++ skb = sock_alloc_send_skb(sk, alloc, msg->msg_flags&MSG_DONTWAIT, &err);
+ if (skb==NULL)
+ goto out;
+
+@@ -1190,7 +1254,7 @@
+ unix_attach_fds(scm, skb);
+
+ skb->h.raw = skb->data;
+- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
++ err = datagram_copy_fromiovec(msg->msg_iov, skb, len);
+ if (err)
+ goto out_free;
+
+@@ -1275,74 +1339,57 @@
+ return err;
+ }
+
+-
+ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
+ struct scm_cookie *scm)
+ {
+ struct sock *sk = sock->sk;
+ unix_socket *other = NULL;
+- struct sockaddr_un *sunaddr=msg->msg_name;
+- int err,size;
+ struct sk_buff *skb;
++ int err;
+ int sent=0;
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags&MSG_OOB)
+ goto out_err;
+
+- if (msg->msg_namelen) {
+- err = (sk->state==TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP);
++ err = -ENOTCONN;
++ other = unix_peer_get(sk);
++ if (!other)
+ goto out_err;
+- } else {
+- sunaddr = NULL;
+- err = -ENOTCONN;
+- other = unix_peer_get(sk);
+- if (!other)
+- goto out_err;
+- }
+
+ if (sk->shutdown&SEND_SHUTDOWN)
+ goto pipe_err;
+
+- while(sent < len)
+- {
+- /*
+- * Optimisation for the fact that under 0.01% of X messages typically
+- * need breaking up.
+- */
++ while(sent < len) {
++ int size, alloc;
+
+- size=len-sent;
++ size = len-sent;
+
+ /* Keep two messages in the pipe so it schedules better */
+- if (size > sk->sndbuf/2 - 64)
+- size = sk->sndbuf/2 - 64;
++ if (size > sk->sndbuf/2)
++ size = sk->sndbuf/2;
+
+- if (size > SKB_MAX_ALLOC)
+- size = SKB_MAX_ALLOC;
+-
+ /*
+ * Grab a buffer
+ */
+-
+- skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
++ alloc = size;
++
++ if (size > SKB_MAX_HEAD(0)) {
++ alloc = SKB_MAX_HEAD(0);
++ if (size > alloc + sysctl_unix_stream_pages*PAGE_SIZE)
++ size = alloc + sysctl_unix_stream_pages*PAGE_SIZE;
++ }
++
++ skb=sock_alloc_send_skb(sk,alloc,msg->msg_flags&MSG_DONTWAIT, &err);
+
+ if (skb==NULL)
+ goto out_err;
+
+- /*
+- * If you pass two values to the sock_alloc_send_skb
+- * it tries to grab the large buffer with GFP_NOFS
+- * (which can fail easily), and if it fails grab the
+- * fallback size buffer which is under a page and will
+- * succeed. [Alan]
+- */
+- size = min_t(int, size, skb_tailroom(skb));
+-
+ memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred));
+ if (scm->fp)
+ unix_attach_fds(scm, skb);
+
+- if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
++ if ((err = datagram_copy_fromiovec(msg->msg_iov, skb, size)) != 0) {
+ kfree_skb(skb);
+ goto out_err;
+ }
+@@ -1418,13 +1465,10 @@
+
+ scm->creds = *UNIXCREDS(skb);
+
+- if (!(flags & MSG_PEEK))
+- {
++ if (!(flags & MSG_PEEK)) {
+ if (UNIXCB(skb).fp)
+ unix_detach_fds(scm, skb);
+- }
+- else
+- {
++ } else {
+ /* It is questionable: on PEEK we could:
+ - do not return fds - good, but too simple 8)
+ - return fds, and do not return them on read (old strategy,
+@@ -1483,13 +1527,10 @@
+ return timeo;
+ }
+
+-
+-
+ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size,
+ int flags, struct scm_cookie *scm)
+ {
+ struct sock *sk = sock->sk;
+- struct sockaddr_un *sunaddr=msg->msg_name;
+ int copied = 0;
+ int check_creds = 0;
+ int target;
+@@ -1515,21 +1556,18 @@
+
+ down(&sk->protinfo.af_unix.readsem);
+
+- do
+- {
++ do {
+ int chunk;
+ struct sk_buff *skb;
+
+ skb=skb_dequeue(&sk->receive_queue);
+- if (skb==NULL)
+- {
++ if (skb==NULL) {
+ if (copied >= target)
+ break;
+
+ /*
+ * POSIX 1003.1g mandates this order.
+ */
+-
+ if ((err = sock_error(sk)) != 0)
+ break;
+ if (sk->shutdown & RCV_SHUTDOWN)
+@@ -1551,60 +1589,44 @@
+
+ if (check_creds) {
+ /* Never glue messages from different writers */
+- if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) {
+- skb_queue_head(&sk->receive_queue, skb);
+- break;
+- }
++ if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0)
++ goto out_put_back;
+ } else {
+ /* Copy credentials */
+ scm->creds = *UNIXCREDS(skb);
+ check_creds = 1;
+ }
+
+- /* Copy address just once */
+- if (sunaddr)
+- {
+- unix_copy_addr(msg, skb->sk);
+- sunaddr = NULL;
+- }
++ chunk = min_t(int, skb->len - sk->protinfo.af_unix.copied, size);
++ err = skb_copy_datagram_iovec(skb, sk->protinfo.af_unix.copied, msg->msg_iov, chunk);
++ if (err)
++ goto out_put_back;
+
+- chunk = min_t(unsigned int, skb->len, size);
+- if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+- skb_queue_head(&sk->receive_queue, skb);
+- if (copied == 0)
+- copied = -EFAULT;
+- break;
+- }
+ copied += chunk;
+ size -= chunk;
+
+ /* Mark read part of skb as used */
+- if (!(flags & MSG_PEEK))
+- {
+- skb_pull(skb, chunk);
+-
++ if (!(flags & MSG_PEEK)) {
+ if (UNIXCB(skb).fp)
+ unix_detach_fds(scm, skb);
+
+ /* put the skb back if we didn't use it up.. */
+- if (skb->len)
+- {
+- skb_queue_head(&sk->receive_queue, skb);
+- break;
+- }
++ if ((sk->protinfo.af_unix.copied += chunk) < skb->len)
++ goto out_put_back;
++
++ sk->protinfo.af_unix.copied = 0;
+
+ kfree_skb(skb);
+
+ if (scm->fp)
+ break;
+- }
+- else
+- {
++ } else {
+ /* It is questionable, see note in unix_dgram_recvmsg.
+ */
+ if (UNIXCB(skb).fp)
+ scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+
++out_put_back:
+ /* put message back and return */
+ skb_queue_head(&sk->receive_queue, skb);
+ break;
+@@ -1676,10 +1698,12 @@
+ break;
+ }
+
++ down(&sk->protinfo.af_unix.readsem);
+ spin_lock(&sk->receive_queue.lock);
+ if((skb=skb_peek(&sk->receive_queue))!=NULL)
+- amount=skb->len;
++ amount=skb->len - sk->protinfo.af_unix.copied;
+ spin_unlock(&sk->receive_queue.lock);
++ up(&sk->protinfo.af_unix.readsem);
+ err = put_user(amount, (int *)arg);
+ break;
+ }
+@@ -1734,7 +1758,7 @@
+ int i;
+ unix_socket *s;
+
+- len+= sprintf(buffer,"Num RefCount Protocol Flags Type St "
++ len+= sprintf(buffer,"Peer RcvQueue WMem Flags Type St "
+ "Inode Path\n");
+
+ read_lock(&unix_table_lock);
+@@ -1742,10 +1766,10 @@
+ {
+ unix_state_rlock(s);
+
+- len+=sprintf(buffer+len,"%p: %08X %08X %08X %04X %02X %5ld",
+- s,
+- atomic_read(&s->refcnt),
+- 0,
++ len+=sprintf(buffer+len,"%08lX: %08X %08X %08X %04X %02X %5ld",
++ unix_peer(s) ? sock_i_ino(unix_peer(s)) : 0,
++ skb_queue_len(&s->receive_queue),
++ atomic_read(&s->wmem_alloc),
+ s->state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
+ s->type,
+ s->socket ?
+diff -ur ../vger3-011229/linux/net/unix/sysctl_net_unix.c linux/net/unix/sysctl_net_unix.c
+--- ../vger3-011229/linux/net/unix/sysctl_net_unix.c Tue Jan 30 21:20:16 2001
++++ linux/net/unix/sysctl_net_unix.c Sat Jan 5 04:10:58 2002
+@@ -13,10 +13,14 @@
+ #include <linux/sysctl.h>
+
+ extern int sysctl_unix_max_dgram_qlen;
++extern int sysctl_unix_stream_pages;
+
+ ctl_table unix_table[] = {
+ {NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen",
+ &sysctl_unix_max_dgram_qlen, sizeof(int), 0600, NULL,
++ &proc_dointvec },
++ {NET_UNIX_STREAM_PAGES, "stream_pages",
++ &sysctl_unix_stream_pages, sizeof(int), 0600, NULL,
+ &proc_dointvec },
+ {0}
+ };
diff --git a/Patches/pidentd-3.0.12.dif b/Patches/pidentd-3.0.12.dif
index e69de29b..6e54e936 100644
--- a/Patches/pidentd-3.0.12.dif
+++ b/Patches/pidentd-3.0.12.dif
@@ -0,0 +1,270 @@
+diff -ur ../pidentd-3.0.12-orig/src/k_linux.c ./src/k_linux.c
+--- ../pidentd-3.0.12-orig/src/k_linux.c Sat Jan 12 00:44:05 2002
++++ ./src/k_linux.c Sat Nov 3 07:51:28 2001
+@@ -26,12 +26,65 @@
+
+ #include "pidentd.h"
+
++#define NETLINK_TCPDIAG 4
++#define TCPDIAG_GETSOCK 18
++
++#include <linux/uio.h>
++#include <linux/netlink.h>
++
++/* Socket identity */
++struct tcpdiag_sockid
++{
++ __u16 tcpdiag_sport;
++ __u16 tcpdiag_dport;
++ __u32 tcpdiag_src[4];
++ __u32 tcpdiag_dst[4];
++ __u32 tcpdiag_if;
++ __u32 tcpdiag_cookie[2];
++#define TCPDIAG_NOCOOKIE (~0U)
++};
++
++/* Request structure */
++
++struct tcpdiagreq
++{
++ __u8 tcpdiag_family; /* Family of addresses. */
++ __u8 tcpdiag_src_len;
++ __u8 tcpdiag_dst_len;
++ __u8 tcpdiag_ext; /* Query extended information */
++
++ struct tcpdiag_sockid id;
++
++ __u32 tcpdiag_states; /* States to dump */
++ __u32 tcpdiag_dbs; /* Tables to dump (NI) */
++};
++
++struct tcpdiagmsg
++{
++ __u8 tcpdiag_family;
++ __u8 tcpdiag_state;
++ __u8 tcpdiag_timer;
++ __u8 tcpdiag_retrans;
++
++ struct tcpdiag_sockid id;
++
++ __u32 tcpdiag_expires;
++ __u32 tcpdiag_rqueue;
++ __u32 tcpdiag_wqueue;
++ __u32 tcpdiag_uid;
++ __u32 tcpdiag_inode;
++};
++
++
++int tcpdiag_fd = -1;
++
+ /*
+ ** Make sure we are running on a supported OS version
+ */
+ int
+ ka_init(void)
+ {
++ tcpdiag_fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_TCPDIAG);
+ return 0; /* We always succeed */
+ }
+
+@@ -56,6 +109,144 @@
+ }
+
+
++
++int k_lookup_tcpdiag(struct kernel *kp)
++{
++ struct sockaddr_nl nladdr;
++ struct {
++ struct nlmsghdr nlh;
++ struct tcpdiagreq r;
++ } req;
++ struct msghdr msg;
++ char buf[8192];
++ struct iovec iov[1];
++ struct tcpdiagmsg *r;
++ static unsigned seqno = 123456;
++
++ memset(&nladdr, 0, sizeof(nladdr));
++ nladdr.nl_family = AF_NETLINK;
++
++ req.nlh.nlmsg_len = sizeof(req);
++ req.nlh.nlmsg_type = TCPDIAG_GETSOCK;
++ req.nlh.nlmsg_flags = NLM_F_REQUEST;
++ req.nlh.nlmsg_pid = 0;
++ req.nlh.nlmsg_seq = ++seqno;
++ memset(&req.r, 0, sizeof(req.r));
++ req.r.tcpdiag_family = AF_INET;
++ req.r.tcpdiag_states = ~0;
++
++ req.r.id.tcpdiag_dport = kp->remote.sin_port;
++ req.r.id.tcpdiag_sport = kp->local.sin_port;
++ req.r.id.tcpdiag_dst[0] = kp->remote.sin_addr.s_addr;
++ req.r.id.tcpdiag_src[0] = kp->local.sin_addr.s_addr;
++ req.r.id.tcpdiag_cookie[0] = TCPDIAG_NOCOOKIE;
++ req.r.id.tcpdiag_cookie[1] = TCPDIAG_NOCOOKIE;
++ kp->ruid = NO_UID;
++
++ iov[0] = (struct iovec){ &req, sizeof(req) };
++
++ msg = (struct msghdr) {
++ (void*)&nladdr, sizeof(nladdr),
++ iov, 1,
++ NULL, 0,
++ 0
++ };
++
++ if (sendmsg(tcpdiag_fd, &msg, 0) < 0) {
++ if (errno == ECONNREFUSED) {
++ close(tcpdiag_fd);
++ tcpdiag_fd = -1;
++ return 0;
++ }
++ syslog(LOG_ERR, "system error on tcpdiag sendmsg: %m");
++ return -1;
++ }
++
++ iov[0] = (struct iovec){ buf, sizeof(buf) };
++
++ while (1) {
++ int status;
++ struct nlmsghdr *h;
++
++ msg = (struct msghdr) {
++ (void*)&nladdr, sizeof(nladdr),
++ iov, 1,
++ NULL, 0,
++ 0
++ };
++
++ status = recvmsg(tcpdiag_fd, &msg, 0);
++
++ if (status < 0) {
++ if (errno == EINTR || errno == EAGAIN)
++ continue;
++ return -1;
++ }
++ if (status == 0) {
++ return -1;
++ }
++
++ h = (struct nlmsghdr*)buf;
++ while (NLMSG_OK(h, status)) {
++ int err;
++
++ if (/*h->nlmsg_pid != rth->local.nl_pid ||*/
++ h->nlmsg_seq != seqno)
++ goto skip_it;
++
++ if (h->nlmsg_type == NLMSG_DONE)
++ return -1;
++ if (h->nlmsg_type == NLMSG_ERROR) {
++ struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h);
++ if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
++ return -1;
++ } else {
++ errno = -err->error;
++ if (errno == ECONNREFUSED) {
++ close(tcpdiag_fd);
++ tcpdiag_fd = -1;
++ return 0;
++ }
++ if (errno != ENOENT)
++ syslog(LOG_ERR, "tcpdiag answers: %m");
++ }
++ return -1;
++ }
++
++ r = NLMSG_DATA(h);
++
++ /* Lookup _may_ return listening socket, if no
++ * better matches are found. */
++ if (r->id.tcpdiag_dport == kp->remote.sin_port &&
++ r->id.tcpdiag_dst[0] == kp->remote.sin_addr.s_addr) {
++ kp->ruid = r->tcpdiag_uid;
++ if (!r->tcpdiag_inode && !r->tcpdiag_uid) {
++ /* _NEVER_ return "root" for closed
++ * sockets. Otherwise people think
++ * that it is sysadmin who abuses their
++ * poor ircd. :-) */
++ syslog(LOG_NOTICE,
++ "Req for stale socket(%d) %d from %x/%d",
++ r->tcpdiag_state, ntohs(r->id.tcpdiag_sport),
++ r->id.tcpdiag_dst[0], ntohs(r->id.tcpdiag_dport));
++ return -1;
++ }
++ return 1;
++ }
++
++ return -1;
++
++skip_it:
++ h = NLMSG_NEXT(h, status);
++ }
++ if ((msg.msg_flags & MSG_TRUNC) || status) {
++ syslog(LOG_ERR, "truncated tcp_diag message");
++ return -1;
++ }
++ }
++}
++
++
+ int
+ ka_lookup(void *vp, struct kernel *kp)
+ {
+@@ -64,16 +255,23 @@
+ long r_laddr, r_raddr, myladdr, myraddr;
+ int r_lport, r_rport, mylport, myrport;
+ int euid;
+-
+-
++
++ if (tcpdiag_fd >= 0) {
++ int res;
++ if ((res = k_lookup_tcpdiag(kp)) != 0)
++ return res;
++ syslog(LOG_ERR, "tcp_diag is not loaded, fallback to proc");
++ }
++
++
+ r_rport = ntohs(kp->remote.sin_port);
+ r_lport = ntohs(kp->local.sin_port);
+ r_raddr = kp->remote.sin_addr.s_addr;
+ r_laddr = kp->local.sin_addr.s_addr;
++ kp->ruid = NO_UID;
+
+ fp = (FILE *) vp;
+
+- kp->ruid = NO_UID;
+ rewind(fp);
+
+ /* eat header */
+@@ -82,13 +280,26 @@
+
+ while (fgets(buf, sizeof(buf)-1, fp) != NULL)
+ {
+- if (sscanf(buf, "%*d: %lx:%x %lx:%x %*x %*x:%*x %*x:%*x %*x %d %*d %*d",
+- &myladdr, &mylport, &myraddr, &myrport, &euid) == 5)
++ int state, ino;
++ if (sscanf(buf, "%*d: %x:%x %x:%x %x %*x:%*x %*x:%*x %*x %d %*d %u",
++ &myladdr, &mylport, &myraddr, &myrport,
++ &state, &euid, &ino) == 7)
+ {
+ if (myladdr == r_laddr && mylport == r_lport &&
+ myraddr == r_raddr && myrport == r_rport)
+ {
+ kp->euid = euid;
++ if (ino == 0 && euid == 0)
++ {
++ /* _NEVER_ return "root" for closed
++ * sockets. Otherwise people think
++ * that it is sysadmin who abuses their
++ * poor ircd. :-) */
++ syslog(LOG_NOTICE,
++ "Req for stale socket(%d) %d from %x/%d",
++ state, r_rport, r_raddr, r_lport);
++ return -1;
++ }
+ return 1;
+ }
+ }
diff --git a/Patches/rt_cache_stat.dif b/Patches/rt_cache_stat.dif
index e69de29b..a03ddf22 100644
--- a/Patches/rt_cache_stat.dif
+++ b/Patches/rt_cache_stat.dif
@@ -0,0 +1,230 @@
+--- linux/include/net/route.h.orig Tue Apr 17 07:25:48 2001
++++ linux/include/net/route.h Tue Jul 10 23:35:18 2001
+@@ -14,6 +14,7 @@
+ * Alan Cox : Support for TCP parameters.
+ * Alexey Kuznetsov: Major changes for new routing code.
+ * Mike McLagan : Routing by source
++ * Robert Olsson : Added rt_cache statistics
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+@@ -90,6 +91,20 @@
+ __u32 o_packets;
+ __u32 i_bytes;
+ __u32 i_packets;
++};
++
++struct rt_cache_stat
++{
++ unsigned in_hit;
++ unsigned in_slow_tot;
++ unsigned in_slow_mc;
++ unsigned in_no_route;
++ unsigned in_brd;
++ unsigned in_martian_dst;
++ unsigned in_martian_src;
++ unsigned out_hit;
++ unsigned out_slow_tot;
++ unsigned out_slow_mc;
+ };
+
+ extern struct ip_rt_acct *ip_rt_acct;
+--- linux/net/ipv4/route.c.orig Wed Mar 28 22:01:15 2001
++++ linux/net/ipv4/route.c Tue Jul 10 23:27:51 2001
+@@ -52,6 +52,7 @@
+ * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
+ * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
+ * Marc Boucher : routing by fwmark
++ * Robert Olsson : Added rt_cache statistics
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+@@ -201,6 +202,8 @@
+ static unsigned rt_hash_mask;
+ static int rt_hash_log;
+
++struct rt_cache_stat rt_cache_stat[NR_CPUS];
++
+ static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
+
+ static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
+@@ -270,6 +273,44 @@
+ len = length;
+ return len;
+ }
++
++
++#ifdef CONFIG_PROC_FS
++static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
++{
++ int i, lcpu;
++ int len=0;
++ unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
++
++ for (lcpu=0; lcpu<smp_num_cpus; lcpu++) {
++ i = cpu_logical_map(lcpu);
++
++ len += sprintf(buffer+len, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
++ dst_entries,
++ rt_cache_stat[i].in_hit,
++ rt_cache_stat[i].in_slow_tot,
++ rt_cache_stat[i].in_slow_mc,
++ rt_cache_stat[i].in_no_route,
++ rt_cache_stat[i].in_brd,
++ rt_cache_stat[i].in_martian_dst,
++ rt_cache_stat[i].in_martian_src,
++
++ rt_cache_stat[i].out_hit,
++ rt_cache_stat[i].out_slow_tot,
++ rt_cache_stat[i].out_slow_mc
++ );
++ }
++ len -= offset;
++
++ if (len > length)
++ len = length;
++ if (len < 0)
++ len = 0;
++
++ *start = buffer + offset;
++ return len;
++}
++#endif
+
+ static __inline__ void rt_free(struct rtable *rt)
+ {
+@@ -1163,6 +1204,8 @@
+ u32 spec_dst;
+ struct in_device *in_dev = in_dev_get(dev);
+ u32 itag = 0;
++ int cpu = smp_processor_id();
++
+
+ /* Primary sanity checks. */
+
+@@ -1221,6 +1264,7 @@
+ if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+ rth->u.dst.input = ip_mr_input;
+ #endif
++ rt_cache_stat[cpu].in_slow_mc++;
+
+ in_dev_put(in_dev);
+ hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
+@@ -1259,6 +1303,7 @@
+ u32 spec_dst;
+ int err = -EINVAL;
+ int free_res = 0;
++ int cpu = smp_processor_id();
+
+ /*
+ * IP on this device is disabled.
+@@ -1308,6 +1353,8 @@
+ }
+ free_res = 1;
+
++ rt_cache_stat[cpu].in_slow_tot++;
++
+ #ifdef CONFIG_IP_ROUTE_NAT
+ /* Policy is applied before mapping destination,
+ but rerouting after map should be made with old source.
+@@ -1455,6 +1502,7 @@
+ }
+ flags |= RTCF_BROADCAST;
+ res.type = RTN_BROADCAST;
++ rt_cache_stat[cpu].in_brd++;
+
+ local_input:
+ rth = dst_alloc(&ipv4_dst_ops);
+@@ -1498,6 +1546,7 @@
+ goto intern;
+
+ no_route:
++ rt_cache_stat[cpu].in_no_route++;
+ spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ res.type = RTN_UNREACHABLE;
+ goto local_input;
+@@ -1506,6 +1555,7 @@
+ * Do not cache martian addresses: they should be logged (RFC1812)
+ */
+ martian_destination:
++ rt_cache_stat[cpu].in_martian_dst++;
+ #ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n",
+@@ -1520,6 +1570,8 @@
+ goto done;
+
+ martian_source:
++
++ rt_cache_stat[cpu].in_martian_src++;
+ #ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+ /*
+@@ -1550,6 +1602,7 @@
+ struct rtable * rth;
+ unsigned hash;
+ int iif = dev->ifindex;
++ int cpu = smp_processor_id();
+
+ tos &= IPTOS_RT_MASK;
+ hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
+@@ -1567,6 +1620,7 @@
+ rth->u.dst.lastuse = jiffies;
+ dst_hold(&rth->u.dst);
+ rth->u.dst.__use++;
++ rt_cache_stat[cpu].in_hit++;
+ read_unlock(&rt_hash_table[hash].lock);
+ skb->dst = (struct dst_entry*)rth;
+ return 0;
+@@ -1621,6 +1675,7 @@
+ int free_res = 0;
+ int err;
+ u32 tos;
++ int cpu = smp_processor_id();
+
+ tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK);
+ key.dst = oldkey->dst;
+@@ -1847,14 +1902,18 @@
+
+ rth->u.dst.output=ip_output;
+
++ rt_cache_stat[cpu].out_slow_tot++;
++
+ if (flags&RTCF_LOCAL) {
+ rth->u.dst.input = ip_local_deliver;
+ rth->rt_spec_dst = key.dst;
+ }
+ if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
+ rth->rt_spec_dst = key.src;
+- if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
++ if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) {
+ rth->u.dst.output = ip_mc_output;
++ rt_cache_stat[cpu].out_slow_mc++;
++ }
+ #ifdef CONFIG_IP_MROUTE
+ if (res.type == RTN_MULTICAST) {
+ struct in_device *in_dev = in_dev_get(dev_out);
+@@ -1894,6 +1953,7 @@
+ {
+ unsigned hash;
+ struct rtable *rth;
++ int cpu = smp_processor_id();
+
+ hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos);
+
+@@ -1912,6 +1972,7 @@
+ rth->u.dst.lastuse = jiffies;
+ dst_hold(&rth->u.dst);
+ rth->u.dst.__use++;
++ rt_cache_stat[cpu].out_hit++;
+ read_unlock_bh(&rt_hash_table[hash].lock);
+ *rp = rth;
+ return 0;
+@@ -2339,6 +2400,7 @@
+ add_timer(&rt_periodic_timer);
+
+ proc_net_create ("rt_cache", 0, rt_cache_get_info);
++ proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info);
+ #ifdef CONFIG_NET_CLS_ROUTE
+ create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
+ #endif
diff --git a/Patches/symbol_exports.dif b/Patches/symbol_exports.dif
index e69de29b..519ea7cd 100644
--- a/Patches/symbol_exports.dif
+++ b/Patches/symbol_exports.dif
@@ -0,0 +1,56 @@
+diff -ur ../vger3-010830/linux/net/ipv6/tcp_ipv6.c linux/net/ipv6/tcp_ipv6.c
+--- ../vger3-010830/linux/net/ipv6/tcp_ipv6.c Wed Jun 13 21:14:05 2001
++++ linux/net/ipv6/tcp_ipv6.c Fri Oct 12 06:59:07 2001
+@@ -339,13 +339,18 @@
+ return tcp_v6_lookup_listener(daddr, hnum, dif);
+ }
+
+-#define tcp_v6_lookup(sa, sp, da, dp, dif) \
+-({ struct sock *___sk; \
+- local_bh_disable(); \
+- ___sk = __tcp_v6_lookup((sa),(sp),(da),ntohs(dp),(dif)); \
+- local_bh_enable(); \
+- ___sk; \
+-})
++__inline__ struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
++ struct in6_addr *daddr, u16 dport,
++ int dif)
++{
++ struct sock *sk;
++
++ local_bh_disable();
++ sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif);
++ local_bh_enable();
++
++ return sk;
++}
+
+
+ /*
+diff -ur ../vger3-010830/linux/net/netsyms.c linux/net/netsyms.c
+--- ../vger3-010830/linux/net/netsyms.c Sun Aug 19 22:01:45 2001
++++ linux/net/netsyms.c Fri Oct 12 07:59:17 2001
+@@ -72,6 +72,11 @@
+
+ extern int netdev_finish_unregister(struct net_device *dev);
+
++extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
++ struct in6_addr *daddr, u16 dport,
++ int dif);
++extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
++
+ #include <linux/rtnetlink.h>
+
+ #ifdef CONFIG_IPX_MODULE
+@@ -284,7 +289,11 @@
+ EXPORT_SYMBOL(ndisc_mc_map);
+ EXPORT_SYMBOL(register_inet6addr_notifier);
+ EXPORT_SYMBOL(unregister_inet6addr_notifier);
++EXPORT_SYMBOL(tcp_v6_lookup);
+ #endif
++EXPORT_SYMBOL(tcp_v4_lookup);
++EXPORT_SYMBOL(tcp_timewait_cachep);
++EXPORT_SYMBOL(tcp_hashinfo);
+ #if defined (CONFIG_IPV6_MODULE) || defined (CONFIG_KHTTPD) || defined (CONFIG_KHTTPD_MODULE)
+ /* inet functions common to v4 and v6 */
+ EXPORT_SYMBOL(inet_release);
diff --git a/README b/README
index e69de29b..53a45c2e 100644
--- a/README
+++ b/README
@@ -0,0 +1,65 @@
+Primary FTP site is:
+
+ ftp://ftp.inr.ac.ru/ip-routing/
+
+Mirrors are:
+
+ ftp://linux.wauug.org/pub/net
+ ftp://ftp.nc.ras.ru/pub/mirrors/ftp.inr.ac.ru/ip-routing/
+ ftp://ftp.gts.cz/MIRRORS/ftp.inr.ac.ru/
+ ftp://ftp.funet.fi/pub/mirrors/ftp.inr.ac.ru/ip-routing/ (STM1 to USA)
+ ftp://sunsite.icm.edu.pl/pub/Linux/iproute/
+ ftp://ftp.sunet.se/pub/Linux/ip-routing/
+ ftp://ftp.nvg.ntnu.no/pub/linux/ip-routing/
+ ftp://ftp.crc.ca/pub/systems/linux/ip-routing/
+ ftp://ftp.proxad.net/mirrors/ftp.inr.ac.ru/ip-routing/
+ ftp://donlug.dn.ua/pub/mirrors/ip-routing/
+ ftp://omni.rk.tusur.ru/mirrors/ftp.inr.ac.ru/ip-routing/
+ ftp://ftp.src.uchicago.edu/pub/linux/ip-routing/
+ http://www.asit.ro/ip-routing/
+ ftp://ftp.infoscience.co.jp/pub/linux/ip-routing/ (Japan)
+ ftp://ftp.sucs.swan.ac.uk/pub/mirrors/ftp.inr.ac.ru/ip-routing
+ http://mirror.schell.de/ftp.inr.ac.ru/ip-routing/ (Germany)
+ ftp://ftp.gin.cz/MIRRORS/ftp.inr.ac.ru/ip-routing
+ ftp://mirror.aarnet.edu.au/pub/ip-routing/ (Australia)
+ http://mirror.aarnet.edu.au/pub/ip-routing/ (Australia)
+
+RPMs are available at:
+ ftp://omni.rk.tusur.ru/Tango/
+ ftp://ftp4.dgtu.donetsk.ua/pub/BlackCat/6.0/contrib/SRPMS/i[35]86/
+
+
+
+How to compile this.
+--------------------
+
+
+1. Look at start of Makefile and set correct values for:
+
+KERNEL_INCLUDE should point to correct linux kernel include directory.
+Default (/usr/src/linux/include) is right as a rule.
+
+ADDLIB should contain inet_* functions, if your libc contains
+obsolete resolver library (<4.9.4) and you have no correct libresolv.
+ADDLIB should also contain dnet_* functions if you don't have a
+libdnet with support for them. If your libdnet does have support,
+then comment out that line and uncomment the line to add -ldnet to
+LDLIBS.
+
+LDLIBS should be empty, if you have no libresolv.
+
+
+2. make
+
+Utilities "ip" and "rtmon" are in ip/ directory now,
+"tc" is in tc/. That's all.
+
+3. To make documentation, cd to doc/ directory , then
+ look at start of Makefile and set correct values for
+ PAGESIZE=a4 , ie: a4 , letter ... (string)
+ PAGESPERPAGE=2 , ie: 1 , 2 ... (numeric)
+ and make there. It assumes, that latex, dvips and psnup
+ are in your path.
+
+Alexey Kuznetsov
+kuznet@ms2.inr.ac.ru
diff --git a/README.decnet b/README.decnet
index e69de29b..4d7453aa 100644
--- a/README.decnet
+++ b/README.decnet
@@ -0,0 +1,41 @@
+
+Here are a few quick points about DECnet support...
+
+ o No name resolution is available as yet, all addresses must be
+ entered numerically.
+
+ o The neighbour cache may well list every entry as having the address
+ 0.170. This is due to a problem that I need to sort out kernel side.
+ It is harmless (but don't try and use neigh add yet) just look in
+ /proc/net/decnet_neigh to see the real addresses for now.
+
+ o The rtnetlink support in the kernel is rather experimental, expect a
+ few odd things to happen for the next few DECnet kernel releases.
+
+ o Whilst you can use ip addr add to add more than one DECnet address to an
+ interface, don't expect addresses which are not the same as the
+ kernel's node address to work properly. i.e. You will break the DECnet
+ protocol if you do add anything other than the automatically generated
+ interface addresses to ethernet cards. This option is there for future
+ link layer support, where the device will have to be configured for
+ DECnet explicitly.
+
+ o The DECnet support is currently self contained. You do not need the
+ libdnet library to use it. In fact until I've sent the dnet_pton and
+ dnet_ntop functions to Patrick to add, you can't use libdnet.
+
+ o If you are not using the very latest 2.3.xx series kernels, don't
+ try and list DECnet routes if you've got IPv6 compiled into the
+ kernel. It will oops.
+
+ o My main reason for writing the DECnet support for iproute2 was to
+ check out the DECnet routing code, so the route get and
+ route show cache commands are likely to be the most debugged out of
+ all of them.
+
+ o If you find bugs in the DECnet support, please send them to me in the
+ first instance, and then I'll send Alexey a patch to fix it. IPv4/6
+ bugs should be sent to Alexey as before.
+
+Steve Whitehouse <SteveW@ACM.org>
+
diff --git a/README.iproute2+tc b/README.iproute2+tc
index e69de29b..edd79c0e 100644
--- a/README.iproute2+tc
+++ b/README.iproute2+tc
@@ -0,0 +1,119 @@
+iproute2+tc*
+
+It's the first release of Linux traffic control engine.
+
+
+NOTES.
+* csz scheduler is inoperational at the moment, and probably
+ never will be repaired but replaced with h-pfq scheduler.
+* To use "fw" classifier you will need ipfwchains patch.
+* No manual available. Ask me, if you have problems (only try to guess
+ answer yourself at first 8)).
+
+
+Micro-manual how to start it the first time
+-------------------------------------------
+
+A. Attach CBQ to eth1:
+
+tc qdisc add dev eth1 root handle 1: cbq bandwidth 10Mbit allot 1514 cell 8 \
+avpkt 1000 mpu 64
+
+B. Add root class:
+
+tc class add dev eth1 parent 1:0 classid 1:1 cbq bandwidth 10Mbit rate 10Mbit \
+allot 1514 cell 8 weight 1Mbit prio 8 maxburst 20 avpkt 1000
+
+C. Add default interactive class:
+
+tc class add dev eth1 parent 1:1 classid 1:2 cbq bandwidth 10Mbit rate 1Mbit \
+allot 1514 cell 8 weight 100Kbit prio 3 maxburst 20 avpkt 1000 split 1:0 \
+defmap c0
+
+D. Add default class:
+
+tc class add dev eth1 parent 1:1 classid 1:3 cbq bandwidth 10Mbit rate 8Mbit \
+allot 1514 cell 8 weight 800Kbit prio 7 maxburst 20 avpkt 1000 split 1:0 \
+defmap 3f
+
+etc. etc. etc. Well, it is enough to start 8) The rest can be guessed 8)
+Look also at more elaborated example, ready to start rsvpd,
+in rsvp/cbqinit.eth1.
+
+
+Terminology and advice about setting CBQ parameters may be found in Sally Floyd's
+papers.
+
+
+Pairs X:Y are class handles, X:0 are qdisc handles.
+weight should be proportional to rate for leaf classes
+(I chose it to be ten times less, but that is not necessary)
+
+defmap is bitmap of logical priorities served by this class.
+
+E. Other qdiscs are simpler. E.g. let's attach TBF to class 1:2
+
+tc qdisc add dev eth1 parent 1:2 tbf rate 64Kbit buffer 5Kb/8 limit 10Kb
+
+F. Look at all that we created:
+
+tc qdisc ls dev eth1
+tc class ls dev eth1
+
+G. Install "route" classifier on root of cbq and map destination from realm
+1 to class 1:2
+
+tc filter add dev eth1 parent 1:0 protocol ip prio 100 route to 1 classid 1:2
+
+H. Assign routes to 10.11.12.0/24 to realm 1
+
+ip route add 10.11.12.0/24 dev eth1 via whatever realm 1
+
+etc. The same thing can be made with rules.
+I still did not test ipchains, but they should work too.
+
+Setup of rsvp and u32 classifiers is more hairy.
+If you read RSVP specs, you will understand how rsvp classifier
+works easily. As for u32... here is an example:
+
+
+
+#! /bin/sh
+
+TC=/home/root/tc
+
+# Setup classifier root on eth1 root (it is cbq)
+$TC filter add dev eth1 parent 1:0 prio 5 protocol ip u32
+
+# Create hash table of 256 slots with ID 1:
+$TC filter add dev eth1 parent 1:0 prio 5 handle 1: u32 divisor 256
+
+# Add to 6th slot of hash table rule to select tcp/telnet to 193.233.7.75
+# direct it to class 1:4 and prescribe to fall to best effort,
+# if traffic violates TBF (32kbit,5K)
+$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:6: \
+ match ip dst 193.233.7.75 \
+ match tcp dst 0x17 0xffff \
+ flowid 1:4 \
+ police rate 32kbit buffer 5kb/8 mpu 64 mtu 1514 index 1
+
+# Add to 1st slot of hash table rule to select icmp to 193.233.7.75
+# direct it to class 1:4 and prescribe to fall to best effort,
+# if traffic violates TBF (10kbit,5K)
+$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:: \
+ sample ip protocol 1 0xff \
+ match ip dst 193.233.7.75 \
+ flowid 1:4 \
+ police rate 10kbit buffer 5kb/8 mpu 64 mtu 1514 index 2
+
+# Lookup hash table, if it is not fragmented frame
+# Use protocol as hash key
+$TC filter add dev eth1 parent 1:0 prio 5 handle ::1 u32 ht 800:: \
+ match ip nofrag \
+ offset mask 0x0F00 shift 6 \
+ hashkey mask 0x00ff0000 at 8 \
+ link 1:
+
+
+Alexey Kuznetsov
+kuznet@ms2.inr.ac.ru
diff --git a/RELNOTES b/RELNOTES
index e69de29b..17f00111 100644
--- a/RELNOTES
+++ b/RELNOTES
@@ -0,0 +1,168 @@
+[020116]
+! 1. Compile with rh-7.2
+! 2. What the hell some people blame on socklen_t defined in unistd.h? Check.
+ * Kim Woelders <kim@woelders.dk>, various useful fixups: compilation
+ with old kernels, cross-compiling, "all" == "any" in prefix spec.
+ * Collected from my disk, cleaned and packed to directory iproute2/misc/
+ several utilities: ss, nstat, ifstat, rtacct, arpd and module tcp_diag.
+ Writing some docs. me.
+ * prepared patchlet for pidentd to use tcp_diag.
+ * David Miller: 64bit (and even worse 64bit kernel/32 bit user :-) fixes
+ to above. tcp_diag is merged to main tree.
+ * Alexandr D. Kanevskiy <kad@blackcatlinux.com>: various flaws in ss
+ * Alexandr D. Kanevskiy <kad@blackcatlinux.com>: oops, more aggressive caching
+ of names opened old bugs: ip started to print garbage in some places.
+ * Robert Olsson, rt_cache_stat. Renamed to rtstat.
+ * An old bug in "ip maddr ls": redundant empty lines in output.
+ Seeing this crap for ages but lucky match of desire/ability to repair
+ and a huff about this happened only today. :-)
+ * "Mr. James W. Laferriere" <babydr@baby-dragons.com>
+ doc: option to produce ps output for non-a4 and not only 2 pages/sheet.
+ * Jamal's patch for ingress qdisc.
+ * Bernd Eckenfels <ecki@lina.inka.de>: deleted orphaned bogus #include
+ in include/utils.h.
+ * Julian Anastasov <ja@ssi.bg>: uninitialized fields in nexthop
+ producing funny "dead" nexthops in multipath routes.
+ Stupid me, look at the first line in [010803]... Was it difficult to guess
+ this that time? People blame for several months. :-)
+ Special thanks to bert hubert <ahu@ds9a.nl> who raised the issue in netdev.
+ Thanks and apologies to Terry Schmidt <terry@nycwireless.net>,
+ Ruben Puettmann <ruben.puettmann@freenet-ag.de>,
+ Mark Ivens <mivens@clara.net>.
+ * willy tarreau <wtarreau@yahoo.fr>: "make install" target.
+ * Tunable limit for sch_sfq. Patch to kernel activating this
+ is about to be submitted. Reminded by Adi Nugroho <Adi@iNterNUX.co.id>.
+
+[010824]
+ * ip address add sets scope of loopback addresses to "host".
+ Advised by David Miller.
+ * ZIP! <zip@killerlabs.com> and David Ford <david@blue-labs.org>
+ Some strcpy's changed to strncpy's.
+ * David Ford <david@blue-labs.org>, test for compilation with gcc3.
+ * David Ford <david@blue-labs.org>. Damn, I broke rtnl_talk in previous
+ snapshot.
+
+[010803]
+ * If "dev" is not specified in multipath route, ifindex remained
+ uninitialized. Grr. Thanks to Kunihiro Ishiguro <kunihiro@zebra.org>.
+ * Rafal Maszkowski <rzm@icm.edu.pl>, batch mode tc. The most old patch.
+ * Updates list of data protocol ids.
+ Lots of reporters. I bring my apologies.
+ * Jan Rekorajski <baggins@sith.mimuw.edu.pl>. Updated list of datalink types.
+ * Christina Chen <chenchristina@cwc.nus.edu.sg>. Bug in parsing IPv6 address match in u32.
+ * Pekka Savola <pekkas@netcore.fi>. ip -6 route flush dev lo stuck
+ on deleting root of the table.
+ * Werner. dsmark fixes.
+ * Alexander Demenshin <aldem-reply@aldem.net>. Old miraculous bug
+ in ip monitor. It was puzzle, people permanently blame that
+ it prints some crap.
+ * Rui Prior <rprior@inescporto.pt>. f_route failed to resolve fromif.
+ Werner also noticed this and sent patch. Bad place... [RETHINK]
+ * Kim Woelders <kim@woelders.dk>.
+ - changes in Makefile for cross-compile
+ - understand "all" as alias for "any"
+ - bug in iprule.c
+! [ NB. Also he sent patch for kernel. Do not forget! ]
+ * Werner. Fix to tc core files: wrong exits etc.
+ * Bernd Jendrissek <berndj@prism.co.za>. Some sanitizations of tc.c
+!* Marian Jancar <marian.jancar@infonet.cz>. He says q_tbf prints wrong latency!
+! Seems, he is wrong.
+ * Werner (and Nikolai Vladychevski <niko@isl.net.mx>) check ->print_copts
+ to avoid segfault.
+
+[001007]
+ * Compiles under rh-7.0
+
+[000928]
+ * Sorry. I have lost all the CVS with changes made since 000305.
+ If someone sent me a patch after this date, please, resubmit.
+ Restored from the last backup and mailboxes:
+
+ * Edit ip-cref.tex by raf <raf2@zip.com.au>.
+ * RTAX_REORDERING support.
+ * IFLA_MASTER support.
+ * Bug in rtnl_talk(), libnetlink.c. Reported by David P. Olshfski
+ <olshef@us.ibm.com>
+
+[000305]
+ * Bugs in RESOLVE_HOSTNAMES. Bratislav Ilich <bilik@@zepter.ru>
+ * ARPHRD_IEEE802_TR
+
+[000225]
+ * ECN in q_red.c.
+
+[000221]
+ * diffserv update from Jamal Hadi Salim
+ * Some bits of IPX from Steve Whitehouse.
+ * ATM qdisc from Werner Almesberger
+ * Support for new attributes on routes in linux-2.3.
+
+[991023]
+ No news, only several bugs are fixed.
+ * Since ss990630 "ip rule list" printed wrong prefix length.
+ Vladimir V. Ivanov <vlad@alis.tusur.ru>
+ * "ip rule" parsed >INT_MAX values of metric incorrectly.
+ Matthew G. Marsh <mgm@paktronix.com>
+ * Some improvements in doc/Makefile advised by
+ Andi Kleen and Werner Almesberger.
+
+[990824]
+ * new attributes in "ip route": rtt, rttvar, cwnd, ssthresh and advmss.
+ * some updates in documentation to reflect new status.
+
+[990630]
+ * DiffServ support.
+ Werner Almesberger <almesber@lrc.di.epfl.ch>
+ Jamal Hadi Salim <hadi@nortelnetworks.com>
+ * DECnet support.
+ Steve Whitehouse <SteveW@ACM.org>
+ * Some minor tweaks in docs and code.
+
+[990530]
+ * routel script. Stephen R. van den Berg <srb@cuci.nl>
+ * Bug in tc/q_prio.c resetting priomap. Reported by
+ Ole Husgaard <sparre@login.dknet.dk> and
+ Jan Kasprzak <kas@informatics.muni.cz>
+ * IP command reference manual is published (ip-cref.tex).
+ I am sorry, but tc-cref.tex is still not ready, to be more
+ exact the draft does not describe current tc 8-)
+ * ip, rtmon, rtacct utilities are updated according to manual 8-)
+ Lots of changes:
+ - (MAIN) "flush" command for addr, neigh and route.
+ - error messages are sanitized; now it does not print
+ usage() page on each error.
+ - output format is improved.
+ - "oneline" mode is added.
+ - etc.
+ * Name databases; resolution ascii <-> numeric is split out to lib/*
+ * scripts ifcfg, ifone and rtpr.
+ * examples/dhcp-client-script is copied from my patch to ISC dhcp.
+ * Makefile in doc/ directory.
+
+[990417]
+ * "pmtudisc" flag to "ip tunnel". Phil Karn <karn@ka9q.ampr.org>
+ * bug in tc/q_tbf.c preventing setting peak_rate, Martin Mares <mj@ucw.cz>
+ * doc/flowlabels.tex
+
+[990329]
+
+ * This snapshot fixes some compatibility problems, which I introduced
+ occasionally to previous snapshots.
+ * Namely, "allot" to "tc qdisc add ... cbq" is accepted but ignored.
+ * Another changes are supposed to be shown in the next snapshot, but
+ because of troubles with "allot" I am forced to release premature
+ version. Namely, "cell", "prio", "weight" etc. are optional now.
+ * doc/ip-tunnels.tex
+
+[990327]
+ * History was not recorded.
+
+[981002]
+ * Rani Assaf <rani@magic.metawire.com> contributed resolving
+ addresses to names.
+ BEWARE! DO NOT USE THIS OPTION, WHEN REPORTING BUGS IN
+ IPROUTE OR IN KERNEL. ALL THE BUG REPORTS MUST CONTAIN
+ ONLY NUMERIC ADDRESSES.
+
+[981101]
+ * now it should compile for any libc.
diff --git a/doc/Makefile b/doc/Makefile
index e69de29b..636b3288 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -0,0 +1,57 @@
+PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps
+# tc-cref.ps
+# api-rtnl.tex api-pmtudisc.tex api-news.tex
+# iki-netdev.ps iki-neighdst.ps
+
+
+LATEX=latex
+DVIPS=dvips
+SGML2DVI=sgml2latex --output=dvi
+SGML2HTML=sgml2html -s 0
+LPR=lpr -Zsduplex
+SHELL=bash
+PAGESIZE=a4
+PAGESPERPAGE=2
+
+HTMLFILES=$(subst .sgml,.html,$(shell echo *.sgml))
+DVIFILES=$(subst .ps,.dvi,$(PSFILES))
+
+
+all: pstwocol
+
+pstwocol: $(PSFILES)
+
+html: $(HTMLFILES)
+
+dvi: $(DVIFILES)
+
+print: $(PSFILES)
+ $(LPR) $(PSFILES)
+
+%.dvi: %.sgml
+ $(SGML2DVI) $<
+
+%.dvi: %.tex
+ @set -e; pass=2; echo "Running LaTeX $<"; \
+ while [ `$(LATEX) $< </dev/null 2>&1 | \
+ grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
+ if [ $$pass -gt 3 ]; then \
+ echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
+ fi; \
+ echo "Re-running LaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \
+ done
+
+%.ps: %.dvi
+ $(DVIPS) $< -o $@.tmp
+ ./do-psnup $@.tmp $@ $(PAGESIZE) $(PAGESPERPAGE)
+ rm -f $@.tmp
+
+%.html: %.sgml
+ $(SGML2HTML) $<
+
+install:
+ install -m 0644 $(shell echo *.tex) $(DESTDIR)$(DOCDIR)
+ install -m 0644 $(shell echo *.sgml) $(DESTDIR)$(DOCDIR)
+
+clean:
+ rm -f *.aux *.log *.toc $(PSFILES) $(DVIFILES) *.html
diff --git a/doc/Plan b/doc/Plan
index e69de29b..55f478ea 100644
--- a/doc/Plan
+++ b/doc/Plan
@@ -0,0 +1,16 @@
+Partially finished work.
+
+1. User Reference manuals.
+1.1 IP Command reference (ip-cref.tex, published)
+1.2 TC Command reference (tc-cref.tex)
+1.3 IP tunnels (ip-tunnels.tex, published)
+
+2. Linux-2.2 Networking API
+2.1 RTNETLINK (api-rtnl.tex)
+2.2 Path MTU Discovery (api-pmtudisc.tex)
+2.3 IPv6 Flow Labels (api-ip6-flowlabels.tex, published)
+2.4 Miscellaneous extensions (api-misc.tex)
+
+3. Linux-2.2 Networking Intra-Kernel Interfaces
+3.1 NetDev --- Networking Devices and netdev... (iki-netdev.tex)
+3.2 Neighbour cache and destination cache. (iki-neighdst.tex)
diff --git a/doc/SNAPSHOT.tex b/doc/SNAPSHOT.tex
index e69de29b..7ed02984 100644
--- a/doc/SNAPSHOT.tex
+++ b/doc/SNAPSHOT.tex
@@ -0,0 +1 @@
+\def\Draft{020116}
diff --git a/doc/api-ip6-flowlabels.tex b/doc/api-ip6-flowlabels.tex
index e69de29b..aa34e947 100644
--- a/doc/api-ip6-flowlabels.tex
+++ b/doc/api-ip6-flowlabels.tex
@@ -0,0 +1,429 @@
+\documentstyle[12pt,twoside]{article}
+\def\TITLE{IPv6 Flow Labels}
+\input preamble
+\begin{center}
+\Large\bf IPv6 Flow Labels in Linux-2.2.
+\end{center}
+
+
+\begin{center}
+{ \large Alexey~N.~Kuznetsov } \\
+\em Institute for Nuclear Research, Moscow \\
+\verb|kuznet@ms2.inr.ac.ru| \\
+\rm April 11, 1999
+\end{center}
+
+\vspace{5mm}
+
+\tableofcontents
+
+\section{Introduction.}
+
+Every IPv6 packet carries 28 bits of flow information. RFC2460 splits
+these bits to two fields: 8 bits of traffic class (or DS field, if you
+prefer this term) and 20 bits of flow label. Currently there exists
+no well-defined API to manage IPv6 flow information. In this document
+I describe an attempt to design the API for Linux-2.2 IPv6 stack.
+
+\vskip 1mm
+
+The API must solve the following tasks:
+
+\begin{enumerate}
+
+\item To allow user to set traffic class bits.
+
+\item To allow user to read traffic class bits of received packets.
+This feature is not so useful as the first one, however it will be
+necessary f.e.\ to implement ECN [RFC2481] for datagram oriented services
+or to implement receiver side of SRP or another end-to-end protocol
+using traffic class bits.
+
+\item To assign flow labels to packets sent by user.
+
+\item To get flow labels of received packets. I do not know
+any applications of this feature, but it is possible that receiver will
+want to use flow labels to distinguish sub-flows.
+
+\item To allocate flow labels in the way, compliant to RFC2460. Namely:
+
+\begin{itemize}
+\item
+Flow labels must be uniformly distributed (pseudo-)random numbers,
+so that any subset of 20 bits can be used as hash key.
+
+\item
+Flows with coinciding source address and flow label must have identical
+destination address and not-fragmentable extensions headers (i.e.\
+hop by hop options and all the headers up to and including routing header,
+if it is present.)
+
+\begin{NB}
+There is a hole in specs: some hop-by-hop options can be
+defined only on per-packet base (f.e.\ jumbo payload option).
+Essentially, it means that such options cannot be present in packets
+with flow labels.
+\end{NB}
+\begin{NB}
+NB notes here and below reflect only my personal opinion,
+they should be read with smile or should not be read at all :-).
+\end{NB}
+
+
+\item
+Flow labels have finite lifetime and source is not allowed to reuse
+flow label for another flow until the maximal lifetime has expired,
+so that intermediate nodes will be able to invalidate flow state before
+the label is taken over by another flow.
+Flow state, including lifetime, is propagated along datagram path
+by some application specific methods
+(f.e.\ in RSVP PATH messages or in some hop-by-hop option).
+
+
+\end{itemize}
+
+\end{enumerate}
+
+\section{Sending/receiving flow information.}
+
+\paragraph{Discussion.}
+\addcontentsline{toc}{subsection}{Discussion}
+It was proposed (Where? I do not remember any explicit statement)
+to solve the first four tasks using
+\verb|sin6_flowinfo| field added to \verb|struct| \verb|sockaddr_in6|
+(see RFC2553).
+
+\begin{NB}
+ This method is difficult to consider as reasonable, because it
+ puts additional overhead to all the services, despite of only
+ very small subset of them (none, to be more exact) really use it.
+ It contradicts both to IETF spirit and the letter. Before RFC2553
+ one justification existed, IPv6 address alignment left 4 byte
+ hole in \verb|sockaddr_in6| in any case. Now it has no justification.
+\end{NB}
+
+We have two problems with this method. The first one is common for all OSes:
+if \verb|recvmsg()| initializes \verb|sin6_flowinfo| to flow info
+of received packet, we lose one very important property of BSD socket API,
+namely, we are not allowed to use received address for reply directly
+and have to mangle it, even if we are not interested in flowinfo subtleties.
+
+\begin{NB}
+ RFC2553 adds new requirement: to clear \verb|sin6_flowinfo|.
+ Certainly, it is not solution but rather attempt to force applications
+ to make unnecessary work. Well, as usually, one mistake in design
+ is followed by attempts to patch the hole and more mistakes...
+\end{NB}
+
+Another problem is Linux specific. Historically Linux IPv6 did not
+initialize \verb|sin6_flowinfo| at all, so that, if kernel does not
+support flow labels, this field is not zero, but a random number.
+Some applications also did not take care about it.
+
+\begin{NB}
+Following RFC2553 such applications can be considered as broken,
+but I still think that they are right: clearing all the address
+before filling known fields is robust but stupid solution.
+Useless wasting CPU cycles and
+memory bandwidth is not a good idea. Such patches are acceptable
+as temporary hacks, but not as standard of the future.
+\end{NB}
+
+
+\paragraph{Implementation.}
+\addcontentsline{toc}{subsection}{Implementation}
+By default Linux IPv6 does not read \verb|sin6_flowinfo| field
+assuming that common applications are not obliged to initialize it
+and are permitted to consider it as pure alignment padding.
+In order to tell kernel that application
+is aware of this field, it is necessary to set socket option
+\verb|IPV6_FLOWINFO_SEND|.
+
+\begin{verbatim}
+ int on = 1;
+ setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO_SEND,
+ (void*)&on, sizeof(on));
+\end{verbatim}
+
+Linux kernel never fills \verb|sin6_flowinfo| field, when passing
+message to user space, though the kernels which support flow labels
+initialize it to zero. If user wants to get received flowinfo, he
+will set option \verb|IPV6_FLOWINFO| and after this he will receive
+flowinfo as ancillary data object of type \verb|IPV6_FLOWINFO|
+(cf.\ RFC2292).
+
+\begin{verbatim}
+ int on = 1;
+ setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO, (void*)&on, sizeof(on));
+\end{verbatim}
+
+Flowinfo received and latched by a connected TCP socket also may be fetched
+with \verb|getsockopt()| \verb|IPV6_PKTOPTIONS| together with
+another optional information.
+
+Besides that, in the spirit of RFC2292 the option \verb|IPV6_FLOWINFO|
+may be used as alternative way to send flowinfo with \verb|sendmsg()| or
+to latch it with \verb|IPV6_PKTOPTIONS|.
+
+\paragraph{Note about IPv6 options and destination address.}
+\addcontentsline{toc}{subsection}{IPv6 options and destination address}
+If \verb|sin6_flowinfo| contains a non-zero flow label,
+destination address in \verb|sin6_addr| and non-fragmentable
+extension headers are ignored. Instead, kernel uses the values
+cached at flow setup (see below). However, for connected sockets
+kernel prefers the values set at connection time.
+
+\paragraph{Example.}
+\addcontentsline{toc}{subsection}{Example}
+After setting socket option \verb|IPV6_FLOWINFO|
+flowlabel and DS field are received as ancillary data object
+of type \verb|IPV6_FLOWINFO| and level \verb|SOL_IPV6|.
+In the cases when it is convenient to use \verb|recvfrom(2)|,
+it is possible to replace library variant with your own one,
+sort of:
+
+\begin{verbatim}
+#include <sys/socket.h>
+#include <netinet/in6.h>
+
+size_t recvfrom(int fd, char *buf, size_t len, int flags,
+ struct sockaddr *addr, int *addrlen)
+{
+ ssize_t cc; /* signed: recvmsg() returns -1 on error */
+ char cbuf[128]; /* buffer for ancillary data */
+ struct cmsghdr *c;
+ struct iovec iov = { buf, len };
+ struct msghdr msg = { addr, *addrlen,
+ &iov, 1,
+ cbuf, sizeof(cbuf),
+ 0 };
+
+ cc = recvmsg(fd, &msg, flags);
+ if (cc < 0)
+ return cc; /* error: addr and addrlen are left untouched */
+ ((struct sockaddr_in6*)addr)->sin6_flowinfo = 0; /* default if no object found */
+ *addrlen = msg.msg_namelen;
+ for (c=CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
+ if (c->cmsg_level != SOL_IPV6 ||
+ c->cmsg_type != IPV6_FLOWINFO)
+ continue; /* skip other ancillary objects */
+ ((struct sockaddr_in6*)addr)->sin6_flowinfo = *(__u32*)CMSG_DATA(c);
+ }
+ return cc;
+}
+\end{verbatim}
+
+
+
+\section{Flow label management.}
+
+\paragraph{Discussion.}
+\addcontentsline{toc}{subsection}{Discussion}
+Requirements of RFC2460 are pretty tough. Particularly, lifetimes
+longer than boot time require to store allocated labels at stable
+storage, so that the full implementation necessarily includes user space flow
+label manager. There are at least three different approaches:
+
+\begin{enumerate}
+\item {\bf ``Cooperative''. } We could leave flow label allocation wholly
+to user space. When user needs label he requests manager directly. The approach
+is valid, but as any ``cooperative'' approach it suffers from security problems.
+
+\begin{NB}
+One idea is to disallow not privileged user to allocate flow
+labels, but instead to pass the socket to manager via \verb|SCM_RIGHTS|
+control message, so that it will allocate label and assign it to socket
+itself. Hmm... the idea is interesting.
+\end{NB}
+
+\item {\bf ``Indirect''.} Kernel redirects requests to user level daemon
+and does not install label until the daemon acknowledged the request.
+The approach is the most promising, it is especially pleasant to recognize
+parallel with IPsec API [RFC2367,Craig]. Actually, it may share API with
+IPsec.
+
+\item {\bf ``Stupid''.} To allocate labels in kernel space. It is the simplest
+method, but it suffers from two serious flaws: the first,
+we cannot lease labels with lifetimes longer than boot time, the second,
+it is sensitive to DoS attacks. Kernel has to remember all the obsolete
+labels until their expiration and malicious user may quickly eat all the
+flow label space.
+
+\end{enumerate}
+
+Certainly, I choose the most ``stupid'' method. It is the cheapest one
+for implementor (i.e.\ me), and taking into account that flow labels
+still have no serious applications it is not useful to work on more
+advanced API, especially, taking into account that eventually we
+will get it for no fee together with IPsec.
+
+
+\paragraph{Implementation.}
+\addcontentsline{toc}{subsection}{Implementation}
+Socket option \verb|IPV6_FLOWLABEL_MGR| allows to
+request flow label manager to allocate new flow label, to reuse
+already allocated one or to delete old flow label.
+Its argument is \verb|struct| \verb|in6_flowlabel_req|:
+
+\begin{verbatim}
+struct in6_flowlabel_req
+{
+ struct in6_addr flr_dst;
+ __u32 flr_label;
+ __u8 flr_action;
+ __u8 flr_share;
+ __u16 flr_flags;
+ __u16 flr_expires;
+ __u16 flr_linger;
+ __u32 __flr_reserved;
+ /* Options in format of IPV6_PKTOPTIONS */
+};
+\end{verbatim}
+
+\begin{itemize}
+
+\item \verb|dst| is IPv6 destination address associated with the label.
+
+\item \verb|label| is flow label value in network byte order. If it is zero,
+kernel will allocate new pseudo-random number. Otherwise, kernel will try
+to lease flow label ordered by user. In this case, it is user task to provide
+necessary flow label randomness.
+
+\item \verb|action| is requested operation. Currently, only three operations
+are defined:
+
+\begin{verbatim}
+#define IPV6_FL_A_GET 0 /* Get flow label */
+#define IPV6_FL_A_PUT 1 /* Release flow label */
+#define IPV6_FL_A_RENEW 2 /* Update expire time */
+\end{verbatim}
+
+\item \verb|flags| are optional modifiers. Currently
+only \verb|IPV6_FL_A_GET| has modifiers:
+
+\begin{verbatim}
+#define IPV6_FL_F_CREATE 1 /* Allowed to create new label */
+#define IPV6_FL_F_EXCL 2 /* Fail if label already exists */
+\end{verbatim}
+
+
+\item \verb|share| defines who is allowed to reuse the same flow label.
+
+\begin{verbatim}
+#define IPV6_FL_S_NONE 0 /* Not defined */
+#define IPV6_FL_S_EXCL 1 /* Label is private */
+#define IPV6_FL_S_PROCESS 2 /* May be reused by this process */
+#define IPV6_FL_S_USER 3 /* May be reused by this user */
+#define IPV6_FL_S_ANY 255 /* Anyone may reuse it */
+\end{verbatim}
+
+\item \verb|linger| is time in seconds. After the last user releases flow
+label, it will not be reused with different destination and options at least
+during this time. If \verb|share| is not \verb|IPV6_FL_S_EXCL| the label
+still can be shared by other sockets. Current implementation does not allow
+unprivileged user to set linger longer than 60 sec.
+
+\item \verb|expires| is time in seconds. Flow label will be kept at least
+for this time, but it will not be destroyed before user released it explicitly
+or closed all the sockets using it. Current implementation does not allow
+unprivileged user to set timeout longer than 60 sec. Privileged applications
+MAY set longer lifetimes, but in this case they MUST save allocated
+labels at stable storage and restore them back after reboot before the first
+application allocates new flow.
+
+\end{itemize}
+
+This structure is followed by optional extension headers associated
+with this flow label in format of \verb|IPV6_PKTOPTIONS|. Only
+\verb|IPV6_HOPOPTS|, \verb|IPV6_RTHDR| and, if \verb|IPV6_RTHDR| is present,
+\verb|IPV6_DSTOPTS| are allowed.
+
+\paragraph{Example.}
+\addcontentsline{toc}{subsection}{Example}
+ The function \verb|get_flow_label| allocates
+private flow label.
+
+\begin{verbatim}
+int get_flow_label(int fd, struct sockaddr_in6 *dst, __u32 fl)
+{ /* returns 0 on success, -1 on failure */
+ int on = 1;
+ struct in6_flowlabel_req freq;
+
+ memset(&freq, 0, sizeof(freq));
+ freq.flr_label = htonl(fl); /* network byte order; 0 = kernel picks one */
+ freq.flr_action = IPV6_FL_A_GET;
+ freq.flr_flags = IPV6_FL_F_CREATE | IPV6_FL_F_EXCL;
+ freq.flr_share = IPV6_FL_S_EXCL; /* label is private */
+ memcpy(&freq.flr_dst, &dst->sin6_addr, 16);
+ if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
+ &freq, sizeof(freq)) == -1) {
+ perror ("can't lease flowlabel");
+ return -1;
+ }
+ dst->sin6_flowinfo |= freq.flr_label; /* put the label into the address */
+
+ if (setsockopt(fd, SOL_IPV6, IPV6_FLOWINFO_SEND,
+ &on, sizeof(on)) == -1) {
+ perror ("can't send flowinfo");
+
+ freq.flr_action = IPV6_FL_A_PUT; /* release the label leased above */
+ setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
+ &freq, sizeof(freq));
+ return -1;
+ }
+ return 0;
+}
+\end{verbatim}
+
+A bit more complicated example using routing header can be found
+in \verb|ping6| utility (\verb|iputils| package). Linux rsvpd backend
+contains an example of using operation \verb|IPV6_FL_A_RENEW|.
+
+\paragraph{Listing flow labels.}
+\addcontentsline{toc}{subsection}{Listing flow labels}
+List of currently allocated
+flow labels may be read from \verb|/proc/net/ip6_flowlabel|.
+
+\begin{verbatim}
+Label S Owner Users Linger Expires Dst Opt
+A1BE5 1 0 0 6 3 3ffe2400000000010a0020fffe71fb30 0
+\end{verbatim}
+
+\begin{itemize}
+\item \verb|Label| is hexadecimal flow label value.
+\item \verb|S| is sharing style.
+\item \verb|Owner| is ID of creator, it is zero, pid or uid, depending on
+ sharing style.
+\item \verb|Users| is number of applications using the label now.
+\item \verb|Linger| is \verb|linger| of this label in seconds.
+\item \verb|Expires| is time until expiration of the label in seconds. It may
+ be negative, if the label is in use.
+\item \verb|Dst| is IPv6 destination address.
+\item \verb|Opt| is length of options, associated with the label. Option
+ data are not accessible.
+\end{itemize}
+
+
+\paragraph{Flow labels and RSVP.}
+\addcontentsline{toc}{subsection}{Flow labels and RSVP}
+RSVP daemon supports IPv6 flow labels
+without any modifications to standard ISI RAPI. Sender must allocate
+flow label, fill corresponding sender template and submit it to local rsvp
+daemon. rsvpd will check the label and start to announce it in PATH
+messages. Rsvpd on sender node will renew the flow label, so that it will not
+be reused before path state expires and all the intermediate
+routers and receiver purge flow state.
+
+\verb|rtap| utility is modified to parse flow labels. F.e.\ if user allocated
+flow label \verb|0xA1234|, he may write:
+
+\begin{verbatim}
+RTAP> sender 3ffe:2400::1/FL0xA1234 <Tspec>
+\end{verbatim}
+
+Receiver makes reservation with command:
+\begin{verbatim}
+RTAP> reserve ff 3ffe:2400::1/FL0xA1234 <Flowspec>
+\end{verbatim}
+
+\end{document}
diff --git a/doc/arpd.sgml b/doc/arpd.sgml
index e69de29b..0ab79c60 100644
--- a/doc/arpd.sgml
+++ b/doc/arpd.sgml
@@ -0,0 +1,130 @@
+<!doctype linuxdoc system>
+
+<article>
+
+<title>ARPD Daemon
+<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
+<date>some_negative_number, 20 Sep 2001
+<abstract>
+<tt/arpd/ is daemon collecting gratuitous ARP information, saving
+it on local disk and feeding it to kernel on demand to avoid
+redundant broadcasting due to limited size of kernel ARP cache.
+</abstract>
+
+
+<p><bf/Description/
+
+<p>The format of the command is:
+
+<tscreen><verb>
+ arpd OPTIONS [ INTERFACE [ INTERFACE ... ] ]
+</verb></tscreen>
+
+<p> <tt/OPTIONS/ are:
+
+<itemize>
+
+<item><tt/-l/ - dump <tt/arpd/ database to stdout and exit. Output consists
+of three columns: interface index, IP address and MAC address.
+Negative entries for dead hosts are also shown, in this case MAC address
+is replaced by word <tt/FAILED/ followed by colon and time when the fact
+that host is dead was proven the last time.
+
+<item><tt/-f FILE/ - read and load <tt/arpd/ database from <tt/FILE/
+in text format similar to that dumped by option <tt/-l/. Exit after load,
+probably listing resulting database, if option <tt/-l/ is also given.
+If <tt/FILE/ is <tt/-/, <tt/stdin/ is read to get ARP table.
+
+<item><tt/-b DATABASE/ - location of database file. Default location is
+<tt>/var/lib/arpd/arpd.db</tt>.
+
+<item><tt/-a NUMBER/ - <tt/arpd/ not only passively listens to ARP on the wire, but
+also sends broadcast queries itself. <tt/NUMBER/ is number of such queries
+to make before destination is considered as dead. When <tt/arpd/ is started
+as kernel helper (i.e. with <tt/app_solicit/ enabled in <tt/sysctl/
+or even with option <tt/-k/) without this option and still did not learn enough
+information, you can observe 1 second gaps in service. Not fatal, but
+not good.
+
+<item><tt/-k/ - suppress sending broadcast queries by kernel. It makes
+sense together with option <tt/-a/.
+
+<item><tt/-n TIME/ - timeout of negative cache. When resolution fails <tt/arpd/
+suppresses further attempts to resolve for this period. It makes sense
+only together with option <tt/-k/. This timeout should not be too much
+longer than boot time of a typical host not supporting gratuitous ARP.
+Default value is 60 seconds.
+
+<item><tt/-R RATE/ - maximal steady rate of broadcasts sent by <tt/arpd/
+in packets per second. Default value is 1.
+
+<item><tt/-B NUMBER/ - number of broadcasts sent by <tt/arpd/ back to back.
+Default value is 3. Together with option <tt/-R/ this option allows
+to police broadcasting not to exceed <tt/B+R*T/ over any interval
+of time <tt/T/.
+
+</itemize>
+
+<p><tt/INTERFACE/ is name of networking interface to watch.
+If no interfaces given, <tt/arpd/ monitors all the interfaces.
+In this case <tt/arpd/ does not adjust <tt/sysctl/ parameters,
+it is supposed user does this himself after <tt/arpd/ is started.
+
+
+<p> Signals
+
+<p> <tt/arpd/ exits gracefully syncing database and restoring adjusted
+<tt/sysctl/ parameters, when receives <tt/SIGINT/ or <tt/SIGTERM/.
+<tt/SIGHUP/ syncs database to disk. <tt/SIGUSR1/ sends some statistics
+to <tt/syslog/. Effect of other signals is undefined; they may corrupt
+database and leave <tt/sysctl/ parameters in an unpredictable state.
+
+<p> Note
+
+<p> In order for <tt/arpd/ to be able to serve as ARP resolver, kernel must be
+compiled with the option <tt/CONFIG_ARPD/ and, in the case when interface list
+is not given on command line, variable <tt/app_solicit/
+on interfaces of interest should be set in <tt>/proc/sys/net/ipv4/neigh/*</tt>.
+If this is not made <tt/arpd/ still collects gratuitous ARP information
+in its database.
+
+<p> Examples
+
+<enum>
+<item> Start <tt/arpd/ to collect gratuitous ARP, but not messing
+with kernel functionality:
+
+<tscreen><verb>
+ arpd -b /var/tmp/arpd.db
+</verb></tscreen>
+
+<item> Look at result after some time:
+
+<tscreen><verb>
+ killall arpd
+ arpd -l -b /var/tmp/arpd.db
+</verb></tscreen>
+
+<item> To enable kernel helper, leaving leading role to kernel:
+
+<tscreen><verb>
+ arpd -b /var/tmp/arpd.db -a 1 eth0 eth1
+</verb></tscreen>
+
+<item> Completely replace kernel resolution on interfaces <tt/eth0/
+and <tt/eth1/. In this case kernel still does unicast probing to
+validate entries, but all the broadcast activity is suppressed
+and made under authority of <tt/arpd/:
+
+<tscreen><verb>
+ arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1
+</verb></tscreen>
+
+This is the mode in which <tt/arpd/ is supposed to work normally.
+It is not the default only to prevent accidental enabling of this
+too aggressive mode.
+
+</enum>
+
+</article>
+
diff --git a/doc/do-psnup b/doc/do-psnup
index e69de29b..2dce848e 100644
--- a/doc/do-psnup
+++ b/doc/do-psnup
@@ -0,0 +1,16 @@
+#! /bin/bash
+# $1 = Temporary file . "string"
+# $2 = File to process . "string"
+# $3 = Page size . ie: a4 , letter ... "string"
+# $4 = Number of pages to fit on a single sheet . "numeric"
+
+if type psnup >&/dev/null; then
+ echo "psnup -$4 -p$3 $1 $2"
+ psnup -$4 -p$3 $1 $2
+elif type psmulti >&/dev/null; then
+ echo "psmulti $1 > $2"
+ psmulti $1 > $2
+else
+ echo "cp $1 $2"
+ cp $1 $2
+fi
diff --git a/doc/ip-cref.tex b/doc/ip-cref.tex
index e69de29b..5eaa4a89 100644
--- a/doc/ip-cref.tex
+++ b/doc/ip-cref.tex
@@ -0,0 +1,3316 @@
+\documentstyle[12pt,twoside]{article}
+\def\TITLE{IP Command Reference}
+\input preamble
+\begin{center}
+\Large\bf IP Command Reference.
+\end{center}
+
+
+\begin{center}
+{ \large Alexey~N.~Kuznetsov } \\
+\em Institute for Nuclear Research, Moscow \\
+\verb|kuznet@ms2.inr.ac.ru| \\
+\rm April 14, 1999
+\end{center}
+
+\vspace{5mm}
+
+\tableofcontents
+
+\newpage
+
+\section{About this document}
+
+This document presents a comprehensive description of the \verb|ip| utility
+from the \verb|iproute2| package. It is not a tutorial or user's guide.
+It is a {\em dictionary\/}, not explaining terms,
+but translating them into other terms, which may also be unknown to the reader.
+However, the document is self-contained and the reader, provided they have a
+basic networking background, will find enough information
+and examples to understand and configure Linux-2.2 IP and IPv6
+networking.
+
+This document is split into sections explaining \verb|ip| commands
+and options, decrypting \verb|ip| output and containing a few examples.
+More voluminous examples and some topics, which require more elaborate
+discussion, are in the appendix.
+
+The paragraphs beginning with NB contain side notes, warnings about
+bugs and design drawbacks. They may be skipped at the first reading.
+
+\section{{\tt ip} --- command syntax}
+
+The generic form of an \verb|ip| command is:
+\begin{verbatim}
+ip [ OPTIONS ] OBJECT [ COMMAND [ ARGUMENTS ]]
+\end{verbatim}
+where \verb|OPTIONS| is a set of optional modifiers affecting the
+general behaviour of the \verb|ip| utility or changing its output. All options
+begin with the character \verb|'-'| and may be used in either long or abbreviated
+forms. Currently, the following options are available:
+
+\begin{itemize}
+\item \verb|-V|, \verb|-Version|
+
+--- print the version of the \verb|ip| utility and exit.
+
+
+\item \verb|-s|, \verb|-stats|, \verb|-statistics|
+
+--- output more information. If the option
+appears twice or more, the amount of information increases.
+As a rule, the information is statistics or some time values.
+
+
+\item \verb|-f|, \verb|-family| followed by a protocol family
+identifier: \verb|inet|, \verb|inet6| or \verb|link|.
+
+--- enforce the protocol family to use. If the option is not present,
+the protocol family is guessed from other arguments. If the rest of the command
+line does not give enough information to guess the family, \verb|ip| falls back to the default
+one, usually \verb|inet| or \verb|any|. \verb|link| is a special family
+identifier meaning that no networking protocol is involved.
+
+\item \verb|-4|
+
+--- shortcut for \verb|-family inet|.
+
+\item \verb|-6|
+
+--- shortcut for \verb|-family inet6|.
+
+\item \verb|-0|
+
+--- shortcut for \verb|-family link|.
+
+
+\item \verb|-o|, \verb|-oneline|
+
+--- output each record on a single line, replacing line feeds
+with the \verb|'\'| character. This is convenient when you want to
+count records with \verb|wc| or to \verb|grep| the output. The trivial
+script \verb|rtpr| converts the output back into readable form.
+
+\item \verb|-r|, \verb|-resolve|
+
+--- use the system's name resolver to print DNS names instead of
+host addresses.
+
+\begin{NB}
+ Do not use this option when reporting bugs or asking for advice.
+\end{NB}
+\begin{NB}
+ \verb|ip| never uses DNS to resolve names to addresses.
+\end{NB}
+
+\end{itemize}
+
+\verb|OBJECT| is the object to manage or to get information about.
+The object types currently understood by \verb|ip| are:
+
+\begin{itemize}
+\item \verb|link| --- network device
+\item \verb|address| --- protocol (IP or IPv6) address on a device
+\item \verb|neighbour| --- ARP or NDISC cache entry
+\item \verb|route| --- routing table entry
+\item \verb|rule| --- rule in routing policy database
+\item \verb|maddress| --- multicast address
+\item \verb|mroute| --- multicast routing cache entry
+\item \verb|tunnel| --- tunnel over IP
+\end{itemize}
+
+Again, the names of all objects may be written in full or
+abbreviated form, f.e.\ \verb|address| is abbreviated as \verb|addr|
+or just \verb|a|.
+
+\verb|COMMAND| specifies the action to perform on the object.
+The set of possible actions depends on the object type.
+As a rule, it is possible to \verb|add|, \verb|delete| and
+\verb|show| (or \verb|list|) objects, but some objects
+do not allow all of these operations or have some additional commands.
+The \verb|help| command is available for all objects. It prints
+out a list of available commands and argument syntax conventions.
+
+If no command is given, some default command is assumed.
+Usually it is \verb|list| or, if the objects of this class
+cannot be listed, \verb|help|.
+
+\verb|ARGUMENTS| is a list of arguments to the command.
+The arguments depend on the command and object. There are two types of arguments:
+{\em flags\/}, consisting of a single keyword, and {\em parameters\/},
+consisting of a keyword followed by a value. For convenience,
+each command has some {\em default parameter\/}
+which may be omitted. F.e.\ parameter \verb|dev| is the default
+for the {\tt ip link} command, so {\tt ip link ls eth0} is equivalent
+to {\tt ip link ls dev eth0}.
+In the command descriptions below such parameters
+are distinguished with the marker: ``(default)''.
+
+Almost all keywords may be abbreviated with several first (or even single)
+letters. The shortcuts are convenient when \verb|ip| is used interactively,
+but they are not recommended in scripts or when reporting bugs
+or asking for advice. ``Officially'' allowed abbreviations are listed
+in the document body.
+
+
+
+\section{{\tt ip} --- error messages}
+
+\verb|ip| may fail for one of the following reasons:
+
+\begin{itemize}
+\item
+A syntax error on the command line: an unknown keyword, incorrectly formatted
+IP address {\em et al\/}. In this case \verb|ip| prints an error message
+and exits. As a rule, the error message will contain information
+about the reason for the failure. Sometimes it also prints a help page.
+
+\item
+The arguments did not pass verification for self-consistency.
+
+\item
+\verb|ip| failed to compile a kernel request from the arguments
+because the user didn't give enough information.
+
+\item
+The kernel returned an error to some syscall. In this case \verb|ip|
+prints the error message, as it is output with \verb|perror(3)|,
+prefixed with a comment and a syscall identifier.
+
+\item
+The kernel returned an error to some RTNETLINK request.
+In this case \verb|ip| prints the error message, as it is output
+with \verb|perror(3)| prefixed with ``RTNETLINK answers:''.
+
+\end{itemize}
+
+All the operations are atomic, i.e.\
+if the \verb|ip| utility fails, it does not change anything
+in the system. One harmful exception is \verb|ip link| command
+(Sec.\ref{IP-LINK}, p.\pageref{IP-LINK}),
+which may change only some of the device parameters given
+on command line.
+
+It is difficult to list all the error messages (especially
+syntax errors). However, as a rule, their meaning is clear
+from the context of the command.
+
+The most common mistakes are:
+
+\begin{enumerate}
+\item Netlink is not configured in the kernel. The message is:
+\begin{verbatim}
+Cannot open netlink socket: Invalid value
+\end{verbatim}
+
+\item RTNETLINK is not configured in the kernel. In this case
+one of the following messages may be printed, depending on the command:
+\begin{verbatim}
+Cannot talk to rtnetlink: Connection refused
+Cannot send dump request: Connection refused
+\end{verbatim}
+
+\item The \verb|CONFIG_IP_MULTIPLE_TABLES| option was not selected
+when configuring the kernel. In this case any attempt to use the
+\verb|ip| \verb|rule| command will fail, f.e.
+\begin{verbatim}
+kuznet@kaiser $ ip rule list
+RTNETLINK error: Invalid argument
+dump terminated
+\end{verbatim}
+
+\end{enumerate}
+
+
+\section{{\tt ip link} --- network device configuration}
+\label{IP-LINK}
+
+\paragraph{Object:} A \verb|link| is a network device and the corresponding
+commands display and change the state of devices.
+
+\paragraph{Commands:} \verb|set| and \verb|show| (or \verb|list|).
+
+\subsection{{\tt ip link set} --- change device attributes}
+
+\paragraph{Abbreviations:} \verb|set|, \verb|s|.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+\item \verb|dev NAME| (default)
+
+--- \verb|NAME| specifies the network device on which to operate.
+
+\item \verb|up| and \verb|down|
+
+--- change the state of the device to \verb|UP| or \verb|DOWN|.
+
+\item \verb|arp on| or \verb|arp off|
+
+--- change the \verb|NOARP| flag on the device.
+
+\begin{NB}
+This operation is {\em not allowed\/} if the device is in state \verb|UP|.
+However, neither the \verb|ip| utility nor the kernel checks for this condition.
+You can get unpredictable results changing this flag while the
+device is running.
+\end{NB}
+
+\item \verb|multicast on| or \verb|multicast off|
+
+--- change the \verb|MULTICAST| flag on the device.
+
+\item \verb|dynamic on| or \verb|dynamic off|
+
+--- change the \verb|DYNAMIC| flag on the device.
+
+\item \verb|name NAME|
+
+--- change the name of the device. This operation is not
+recommended if the device is running or has some addresses
+already configured.
+
+\item \verb|txqueuelen NUMBER| or \verb|txqlen NUMBER|
+
+--- change the transmit queue length of the device.
+
+\item \verb|mtu NUMBER|
+
+--- change the MTU of the device.
+
+\item \verb|address LLADDRESS|
+
+--- change the station address of the interface.
+
+\item \verb|broadcast LLADDRESS|, \verb|brd LLADDRESS| or \verb|peer LLADDRESS|
+
+--- change the link layer broadcast address or the peer address when
+the interface is \verb|POINTOPOINT|.
+
+\vskip 1mm
+\begin{NB}
+For most devices (f.e.\ for Ethernet) changing the link layer
+broadcast address will break networking.
+Do not use it, if you do not understand what this operation really does.
+\end{NB}
+
+\end{itemize}
+
+\vskip 1mm
+\begin{NB}
+The {\tt ip} utility does not change the \verb|PROMISC|
+or \verb|ALLMULTI| flags. These flags are considered
+obsolete and should not be changed administratively.
+\end{NB}
+
+\paragraph{Warning:} If multiple parameter changes are requested,
+\verb|ip| aborts immediately after any of the changes have failed.
+This is the only case when \verb|ip| can move the system to
+an unpredictable state. The solution is to avoid changing
+several parameters with one {\tt ip link set} call.
+
+\paragraph{Examples:}
+\begin{itemize}
+\item \verb|ip link set dummy address 00:00:00:00:00:01|
+
+--- change the station address of the interface \verb|dummy|.
+
+\item \verb|ip link set dummy up|
+
+--- start the interface \verb|dummy|.
+
+\end{itemize}
+
+
+\subsection{{\tt ip link show} --- display device attributes}
+\label{IP-LINK-SHOW}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|,
+\verb|l|.
+
+\paragraph{Arguments:}
+\begin{itemize}
+\item \verb|dev NAME| (default)
+
+--- \verb|NAME| specifies the network device to show.
+If this argument is omitted all devices are listed.
+
+\item \verb|up|
+
+--- only display running interfaces.
+
+\end{itemize}
+
+
+\paragraph{Output format:}
+
+\begin{verbatim}
+kuznet@alisa:~ $ ip link ls eth0
+3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
+ link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
+kuznet@alisa:~ $ ip link ls sit0
+5: sit0@NONE: <NOARP,UP> mtu 1480 qdisc noqueue
+ link/sit 0.0.0.0 brd 0.0.0.0
+kuznet@alisa:~ $ ip link ls dummy
+2: dummy: <BROADCAST,NOARP> mtu 1500 qdisc noop
+ link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
+kuznet@alisa:~ $
+\end{verbatim}
+
+
+The number before each colon is an {\em interface index\/} or {\em ifindex\/}.
+This number uniquely identifies the interface. This is followed by the {\em interface name\/}
+(\verb|eth0|, \verb|sit0| etc.). The interface name is also
+unique at every given moment. However, the interface may disappear from the
+list (f.e.\ when the corresponding driver module is unloaded) and another
+one with the same name may be created later. Besides that,
+the administrator may change the name of any device with
+\verb|ip| \verb|link| \verb|set| \verb|name|
+to make it more intelligible.
+
+The interface name may have another name or \verb|NONE| appended
+after the \verb|@| sign. This means that this device is bound to some other
+device,
+i.e.\ packets sent through it are encapsulated and sent via the ``master''
+device. If the name is \verb|NONE|, the master is unknown.
+
+Then we see the interface {\em mtu\/} (``maximal transfer unit''). This determines
+the maximal size of data which can be sent as a single packet over this interface.
+
+{\em qdisc\/} (``queuing discipline'') shows the queuing algorithm used
+on the interface. Particularly, \verb|noqueue| means that this interface
+does not queue anything and \verb|noop| means that the interface is in blackhole
+mode i.e.\ all packets sent to it are immediately discarded.
+{\em qlen\/} is the default transmit queue length of the device measured
+in packets.
+
+The interface flags are summarized in the angle brackets.
+
+\begin{itemize}
+\item \verb|UP| --- the device is turned on. It is ready to accept
+packets for transmission and it may inject into the kernel packets received
+from other nodes on the network.
+
+\item \verb|LOOPBACK| --- the interface does not communicate with other
+hosts. All packets sent through it will be returned
+and nothing but bounced packets can be received.
+
+\item \verb|BROADCAST| --- the device has the facility to send packets
+to all hosts sharing the same link. A typical example is an Ethernet link.
+
+\item \verb|POINTOPOINT| --- the link has only two ends with one node
+attached to each end. All packets sent to this link will reach the peer
+and all packets received by us came from this single peer.
+
+If neither \verb|LOOPBACK| nor \verb|BROADCAST| nor \verb|POINTOPOINT|
+are set, the interface is assumed to be NBMA (Non-Broadcast Multi-Access).
+This is the most generic type of device and the most complicated one, because
+the host attached to a NBMA link has no means to send to anyone
+without additionally configured information.
+
+\item \verb|MULTICAST| --- is an advisory flag indicating that the interface
+is aware of multicasting i.e.\ sending packets to some subset of neighbouring
+nodes. Broadcasting is a particular case of multicasting, where the multicast
+group consists of all nodes on the link. It is important to emphasize
+that software {\em must not\/} interpret the absence of this flag as the inability
+to use multicasting on this interface. Any \verb|POINTOPOINT| and
+\verb|BROADCAST| link is multicasting by definition, because we have
+direct access to all the neighbours and, hence, to any part of them.
+Certainly, the use of high bandwidth multicast transfers is not recommended
+on broadcast-only links because of high expense, but it is not strictly
+prohibited.
+
+\item \verb|PROMISC| --- the device listens to and feeds to the kernel all
+traffic on the link even if it is not destined for us, not broadcasted
+and not destined for a multicast group of which we are a member. Usually
+this mode exists only on broadcast links and is used by bridges and for network
+monitoring.
+
+\item \verb|ALLMULTI| --- the device receives all multicast packets
+wandering on the link. This mode is used by multicast routers.
+
+\item \verb|NOARP| --- this flag is different from the other ones. It has
+no invariant value and its interpretation depends on the network protocols
+involved. As a rule, it indicates that the device needs no address
+resolution and that the software or hardware knows how to deliver packets
+without any help from the protocol stacks.
+
+\item \verb|DYNAMIC| --- is an advisory flag indicating that the interface is
+dynamically created and destroyed.
+
+\item \verb|SLAVE| --- this interface is bonded to some other interfaces
+to share link capacities.
+
+\end{itemize}
+
+\vskip 1mm
+\begin{NB}
+There are other flags but they are either obsolete (\verb|NOTRAILERS|)
+or not implemented (\verb|DEBUG|) or specific to some devices
+(\verb|MASTER|, \verb|AUTOMEDIA| and \verb|PORTSEL|). We do not discuss
+them here.
+\end{NB}
+\begin{NB}
+The values of \verb|PROMISC| and \verb|ALLMULTI| flags
+shown by the \verb|ifconfig| utility and by the \verb|ip| utility
+are {\em different\/}. \verb|ip link ls| shows the true device state,
+while \verb|ifconfig| shows the virtual state which was set with
+\verb|ifconfig| itself.
+\end{NB}
+
+
+The second line contains information on the link layer addresses
+associated with the device. The first word (\verb|ether|, \verb|sit|)
+defines the interface hardware type. This type determines the format and semantics
+of the addresses and is logically part of the address.
+The default format of the station address and the broadcast address
+(or the peer address for pointopoint links) is a
+sequence of hexadecimal bytes separated by colons, but some link
+types may have their natural address format, f.e.\ addresses
+of tunnels over IP are printed as dotted-quad IP addresses.
+
+\vskip 1mm
+\begin{NB}
+ NBMA links have no well-defined broadcast or peer address,
+ however this field may contain useful information, f.e.\
+ about the address of broadcast relay or about the address of the ARP server.
+\end{NB}
+\begin{NB}
+Multicast addresses are not shown by this command, see
+\verb|ip maddr ls| in~Sec.\ref{IP-MADDR} (p.\pageref{IP-MADDR} of this
+document).
+\end{NB}
+
+
+\paragraph{Statistics:} With the \verb|-statistics| option, \verb|ip| also
+prints interface statistics:
+
+\begin{verbatim}
+kuznet@alisa:~ $ ip -s link ls eth0
+3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
+ link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
+ RX: bytes packets errors dropped overrun mcast
+ 2449949362 2786187 0 0 0 0
+ TX: bytes packets errors dropped carrier collsns
+ 178558497 1783945 332 0 332 35172
+kuznet@alisa:~ $
+\end{verbatim}
+\verb|RX:| and \verb|TX:| lines summarize receiver and transmitter
+statistics. They contain:
+\begin{itemize}
+\item \verb|bytes| --- the total number of bytes received or transmitted
+on the interface. This number wraps when the maximal length of the data type
+natural for the architecture is exceeded, so continuous monitoring requires
+a user level daemon snapping it periodically.
+\item \verb|packets| --- the total number of packets received or transmitted
+on the interface.
+\item \verb|errors| --- the total number of receiver or transmitter errors.
+\item \verb|dropped| --- the total number of packets dropped due to lack
+of resources.
+\item \verb|overrun| --- the total number of receiver overruns resulting
+in dropped packets. As a rule, if the interface is overrun, it means
+serious problems in the kernel or that your machine is too slow
+for this interface.
+\item \verb|mcast| --- the total number of received multicast packets. This counter
+is only supported by a few devices.
+\item \verb|carrier| --- total number of link media failures f.e.\ because
+of lost carrier.
+\item \verb|collsns| --- the total number of collision events
+on Ethernet-like media. This number may have a different sense on other
+link types.
+\item \verb|compressed| --- the total number of compressed packets. This is
+available only for links using VJ header compression.
+\end{itemize}
+
+
+If the \verb|-s| option is entered twice or more,
+\verb|ip| prints more detailed statistics on receiver
+and transmitter errors.
+
+\begin{verbatim}
+kuznet@alisa:~ $ ip -s -s link ls eth0
+3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
+ link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
+ RX: bytes packets errors dropped overrun mcast
+ 2449949362 2786187 0 0 0 0
+ RX errors: length crc frame fifo missed
+ 0 0 0 0 0
+ TX: bytes packets errors dropped carrier collsns
+ 178558497 1783945 332 0 332 35172
+ TX errors: aborted fifo window heartbeat
+ 0 0 0 332
+kuznet@alisa:~ $
+\end{verbatim}
+These error names are pure Ethernetisms. Other devices
+may have non zero values in these fields but they may be
+interpreted differently.
+
+
+\section{{\tt ip address} --- protocol address management}
+
+\paragraph{Abbreviations:} \verb|address|, \verb|addr|, \verb|a|.
+
+\paragraph{Object:} The \verb|address| is a protocol (IP or IPv6) address attached
+to a network device. Each device must have at least one address
+to use the corresponding protocol. It is possible to have several
+different addresses attached to one device. These addresses are not
+discriminated, so that the term {\em alias\/} is not quite appropriate
+for them and we do not use it in this document.
+
+The \verb|ip addr| command displays addresses and their properties,
+adds new addresses and deletes old ones.
+
+\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|flush| and \verb|show|
+(or \verb|list|).
+
+
+\subsection{{\tt ip address add} --- add a new protocol address}
+\label{IP-ADDR-ADD}
+
+\paragraph{Abbreviations:} \verb|add|, \verb|a|.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+\item \verb|dev NAME|
+
+\noindent--- the name of the device to add the address to.
+
+\item \verb|local ADDRESS| (default)
+
+--- the address of the interface. The format of the address depends
+on the protocol. It is a dotted quad for IP and a sequence of hexadecimal halfwords
+separated by colons for IPv6. The \verb|ADDRESS| may be followed by
+a slash and a decimal number which encodes the network prefix length.
+
+
+\item \verb|peer ADDRESS|
+
+--- the address of the remote endpoint for pointopoint interfaces.
+Again, the \verb|ADDRESS| may be followed by a slash and a decimal number,
+encoding the network prefix length. If a peer address is specified,
+the local address {\em cannot\/} have a prefix length. The network prefix is associated
+with the peer rather than with the local address.
+
+
+\item \verb|broadcast ADDRESS|
+
+--- the broadcast address on the interface.
+
+It is possible to use the special symbols \verb|'+'| and \verb|'-'|
+instead of the broadcast address. In this case, the broadcast address
+is derived by setting/resetting the host bits of the interface prefix.
+
+\vskip 1mm
+\begin{NB}
+Unlike \verb|ifconfig|, the \verb|ip| utility {\em does not\/} set any broadcast
+address unless explicitly requested.
+\end{NB}
+
+
+\item \verb|label NAME|
+
+--- Each address may be tagged with a label string.
+In order to preserve compatibility with Linux-2.0 net aliases,
+this string must coincide with the name of the device or must be prefixed
+with the device name followed by colon.
+
+
+\item \verb|scope SCOPE_VALUE|
+
+--- the scope of the area where this address is valid.
+The available scopes are listed in file \verb|/etc/iproute2/rt_scopes|.
+Predefined scope values are:
+
+ \begin{itemize}
+ \item \verb|global| --- the address is globally valid.
+ \item \verb|site| --- (IPv6 only) the address is site local,
+ i.e.\ it is valid inside this site.
+ \item \verb|link| --- the address is link local, i.e.\
+ it is valid only on this device.
+ \item \verb|host| --- the address is valid only inside this host.
+ \end{itemize}
+
+Appendix~\ref{ADDR-SEL} (p.\pageref{ADDR-SEL} of this document)
+contains more details on address scopes.
+
+\end{itemize}
+
+\paragraph{Examples:}
+\begin{itemize}
+\item \verb|ip addr add 127.0.0.1/8 dev lo brd + scope host|
+
+--- add the usual loopback address to the loopback device.
+
+\item \verb|ip addr add 10.0.0.1/24 brd + dev eth0 label eth0:Alias|
+
+--- add the address 10.0.0.1 with prefix length 24 (i.e.\ netmask
+\verb|255.255.255.0|), standard broadcast and label \verb|eth0:Alias|
+to the interface \verb|eth0|.
+\end{itemize}
+
+
+\subsection{{\tt ip address delete} --- delete a protocol address}
+
+\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|.
+
+\paragraph{Arguments:} coincide with the arguments of \verb|ip addr add|.
+The device name is a required argument. The rest are optional.
+If no arguments are given, the first address is deleted.
+
+\paragraph{Examples:}
+\begin{itemize}
+\item \verb|ip addr del 127.0.0.1/8 dev lo|
+
+--- deletes the loopback address from the loopback device.
+It would be best not to repeat this experiment.
+
+\item Disable IP on the interface \verb|eth0|:
+\begin{verbatim}
+ while ip -f inet addr del dev eth0; do
+ : nothing
+ done
+\end{verbatim}
+Another method to disable IP on an interface using {\tt ip addr flush}
+may be found in sec.\ref{IP-ADDR-FLUSH}, p.\pageref{IP-ADDR-FLUSH}.
+
+\end{itemize}
+
+
+\subsection{{\tt ip address show} --- display protocol addresses}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|,
+\verb|l|.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+\item \verb|dev NAME| (default)
+
+--- the name of the device.
+
+\item \verb|scope SCOPE_VAL|
+
+--- only list addresses with this scope.
+
+\item \verb|to PREFIX|
+
+--- only list addresses matching this prefix.
+
+\item \verb|label PATTERN|
+
+--- only list addresses with labels matching the \verb|PATTERN|.
+\verb|PATTERN| is a usual shell style pattern.
+
+
+\item \verb|dynamic| and \verb|permanent|
+
+--- (IPv6 only) only list addresses installed due to stateless
+address configuration or only list permanent (not dynamic) addresses.
+
+\item \verb|tentative|
+
+--- (IPv6 only) only list addresses which did not pass duplicate
+address detection.
+
+\item \verb|deprecated|
+
+--- (IPv6 only) only list deprecated addresses.
+
+
+\item \verb|primary| and \verb|secondary|
+
+--- only list primary (or secondary) addresses.
+
+\end{itemize}
+
+
+\paragraph{Output format:}
+
+\begin{verbatim}
+kuznet@alisa:~ $ ip addr ls eth0
+3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
+ link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
+ inet 193.233.7.90/24 brd 193.233.7.255 scope global eth0
+ inet6 3ffe:2400:0:1:2a0:ccff:fe66:1878/64 scope global dynamic
+ valid_lft forever preferred_lft 604746sec
+ inet6 fe80::2a0:ccff:fe66:1878/10 scope link
+kuznet@alisa:~ $
+\end{verbatim}
+
+The first two lines coincide with the output of \verb|ip link ls|.
+It is natural to interpret link layer addresses
+as addresses of the protocol family \verb|AF_PACKET|.
+
+Then the list of IP and IPv6 addresses follows, accompanied by
+additional address attributes: scope value (see Sec.\ref{IP-ADDR-ADD},
+p.\pageref{IP-ADDR-ADD} above), flags and the address label.
+
+Address flags are set by the kernel and cannot be changed
+administratively. Currently, the following flags are defined:
+
+\begin{enumerate}
+\item \verb|secondary|
+
+--- the address is not used when selecting the default source address
+of outgoing packets (cf.\ Appendix~\ref{ADDR-SEL}, p.\pageref{ADDR-SEL}).
+An IP address becomes secondary if another address with the same
+prefix bits already exists. The first address is primary.
+It is the leader of the group of all secondary addresses. When the leader
+is deleted, all secondaries are purged too.
+
+
+\item \verb|dynamic|
+
+--- the address was created due to stateless autoconfiguration~\cite{RFC-ADDRCONF}.
+In this case the output also contains information on how long
+the address remains valid. After \verb|preferred_lft| expires the address is
+moved to the deprecated state. After \verb|valid_lft| expires the address
+is finally invalidated.
+
+\item \verb|deprecated|
+
+--- the address is deprecated, i.e.\ it is still valid, but cannot
+be used by newly created connections.
+
+\item \verb|tentative|
+
+--- the address is not used because duplicate address detection~\cite{RFC-ADDRCONF}
+is still not complete or failed.
+
+\end{enumerate}
+
+
+\subsection{{\tt ip address flush} --- flush protocol addresses}
+\label{IP-ADDR-FLUSH}
+
+\paragraph{Abbreviations:} \verb|flush|, \verb|f|.
+
+\paragraph{Description:}This command flushes the protocol addresses
+selected by some criteria.
+
+\paragraph{Arguments:} This command has the same arguments as \verb|show|.
+The difference is that it does not run when no arguments are given.
+
+\paragraph{Warning:} This command (and other \verb|flush| commands
+described below) is pretty dangerous. If you make a mistake, it will
+not forgive it, but will cruelly purge all the addresses.
+
+\paragraph{Statistics:} With the \verb|-statistics| option, the command
+becomes verbose. It prints out the number of deleted addresses and the number
+of rounds made to flush the address list. If this option is given
+twice, \verb|ip addr flush| also dumps all the deleted addresses
+in the format described in the previous subsection.
+
+\paragraph{Example:} Delete all the addresses from the private network
+10.0.0.0/8:
+\begin{verbatim}
+netadm@amber:~ # ip -s -s a f to 10/8
+2: dummy inet 10.7.7.7/16 brd 10.7.255.255 scope global dummy
+3: eth0 inet 10.10.7.7/16 brd 10.10.255.255 scope global eth0
+4: eth1 inet 10.8.7.7/16 brd 10.8.255.255 scope global eth1
+
+*** Round 1, deleting 3 addresses ***
+*** Flush is complete after 1 round ***
+netadm@amber:~ #
+\end{verbatim}
+Another instructive example is disabling IP on all the Ethernets:
+\begin{verbatim}
+netadm@amber:~ # ip -4 addr flush label "eth*"
+\end{verbatim}
+And the last example shows how to flush all the IPv6 addresses
+acquired by the host from stateless address autoconfiguration
+after you enabled forwarding or disabled autoconfiguration.
+\begin{verbatim}
+netadm@amber:~ # ip -6 addr flush dynamic
+\end{verbatim}
+
+
+
+\section{{\tt ip neighbour} --- neighbour/arp tables management}
+
+\paragraph{Abbreviations:} \verb|neighbour|, \verb|neighbor|, \verb|neigh|,
+\verb|n|.
+
+\paragraph{Object:} \verb|neighbour| objects establish bindings between protocol
+addresses and link layer addresses for hosts sharing the same link.
+Neighbour entries are organized into tables. The IPv4 neighbour table
+is known by another name --- the ARP table.
+
+The corresponding commands display neighbour bindings
+and their properties, add new neighbour entries and delete old ones.
+
+\paragraph{Commands:} \verb|add|, \verb|change|, \verb|replace|,
+\verb|delete|, \verb|flush| and \verb|show| (or \verb|list|).
+
+\paragraph{See also:} Appendix~\ref{PROXY-NEIGH}, p.\pageref{PROXY-NEIGH}
+describes how to manage proxy ARP/NDISC with the \verb|ip| utility.
+
+
+\subsection{{\tt ip neighbour add} --- add a new neighbour entry\\
+ {\tt ip neighbour change} --- change an existing entry\\
+ {\tt ip neighbour replace} --- add a new entry or change an existing one}
+
+\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|;
+\verb|replace|, \verb|repl|.
+
+\paragraph{Description:} These commands create new neighbour records
+or update existing ones.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+\item \verb|to ADDRESS| (default)
+
+--- the protocol address of the neighbour. It is either an IPv4 or IPv6 address.
+
+\item \verb|dev NAME|
+
+--- the interface to which this neighbour is attached.
+
+
+\item \verb|lladdr LLADDRESS|
+
+--- the link layer address of the neighbour. \verb|LLADDRESS| can also be
+\verb|null|.
+
+\item \verb|nud NUD_STATE|
+
+--- the state of the neighbour entry. \verb|nud| is an abbreviation for ``Neighbour
+Unreachability Detection''. The state can take one of the following values:
+
+\begin{enumerate}
+\item \verb|permanent| --- the neighbour entry is valid forever and can only be removed
+administratively.
+\item \verb|noarp| --- the neighbour entry is valid. No attempts to validate
+this entry will be made but it can be removed when its lifetime expires.
+\item \verb|reachable| --- the neighbour entry is valid until the reachability
+timeout expires.
+\item \verb|stale| --- the neighbour entry is valid but suspicious.
+This option to \verb|ip neigh| does not change the neighbour state if
+it was valid and the address is not changed by this command.
+\end{enumerate}
+
+\end{itemize}
+
+\paragraph{Examples:}
+\begin{itemize}
+\item \verb|ip neigh add 10.0.0.3 lladdr 0:0:0:0:0:1 dev eth0 nud perm|
+
+--- add a permanent ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|.
+
+\item \verb|ip neigh chg 10.0.0.3 dev eth0 nud reachable|
+
+--- change its state to \verb|reachable|.
+\end{itemize}
+
+
+\subsection{{\tt ip neighbour delete} --- delete a neighbour entry}
+
+\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|.
+
+\paragraph{Description:} This command invalidates a neighbour entry.
+
+\paragraph{Arguments:} The arguments are the same as with \verb|ip neigh add|,
+except that \verb|lladdr| and \verb|nud| are ignored.
+
+
+\paragraph{Example:}
+\begin{itemize}
+\item \verb|ip neigh del 10.0.0.3 dev eth0|
+
+--- invalidate an ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|.
+
+\end{itemize}
+
+\begin{NB}
+ The deleted neighbour entry will not disappear from the tables
+ immediately. If it is in use it cannot be deleted until the last
+ client releases it. Otherwise it will be destroyed during
+ the next garbage collection.
+\end{NB}
+
+
+\paragraph{Warning:} Attempts to delete or manually change
+a \verb|noarp| entry created by the kernel may result in unpredictable behaviour.
+Particularly, the kernel may try to resolve this address even
+on a \verb|NOARP| interface or if the address is multicast or broadcast.
+
+
+\subsection{{\tt ip neighbour show} --- list neighbour entries}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|.
+
+\paragraph{Description:}This command displays neighbour tables.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+
+\item \verb|to ADDRESS| (default)
+
+--- the prefix selecting the neighbours to list.
+
+\item \verb|dev NAME|
+
+--- only list the neighbours attached to this device.
+
+\item \verb|unused|
+
+--- only list neighbours which are not currently in use.
+
+\item \verb|nud NUD_STATE|
+
+--- only list neighbour entries in this state. \verb|NUD_STATE| takes
+values listed below or the special value \verb|all| which means all states.
+This option may occur more than once. If this option is absent, \verb|ip|
+lists all entries except for \verb|none| and \verb|noarp|.
+
+\end{itemize}
+
+
+\paragraph{Output format:}
+
+\begin{verbatim}
+kuznet@alisa:~ $ ip neigh ls
+:: dev lo lladdr 00:00:00:00:00:00 nud noarp
+fe80::200:cff:fe76:3f85 dev eth0 lladdr 00:00:0c:76:3f:85 router \
+ nud stale
+0.0.0.0 dev lo lladdr 00:00:00:00:00:00 nud noarp
+193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 nud reachable
+193.233.7.85 dev eth0 lladdr 00:e0:1e:63:39:00 nud stale
+kuznet@alisa:~ $
+\end{verbatim}
+
+The first word of each line is the protocol address of the neighbour.
+Then the device name follows. The rest of the line describes the contents of
+the neighbour entry identified by the pair (device, address).
+
+\verb|lladdr| is the link layer address of the neighbour.
+
+\verb|nud| is the state of the ``neighbour unreachability detection'' machine
+for this entry. The detailed description of the neighbour
+state machine can be found in~\cite{RFC-NDISC}. Here is the full list
+of the states with short descriptions:
+
+\begin{enumerate}
+\item\verb|none| --- the state of the neighbour is void.
+\item\verb|incomplete| --- the neighbour is in the process of resolution.
+\item\verb|reachable| --- the neighbour is valid and apparently reachable.
+\item\verb|stale| --- the neighbour is valid, but is probably already
+unreachable, so the kernel will try to check it at the first transmission.
+\item\verb|delay| --- a packet has been sent to the stale neighbour and the kernel is waiting
+for confirmation.
+\item\verb|probe| --- the delay timer expired but no confirmation was received.
+The kernel has started to probe the neighbour with ARP/NDISC messages.
+\item\verb|failed| --- resolution has failed.
+\item\verb|noarp| --- the neighbour is valid. No attempts to check the entry
+will be made.
+\item\verb|permanent| --- it is a \verb|noarp| entry, but only the administrator
+may remove the entry from the neighbour table.
+\end{enumerate}
+
+The link layer address is valid in all states except for \verb|none|,
+\verb|failed| and \verb|incomplete|.
+
+IPv6 neighbours can be marked with the additional flag \verb|router|
+which means that the neighbour introduced itself as an IPv6 router~\cite{RFC-NDISC}.
+
+\paragraph{Statistics:} The \verb|-statistics| option displays some usage
+statistics, f.e.\
+
+\begin{verbatim}
+kuznet@alisa:~ $ ip -s n ls 193.233.7.254
+193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \
+ nud reachable
+kuznet@alisa:~ $
+\end{verbatim}
+
+Here \verb|ref| is the number of users of this entry
+and \verb|used| is a triplet of time intervals in seconds
+separated by slashes. In this case they show that:
+
+\begin{enumerate}
+\item the entry was used 12 seconds ago.
+\item the entry was confirmed 13 seconds ago.
+\item the entry was updated 20 seconds ago.
+\end{enumerate}
+
+\subsection{{\tt ip neighbour flush} --- flush neighbour entries}
+
+\paragraph{Abbreviations:} \verb|flush|, \verb|f|.
+
+\paragraph{Description:}This command flushes neighbour tables, selecting
+entries to flush by some criteria.
+
+\paragraph{Arguments:} This command has the same arguments as \verb|show|.
+The differences are that it does not run when no arguments are given,
+and that the default neighbour states to be flushed do not include
+\verb|permanent| and \verb|noarp|.
+
+
+\paragraph{Statistics:} With the \verb|-statistics| option, the command
+becomes verbose. It prints out the number of deleted neighbours and the number
+of rounds made to flush the neighbour table. If the option is given
+twice, \verb|ip neigh flush| also dumps all the deleted neighbours
+in the format described in the previous subsection.
+
+\paragraph{Example:}
+\begin{verbatim}
+netadm@alisa:~ # ip -s -s n f 193.233.7.254
+193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \
+ nud reachable
+
+*** Round 1, deleting 1 entries ***
+*** Flush is complete after 1 round ***
+netadm@alisa:~ #
+\end{verbatim}
+
+
+\section{{\tt ip route} --- routing table management}
+\label{IP-ROUTE}
+
+\paragraph{Abbreviations:} \verb|route|, \verb|ro|, \verb|r|.
+
+\paragraph{Object:} \verb|route| entries in the kernel routing tables keep
+information about paths to other networked nodes.
+
+Each route entry has a {\em key\/} consisting of a {\em prefix\/}
+(i.e.\ a pair containing a network address and the length of its mask) and,
+optionally, the TOS value. An IP packet matches the route if the highest
+bits of its destination address are equal to the route prefix at least
+up to the prefix length and if the TOS of the route is zero or equal to
+the TOS of the packet.
+
+If several routes match the packet, the following pruning rules
+are used to select the best one (see~\cite{RFC1812}):
+\begin{enumerate}
+\item The longest matching prefix is selected. All shorter ones
+are dropped.
+
+\item If the TOS of some route with the longest prefix is equal to the TOS
+of the packet, the routes with different TOS are dropped.
+
+If no exact TOS match was found and routes with TOS=0 exist,
+the rest of the routes are pruned.
+
+Otherwise, the route lookup fails.
+
+\item If several routes remain after the previous steps, then
+the routes with the best preference values are selected.
+
+\item If we still have several routes, then the {\em first\/} of them
+is selected.
+
+\begin{NB}
+ Note the ambiguity of the last step. Unfortunately, Linux
+ historically allows such a bizarre situation. The sense of the
+word ``first'' depends on the order of route additions and it is practically
+impossible to maintain a bundle of such routes in this order.
+\end{NB}
+
+For simplicity we will limit ourselves to the case where such a situation
+is impossible and routes are uniquely identified by the triplet
+\{prefix, tos, preference\}. Actually, it is impossible to create
+non-unique routes with \verb|ip| commands described in this section.
+
+One useful exception to this rule is the default route on non-forwarding
+hosts. It is ``officially'' allowed to have several fallback routes
+when several routers are present on directly connected networks.
+In this case, Linux-2.2 makes ``dead gateway detection''~\cite{RFC1122}
+controlled by neighbour unreachability detection and by advice
+from transport protocols to select a working router, so the order
+of the routes is not essential. However, in this case,
+fiddling with default routes manually is not recommended. Use the Router Discovery
+protocol (see Appendix~\ref{EXAMPLE-SETUP}, p.\pageref{EXAMPLE-SETUP})
+instead. Actually, Linux-2.2 IPv6 does not give user level applications
+any access to default routes.
+\end{enumerate}
+
+Certainly, the steps above are not performed exactly
+in this sequence. Instead, the routing table in the kernel is kept
+in some data structure to achieve the final result
+with minimal cost. However, regardless of the particular
+routing algorithm implemented in the kernel, we can summarize
+the statements above as: a route is identified by the triplet
+\{prefix, tos, preference\}. This {\em key\/} lets us locate
+the route in the routing table.
+
+\paragraph{Route attributes:} Each route key refers to a routing
+information record containing
+the data required to deliver IP packets (f.e.\ output device and
+next hop router) and some optional attributes (f.e.\ the path MTU or
+the preferred source address when communicating with this destination).
+These attributes are described in the following subsection.
+
+\paragraph{Route types:} \label{IP-ROUTE-TYPES}
+It is important that the set
+of required and optional attributes depends on the route {\em type\/}.
+The most important route type
+is \verb|unicast|. It describes real paths to other hosts.
+As a rule, common routing tables contain only such routes. However,
+there are other types of routes with different semantics. The
+full list of types understood by Linux-2.2 is:
+\begin{itemize}
+\item \verb|unicast| --- the route entry describes real paths to the
+destinations covered by the route prefix.
+\item \verb|unreachable| --- these destinations are unreachable. Packets
+are discarded and the ICMP message {\em host unreachable\/} is generated.
+The local senders get an \verb|EHOSTUNREACH| error.
+\item \verb|blackhole| --- these destinations are unreachable. Packets
+are discarded silently. The local senders get an \verb|EINVAL| error.
+\item \verb|prohibit| --- these destinations are unreachable. Packets
+are discarded and the ICMP message {\em communication administratively
+prohibited\/} is generated. The local senders get an \verb|EACCES| error.
+\item \verb|local| --- the destinations are assigned to this
+host. The packets are looped back and delivered locally.
+\item \verb|broadcast| --- the destinations are broadcast addresses.
+The packets are sent as link broadcasts.
+\item \verb|throw| --- a special control route used together with policy
+rules (see sec.\ref{IP-RULE}, p.\pageref{IP-RULE}). If such a route is selected, lookup
+in this table is terminated pretending that no route was found.
+Without policy routing it is equivalent to the absence of the route in the routing
+table. The packets are dropped and the ICMP message {\em net unreachable\/}
+is generated. The local senders get an \verb|ENETUNREACH| error.
+\item \verb|nat| --- a special NAT route. Destinations covered by the prefix
+are considered to be dummy (or external) addresses which require translation
+to real (or internal) ones before forwarding. The addresses to translate to
+are selected with the attribute \verb|via|. More about NAT is
+in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}.
+\item \verb|anycast| --- ({\em not implemented\/}) the destinations are
+{\em anycast\/} addresses assigned to this host. They are mainly equivalent
+to \verb|local| with one difference: such addresses are invalid when used
+as the source address of any packet.
+\item \verb|multicast| --- a special type used for multicast routing.
+It is not present in normal routing tables.
+\end{itemize}
+
+\paragraph{Route tables:} Linux-2.2 can pack routes into several routing
+tables identified by a number in the range from 1 to 255 or by
+name from the file \verb|/etc/iproute2/rt_tables|. By default all normal
+routes are inserted into the \verb|main| table (ID 254) and the kernel only uses
+this table when calculating routes.
+
+Actually, one other table always exists, which is invisible but
+even more important. It is the \verb|local| table (ID 255). This table
+consists of routes for local and broadcast addresses. The kernel maintains
+this table automatically and the administrator usually need not modify it
+or even look at it.
+
+The multiple routing tables enter the game when {\em policy routing\/}
+is used. See sec.\ref{IP-RULE}, p.\pageref{IP-RULE}.
+In this case, the table identifier effectively becomes
+one more parameter, which should be added to the triplet
+\{prefix, tos, preference\} to uniquely identify the route.
+
+
+\subsection{{\tt ip route add} --- add a new route\\
+ {\tt ip route change} --- change a route\\
+ {\tt ip route replace} --- change a route or add a new one}
+\label{IP-ROUTE-ADD}
+
+\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|;
+ \verb|replace|, \verb|repl|.
+
+
+\paragraph{Arguments:}
+\begin{itemize}
+\item \verb|to PREFIX| or \verb|to TYPE PREFIX| (default)
+
+--- the destination prefix of the route. If \verb|TYPE| is omitted,
+\verb|ip| assumes type \verb|unicast|. Other values of \verb|TYPE|
+are listed above. \verb|PREFIX| is an IP or IPv6 address optionally followed
+by a slash and the prefix length. If the length of the prefix is missing,
+\verb|ip| assumes a full-length host route. There is also a special
+\verb|PREFIX| --- \verb|default| --- which is equivalent to IP \verb|0/0| or
+to IPv6 \verb|::/0|.
+
+\item \verb|tos TOS| or \verb|dsfield TOS|
+
+--- the Type Of Service (TOS) key. This key has no associated mask and
+the longest match is understood as: First, compare the TOS
+of the route and of the packet. If they are not equal, then the packet
+may still match a route with a zero TOS. \verb|TOS| is either an 8 bit hexadecimal
+number or an identifier from {\tt /etc/iproute2/rt\_dsfield}.
+
+
+\item \verb|metric NUMBER| or \verb|preference NUMBER|
+
+--- the preference value of the route. \verb|NUMBER| is an arbitrary 32bit number.
+
+\item \verb|table TABLEID|
+
+--- the table to add this route to.
+\verb|TABLEID| may be a number or a string from the file
+\verb|/etc/iproute2/rt_tables|. If this parameter is omitted,
+\verb|ip| assumes the \verb|main| table, with the exception of
+\verb|local|, \verb|broadcast| and \verb|nat| routes, which are
+put into the \verb|local| table by default.
+
+\item \verb|dev NAME|
+
+--- the output device name.
+
+\item \verb|via ADDRESS|
+
+--- the address of the nexthop router. Actually, the sense of this field depends
+on the route type. For normal \verb|unicast| routes it is either the true nexthop
+router or, if it is a direct route installed in BSD compatibility mode,
+it can be a local address of the interface.
+For NAT routes it is the first address of the block of translated IP destinations.
+
+\item \verb|src ADDRESS|
+
+--- the source address to prefer when sending to the destinations
+covered by the route prefix.
+
+\item \verb|realm REALMID|
+
+--- the realm to which this route is assigned.
+\verb|REALMID| may be a number or a string from the file
+\verb|/etc/iproute2/rt_realms|. Sec.\ref{RT-REALMS} (p.\pageref{RT-REALMS})
+contains more information on realms.
+
+\item \verb|mtu MTU| or \verb|mtu lock MTU|
+
+--- the MTU along the path to the destination. If the modifier \verb|lock| is
+not used, the MTU may be updated by the kernel due to Path MTU Discovery.
+If the modifier \verb|lock| is used, no path MTU discovery will be tried,
+all packets will be sent without the DF bit in IPv4 case
+or fragmented to MTU for IPv6.
+
+\item \verb|window NUMBER|
+
+--- the maximal window for TCP to advertise to these destinations,
+measured in bytes. It limits maximal data bursts that our TCP
+peers are allowed to send to us.
+
+\item \verb|rtt NUMBER|
+
+--- the initial RTT (``Round Trip Time'') estimate.
+
+
+\item \verb|rttvar NUMBER|
+
+--- \threeonly the initial RTT variance estimate.
+
+
+\item \verb|ssthresh NUMBER|
+
+--- \threeonly an estimate for the initial slow start threshold.
+
+
+\item \verb|cwnd NUMBER|
+
+--- \threeonly the clamp for congestion window. It is ignored if the \verb|lock|
+ flag is not used.
+
+
+\item \verb|advmss NUMBER|
+
+--- \threeonly the MSS (``Maximal Segment Size'') to advertise to these
+ destinations when establishing TCP connections. If it is not given,
+ Linux uses a default value calculated from the first hop device MTU.
+
+\begin{NB}
+ If the path to these destinations is asymmetric, this guess may be wrong.
+\end{NB}
+
+\item \verb|reordering NUMBER|
+
+--- \threeonly Maximal reordering on the path to this destination.
+ If it is not given, Linux uses the value selected with \verb|sysctl|
+ variable \verb|net/ipv4/tcp_reordering|.
+
+
+
+\item \verb|nexthop NEXTHOP|
+
+--- the nexthop of a multipath route. \verb|NEXTHOP| is a complex value
+with its own syntax similar to the top level argument lists:
+\begin{itemize}
+\item \verb|via ADDRESS| is the nexthop router.
+\item \verb|dev NAME| is the output device.
+\item \verb|weight NUMBER| is a weight for this element of a multipath
+route reflecting its relative bandwidth or quality.
+\end{itemize}
+
+\item \verb|scope SCOPE_VAL|
+
+--- the scope of the destinations covered by the route prefix.
+\verb|SCOPE_VAL| may be a number or a string from the file
+\verb|/etc/iproute2/rt_scopes|.
+If this parameter is omitted,
+\verb|ip| assumes scope \verb|global| for all gatewayed \verb|unicast|
+routes, scope \verb|link| for direct \verb|unicast| and \verb|broadcast| routes
+and scope \verb|host| for \verb|local| routes.
+
+\item \verb|protocol RTPROTO|
+
+--- the routing protocol identifier of this route.
+\verb|RTPROTO| may be a number or a string from the file
+\verb|/etc/iproute2/rt_protos|. If the routing protocol ID is
+not given, \verb|ip| assumes protocol \verb|boot| (i.e.\
+it assumes the route was added by someone who doesn't
+understand what they are doing). Several protocol values have a fixed interpretation.
+Namely:
+\begin{itemize}
+\item \verb|redirect| --- the route was installed due to an ICMP redirect.
+\item \verb|kernel| --- the route was installed by the kernel during
+autoconfiguration.
+\item \verb|boot| --- the route was installed during the bootup sequence.
+If a routing daemon starts, it will purge all of them.
+\item \verb|static| --- the route was installed by the administrator
+to override dynamic routing. Routing daemons will respect them
+and, probably, even advertise them to its peers.
+\item \verb|ra| --- the route was installed by the Router Discovery protocol.
+\end{itemize}
+The rest of the values are not reserved and the administrator is free
+to assign (or not to assign) protocol tags. At least, routing
+daemons should take care of setting some unique protocol values,
+f.e.\ as they are assigned in \verb|rtnetlink.h| or in \verb|rt_protos|
+database.
+
+
+\item \verb|onlink|
+
+--- pretend that the nexthop is directly attached to this link,
+even if it does not match any interface prefix. One application of this
+option may be found in~\cite{IP-TUNNELS}.
+
+\item \verb|equalize|
+
+--- allow packet by packet randomization on multipath routes.
+Without this modifier, the route will be frozen to one selected
+nexthop, so that load splitting will only occur on a per-flow basis.
+\verb|equalize| only works if the kernel is patched.
+
+
+\end{itemize}
+
+
+\begin{NB}
+ Actually there are more commands: \verb|prepend| does the same
+ thing as classic \verb|route add|, i.e.\ adds a route, even if another
+ route to the same destination exists. Its opposite case is \verb|append|,
+ which adds the route to the end of the list. Avoid these
+ features.
+\end{NB}
+\begin{NB}
+ More sad news, IPv6 only understands the \verb|append| command correctly.
+ All the others are translated into \verb|append| commands. Certainly,
+ this will change in the future.
+\end{NB}
+
+\paragraph{Examples:}
+\begin{itemize}
+\item add a plain route to network 10.0.0/24 via gateway 193.233.7.65
+\begin{verbatim}
+ ip route add 10.0.0/24 via 193.233.7.65
+\end{verbatim}
+\item change it to a direct route via the \verb|dummy| device
+\begin{verbatim}
+ ip ro chg 10.0.0/24 dev dummy
+\end{verbatim}
+\item add a default multipath route splitting the load between \verb|ppp0|
+and \verb|ppp1|
+\begin{verbatim}
+ ip route add default scope global nexthop dev ppp0 \
+ nexthop dev ppp1
+\end{verbatim}
+Note the scope value. It is not necessary but it informs the kernel
+that this route is gatewayed rather than direct. Actually, if you
+know the addresses of remote endpoints it would be better to use the
+\verb|via| parameter.
+\item announce that the address 192.203.80.144 is not a real one, but
+should be translated to 193.233.7.83 before forwarding
+\begin{verbatim}
+ ip route add nat 192.203.80.144 via 193.233.7.83
+\end{verbatim}
+Backward translation is set up with policy rules described
+in the following section (sec.\ref{IP-RULE}, p.\pageref{IP-RULE}).
+\end{itemize}
+
+\subsection{{\tt ip route delete} --- delete a route}
+
+\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|.
+
+\paragraph{Arguments:} \verb|ip route del| has the same arguments as
+\verb|ip route add|, but their semantics are a bit different.
+
+Key values (\verb|to|, \verb|tos|, \verb|preference| and \verb|table|)
+select the route to delete. If optional attributes are present, \verb|ip|
+verifies that they coincide with the attributes of the route to delete.
+If no route with the given key and attributes was found, \verb|ip route del|
+fails.
+\begin{NB}
+Linux-2.0 had the option to delete a route selected only by prefix address,
+ignoring its length (i.e.\ netmask). This option no longer exists
+because it was ambiguous. However, look at {\tt ip route flush}
+(sec.\ref{IP-ROUTE-FLUSH}, p.\pageref{IP-ROUTE-FLUSH}) which
+provides similar and even richer functionality.
+\end{NB}
+
+\paragraph{Example:}
+\begin{itemize}
+\item delete the multipath route created by the command in the previous subsection
+\begin{verbatim}
+ ip route del default scope global nexthop dev ppp0 \
+ nexthop dev ppp1
+\end{verbatim}
+\end{itemize}
+
+
+
+\subsection{{\tt ip route show} --- list routes}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
+
+\paragraph{Description:} the command displays the contents of the routing tables
+or the route(s) selected by some criteria.
+
+
+\paragraph{Arguments:}
+\begin{itemize}
+\item \verb|to SELECTOR| (default)
+
+--- only select routes from the given range of destinations. \verb|SELECTOR|
+consists of an optional modifier (\verb|root|, \verb|match| or \verb|exact|)
+and a prefix. \verb|root PREFIX| selects routes with prefixes not shorter
+than \verb|PREFIX|. F.e.\ \verb|root 0/0| selects the entire routing table.
+\verb|match PREFIX| selects routes with prefixes not longer than
+\verb|PREFIX|. F.e.\ \verb|match 10.0/16| selects \verb|10.0/16|,
+\verb|10/8| and \verb|0/0|, but it does not select \verb|10.1/16| and
+\verb|10.0.0/24|. And \verb|exact PREFIX| (or just \verb|PREFIX|)
+selects routes with this exact prefix. If none of these options
+is present, \verb|ip| assumes \verb|root 0/0| i.e.\ it lists the entire table.
+
+
+\item \verb|tos TOS| or \verb|dsfield TOS|
+
+ --- only select routes with the given TOS.
+
+
+\item \verb|table TABLEID|
+
+ --- show the routes from this table(s). The default setting is to show
+\verb|table| \verb|main|. \verb|TABLEID| may either be the ID of a real table
+or one of the special values:
+ \begin{itemize}
+ \item \verb|all| --- list all of the tables.
+ \item \verb|cache| --- dump the routing cache.
+ \end{itemize}
+\begin{NB}
+ IPv6 has a single table. However, splitting it into \verb|main|, \verb|local|
+ and \verb|cache| is emulated by the \verb|ip| utility.
+\end{NB}
+
+\item \verb|cloned| or \verb|cached|
+
+--- list cloned routes i.e.\ routes which were dynamically forked from
+other routes because some route attribute (f.e.\ MTU) was updated.
+Actually, it is equivalent to \verb|table cache|.
+
+\item \verb|from SELECTOR|
+
+--- the same syntax as for \verb|to|, but it binds the source address range
+rather than destinations. Note that the \verb|from| option only works with
+cloned routes.
+
+\item \verb|protocol RTPROTO|
+
+--- only list routes of this protocol.
+
+
+\item \verb|scope SCOPE_VAL|
+
+--- only list routes with this scope.
+
+\item \verb|type TYPE|
+
+--- only list routes of this type.
+
+\item \verb|dev NAME|
+
+--- only list routes going via this device.
+
+\item \verb|via PREFIX|
+
+--- only list routes going via the nexthop routers selected by \verb|PREFIX|.
+
+\item \verb|src PREFIX|
+
+--- only list routes with preferred source addresses selected
+by \verb|PREFIX|.
+
+\item \verb|realm REALMID| or \verb|realms FROMREALM/TOREALM|
+
+--- only list routes with these realms.
+
+\end{itemize}
+
+\paragraph{Examples:} Let us count routes of protocol \verb|gated/bgp|
+on a router:
+\begin{verbatim}
+kuznet@amber:~ $ ip ro ls proto gated/bgp | wc
+ 1413 9891 79010
+kuznet@amber:~ $
+\end{verbatim}
+To count the size of the routing cache, we have to use the \verb|-o| option
+because cached attributes can take more than one line of output:
+\begin{verbatim}
+kuznet@amber:~ $ ip -o ro ls cloned | wc
+ 159 2543 18707
+kuznet@amber:~ $
+\end{verbatim}
+
+
+\paragraph{Output format:} The output of this command consists
+of per route records separated by line feeds.
+However, some records may consist
+of more than one line: particularly, this is the case when the route
+is cloned or you requested additional statistics. If the
+\verb|-o| option was given, then line feeds separating lines inside
+records are replaced with the backslash sign.
+
+The output has the same syntax as arguments given to {\tt ip route add},
+so that it can be understood easily. F.e.\
+\begin{verbatim}
+kuznet@amber:~ $ ip ro ls 193.233.7/24
+193.233.7.0/24 dev eth0 proto gated/conn scope link \
+ src 193.233.7.65 realms inr.ac
+kuznet@amber:~ $
+\end{verbatim}
+
+If you list cloned entries, the output contains other attributes which
+are evaluated during route calculation and updated during route
+lifetime. An example of the output is:
+\begin{verbatim}
+kuznet@amber:~ $ ip ro ls 193.233.7.82 tab cache
+193.233.7.82 from 193.233.7.82 dev eth0 src 193.233.7.65 \
+ realms inr.ac/inr.ac
+ cache <src-direct,redirect> mtu 1500 rtt 300 iif eth0
+193.233.7.82 dev eth0 src 193.233.7.65 realms inr.ac
+ cache mtu 1500 rtt 300
+kuznet@amber:~ $
+\end{verbatim}
+\begin{NB}
+ \label{NB-strange-route}
+ The route looks a bit strange, doesn't it? Did you notice that
+ it is a path from 193.233.7.82 back to 193.233.7.82? Well, you will
+ see in the section on \verb|ip route get| (p.\pageref{NB-nature-of-strangeness})
+ how it appeared.
+\end{NB}
+The second line, starting with the word \verb|cache|, shows
+additional attributes which normal routes do not possess.
+Cached flags are summarized in angle brackets:
+\begin{itemize}
+\item \verb|local| --- packets are delivered locally.
+It stands for loopback unicast routes, for broadcast routes
+and for multicast routes, if this host is a member of the corresponding
+group.
+
+\item \verb|reject| --- the path is bad. Any attempt to use it results
+in an error. See attribute \verb|error| below (p.\pageref{IP-ROUTE-GET-error}).
+
+\item \verb|mc| --- the destination is multicast.
+
+\item \verb|brd| --- the destination is broadcast.
+
+\item \verb|src-direct| --- the source is on a directly connected
+interface.
+
+\item \verb|redirected| --- the route was created by an ICMP Redirect.
+
+\item \verb|redirect| --- packets going via this route will
+trigger an ICMP redirect.
+
+\item \verb|fastroute| --- the route is eligible to be used for fastroute.
+
+\item \verb|equalize| --- make packet by packet randomization
+along this path.
+
+\item \verb|dst-nat| --- the destination address requires translation.
+
+\item \verb|src-nat| --- the source address requires translation.
+
+\item \verb|masq| --- the source address requires masquerading.
+This feature disappeared in linux-2.4.
+
+\item \verb|notify| --- ({\em not implemented}) change/deletion
+of this route will trigger RTNETLINK notification.
+\end{itemize}
+
+Then some optional attributes follow:
+\begin{itemize}
+\item \verb|error| --- on \verb|reject| routes it is the error code
+returned to local senders when they try to use this route.
+These error codes are translated into ICMP error codes, sent to remote
+senders, according to the rules described above in the subsection
+devoted to route types (p.\pageref{IP-ROUTE-TYPES}).
+\label{IP-ROUTE-GET-error}
+
+\item \verb|expires| --- this entry will expire after this timeout.
+
+\item \verb|iif| --- the packets for this path are expected to arrive
+on this interface.
+\end{itemize}
+
+\paragraph{Statistics:} With the \verb|-statistics| option, more
+information about this route is shown:
+\begin{itemize}
+\item \verb|users| --- the number of users of this entry.
+\item \verb|age| --- shows when this route was last used.
+\item \verb|used| --- the number of lookups of this route since its creation.
+\end{itemize}
+
+
+\subsection{{\tt ip route flush} --- flush routing tables}
+\label{IP-ROUTE-FLUSH}
+
+\paragraph{Abbreviations:} \verb|flush|, \verb|f|.
+
+\paragraph{Description:} this command flushes routes selected
+by some criteria.
+
+\paragraph{Arguments:} the arguments have the same syntax and semantics
+as the arguments of \verb|ip route show|, but routing tables are not
+listed but purged. The only difference is the default action: \verb|show|
+dumps the entire IP main routing table but \verb|flush| prints the helper page.
+The reason for this difference does not require any explanation, does it?
+
+
+\paragraph{Statistics:} With the \verb|-statistics| option, the command
+becomes verbose. It prints out the number of deleted routes and the number
+of rounds made to flush the routing table. If the option is given
+twice, \verb|ip route flush| also dumps all the deleted routes
+in the format described in the previous subsection.
+
+\paragraph{Examples:} The first example flushes all the
+gatewayed routes from the main table (f.e.\ after a routing daemon crash).
+\begin{verbatim}
+netadm@amber:~ # ip -4 ro flush scope global type unicast
+\end{verbatim}
+This option deserves to be put into a scriptlet \verb|routef|.
+\begin{NB}
+This option was described in the \verb|route(8)| man page borrowed
+from BSD, but was never implemented in Linux.
+\end{NB}
+
+The second example flushes all IPv6 cloned routes:
+\begin{verbatim}
+netadm@amber:~ # ip -6 -s -s ro flush cache
+3ffe:2400::220:afff:fef4:c5d1 via 3ffe:2400::220:afff:fef4:c5d1 \
+ dev eth0 metric 0
+ cache used 2 age 12sec mtu 1500 rtt 300
+3ffe:2400::280:adff:feb7:8034 via 3ffe:2400::280:adff:feb7:8034 \
+ dev eth0 metric 0
+ cache used 2 age 15sec mtu 1500 rtt 300
+3ffe:2400::280:c8ff:fe59:5bcc via 3ffe:2400::280:c8ff:fe59:5bcc \
+ dev eth0 metric 0
+ cache users 1 used 1 age 23sec mtu 1500 rtt 300
+3ffe:2400:0:1:2a0:ccff:fe66:1878 via 3ffe:2400:0:1:2a0:ccff:fe66:1878 \
+ dev eth1 metric 0
+ cache used 2 age 20sec mtu 1500 rtt 300
+3ffe:2400:0:1:a00:20ff:fe71:fb30 via 3ffe:2400:0:1:a00:20ff:fe71:fb30 \
+ dev eth1 metric 0
+ cache used 2 age 33sec mtu 1500 rtt 300
+ff02::1 via ff02::1 dev eth1 metric 0
+ cache users 1 used 1 age 45sec mtu 1500 rtt 300
+
+*** Round 1, deleting 6 entries ***
+*** Flush is complete after 1 round ***
+netadm@amber:~ # ip -6 -s -s ro flush cache
+Nothing to flush.
+netadm@amber:~ #
+\end{verbatim}
+
+The third example flushes BGP routing tables after a \verb|gated|
+death.
+\begin{verbatim}
+netadm@amber:~ # ip ro ls proto gated/bgp | wc
+ 1408 9856 78730
+netadm@amber:~ # ip -s ro f proto gated/bgp
+
+*** Round 1, deleting 1408 entries ***
+*** Flush is complete after 1 round ***
+netadm@amber:~ # ip ro f proto gated/bgp
+Nothing to flush.
+netadm@amber:~ # ip ro ls proto gated/bgp
+netadm@amber:~ #
+\end{verbatim}
+
+
+\subsection{{\tt ip route get} --- get a single route}
+\label{IP-ROUTE-GET}
+
+\paragraph{Abbreviations:} \verb|get|, \verb|g|.
+
+\paragraph{Description:} this command gets a single route to a destination
+and prints its contents exactly as the kernel sees it.
+
+\paragraph{Arguments:}
+\begin{itemize}
+\item \verb|to ADDRESS| (default)
+
+--- the destination address.
+
+\item \verb|from ADDRESS|
+
+--- the source address.
+
+\item \verb|tos TOS| or \verb|dsfield TOS|
+
+--- the Type Of Service.
+
+\item \verb|iif NAME|
+
+--- the device from which this packet is expected to arrive.
+
+\item \verb|oif NAME|
+
+--- force the output device on which this packet will be routed.
+
+\item \verb|connected|
+
+--- if no source address (option \verb|from|) was given, relookup
+the route with the source set to the preferred address received from the first lookup.
+If policy routing is used, it may be a different route.
+
+\end{itemize}
+
+Note that this operation is not equivalent to \verb|ip route show|.
+\verb|show| shows existing routes. \verb|get| resolves them and
+creates new clones if necessary. Essentially, \verb|get|
+is equivalent to sending a packet along this path.
+If the \verb|iif| argument is not given, the kernel creates a route
+to output packets towards the requested destination.
+This is equivalent to pinging the destination
+with a subsequent {\tt ip route ls cache}, however, no packets are
+actually sent. With the \verb|iif| argument, the kernel pretends
+that a packet arrived from this interface and searches for
+a path to forward the packet.
+
+\paragraph{Output format:} This command outputs routes in the same
+format as \verb|ip route ls|.
+
+\paragraph{Examples:}
+\begin{itemize}
+\item Find a route to output packets to 193.233.7.82:
+\begin{verbatim}
+kuznet@amber:~ $ ip route get 193.233.7.82
+193.233.7.82 dev eth0 src 193.233.7.65 realms inr.ac
+ cache mtu 1500 rtt 300
+kuznet@amber:~ $
+\end{verbatim}
+
+\item Find a route to forward packets arriving on \verb|eth0|
+from 193.233.7.82 and destined for 193.233.7.82:
+\begin{verbatim}
+kuznet@amber:~ $ ip r g 193.233.7.82 from 193.233.7.82 iif eth0
+193.233.7.82 from 193.233.7.82 dev eth0 src 193.233.7.65 \
+ realms inr.ac/inr.ac
+ cache <src-direct,redirect> mtu 1500 rtt 300 iif eth0
+kuznet@amber:~ $
+\end{verbatim}
+\begin{NB}
+ \label{NB-nature-of-strangeness}
+ This is the command that created the funny route from 193.233.7.82
+ looped back to 193.233.7.82 (cf.\ NB on~p.\pageref{NB-strange-route}).
+ Note the \verb|redirect| flag on it.
+\end{NB}
+
+\item Find a multicast route for packets arriving on \verb|eth0|
+from host 193.233.7.82 and destined for multicast group 224.2.127.254
+(it is assumed that a multicast routing daemon is running.
+In this case, it is \verb|pimd|)
+\begin{verbatim}
+kuznet@amber:~ $ ip r g 224.2.127.254 from 193.233.7.82 iif eth0
+multicast 224.2.127.254 from 193.233.7.82 dev lo \
+ src 193.233.7.65 realms inr.ac/cosmos
+ cache <mc> iif eth0 Oifs: eth1 pimreg
+kuznet@amber:~ $
+\end{verbatim}
+This route differs from the ones seen before. It contains a ``normal'' part
+and a ``multicast'' part. The normal part is used to deliver (or not to
+deliver) the packet to local IP listeners. In this case the router
+is not a member
+of this group, so that route has no \verb|local| flag and only
+forwards packets. The output device for such entries is always loopback.
+The multicast part consists of an additional \verb|Oifs:| list showing
+the output interfaces.
+\end{itemize}
+
+
+It is time for a more complicated example. Let us add an invalid
+gatewayed route for a destination which is really directly connected:
+\begin{verbatim}
+netadm@alisa:~ # ip route add 193.233.7.98 via 193.233.7.254
+netadm@alisa:~ # ip route get 193.233.7.98
+193.233.7.98 via 193.233.7.254 dev eth0 src 193.233.7.90
+ cache mtu 1500 rtt 3072
+netadm@alisa:~ #
+\end{verbatim}
+and probe it with ping:
+\begin{verbatim}
+netadm@alisa:~ # ping -n 193.233.7.98
+PING 193.233.7.98 (193.233.7.98) from 193.233.7.90 : 56 data bytes
+From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98)
+64 bytes from 193.233.7.98: icmp_seq=0 ttl=255 time=3.5 ms
+From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98)
+64 bytes from 193.233.7.98: icmp_seq=1 ttl=255 time=2.2 ms
+64 bytes from 193.233.7.98: icmp_seq=2 ttl=255 time=0.4 ms
+64 bytes from 193.233.7.98: icmp_seq=3 ttl=255 time=0.4 ms
+64 bytes from 193.233.7.98: icmp_seq=4 ttl=255 time=0.4 ms
+^C
+--- 193.233.7.98 ping statistics ---
+5 packets transmitted, 5 packets received, 0% packet loss
+round-trip min/avg/max = 0.4/1.3/3.5 ms
+netadm@alisa:~ #
+\end{verbatim}
+What happened? Router 193.233.7.254 understood that we have a much
+better path to the destination and sent us an ICMP redirect message.
+We may retry \verb|ip route get| to see what we have in the routing
+tables now:
+\begin{verbatim}
+netadm@alisa:~ # ip route get 193.233.7.98
+193.233.7.98 dev eth0 src 193.233.7.90
+ cache <redirected> mtu 1500 rtt 3072
+netadm@alisa:~ #
+\end{verbatim}
+
+
+
+\section{{\tt ip rule} --- routing policy database management}
+\label{IP-RULE}
+
+\paragraph{Abbreviations:} \verb|rule|, \verb|ru|.
+
+\paragraph{Object:} \verb|rule|s in the routing policy database control
+the route selection algorithm.
+
+Classic routing algorithms used in the Internet make routing decisions
+based only on the destination address of packets (and in theory,
+but not in practice, on the TOS field). The seminal review of classic
+routing algorithms and their modifications can be found in~\cite{RFC1812}.
+
+In some circumstances we want to route packets differently depending not only
+on destination addresses, but also on other packet fields: source address,
+IP protocol, transport protocol ports or even packet payload.
+This task is called ``policy routing''.
+
+\begin{NB}
+ ``policy routing'' $\neq$ ``routing policy''.
+
+\noindent ``policy routing'' $=$ ``cunning routing''.
+
+\noindent ``routing policy'' $=$ ``routing tactics'' or ``routing plan''.
+\end{NB}
+
+To solve this task, the conventional destination based routing table, ordered
+according to the longest match rule, is replaced with a ``routing policy
+database'' (or RPDB), which selects routes
+by executing some set of rules. The rules may have lots of keys of different
+natures and therefore they have no natural ordering, but one imposed
+by the administrator. Linux-2.2 RPDB is a linear list of rules
+ordered by numeric priority value.
+RPDB explicitly allows matching a few packet fields:
+
+\begin{itemize}
+\item packet source address.
+\item packet destination address.
+\item TOS.
+\item incoming interface (which is packet metadata, rather than a packet field).
+\end{itemize}
+
+Matching IP protocols and transport ports is also possible,
+indirectly, via \verb|ipchains|, by exploiting their ability
+to mark some classes of packets with \verb|fwmark|. Therefore,
+\verb|fwmark| is also included in the set of keys checked by rules.
+
+Each policy routing rule consists of a {\em selector\/} and an {\em action\/}
+predicate. The RPDB is scanned in the order of increasing priority. The selector
+of each rule is applied to \{source address, destination address, incoming
+interface, tos, fwmark\} and, if the selector matches the packet,
+the action is performed. The action predicate may return with success.
+In this case, it will either give a route or failure indication
+and the RPDB lookup is terminated. Otherwise, the RPDB program
+continues on the next rule.
+
+What is the action, semantically? The natural action is to select the
+nexthop and the output device. This is what
+Cisco IOS~\cite{IOS} does. Let us call it ``match \& set''.
+The Linux-2.2 approach is more flexible. The action includes
+lookups in destination-based routing tables and selecting
+a route from these tables according to the classic longest match algorithm.
+The ``match \& set'' approach is the simplest case of the Linux one. It is realized
+when a second level routing table contains a single default route.
+Recall that Linux-2.2 supports multiple tables
+managed with the \verb|ip route| command, described in the previous section.
+
+At startup time the kernel configures the default RPDB consisting of three
+rules:
+
+\begin{enumerate}
+\item Priority: 0, Selector: match anything, Action: lookup routing
+table \verb|local| (ID 255).
+The \verb|local| table is a special routing table containing
+high priority control routes for local and broadcast addresses.
+
+Rule 0 is special. It cannot be deleted or overridden.
+
+
+\item Priority: 32766, Selector: match anything, Action: lookup routing
+table \verb|main| (ID 254).
+The \verb|main| table is the normal routing table containing all non-policy
+routes. This rule may be deleted and/or overridden with other
+ones by the administrator.
+
+\item Priority: 32767, Selector: match anything, Action: lookup routing
+table \verb|default| (ID 253).
+The \verb|default| table is empty. It is reserved for some
+post-processing if no previous default rules selected the packet.
+This rule may also be deleted.
+
+\end{enumerate}
+
+Do not confuse routing tables with rules: rules point to routing tables,
+several rules may refer to one routing table and some routing tables
+may have no rules pointing to them. If the administrator deletes all the rules
+referring to a table, the table is not used, but it still exists
+and will disappear only after all the routes contained in it are deleted.
+
+
+\paragraph{Rule attributes:} Each RPDB entry has additional
+attributes. F.e.\ each rule has a pointer to some routing
+table. NAT and masquerading rules have an attribute to select a new IP
+address to translate/masquerade. Besides that, rules have some
+optional attributes, which routes have, namely \verb|realms|.
+These values do not override those contained in the routing tables. They
+are only used if the route did not select any attributes.
+
+
+\paragraph{Rule types:} The RPDB may contain rules of the following
+types:
+\begin{itemize}
+\item \verb|unicast| --- the rule prescribes to return the route found
+in the routing table referenced by the rule.
+\item \verb|blackhole| --- the rule prescribes to silently drop the packet.
+\item \verb|unreachable| --- the rule prescribes to generate a ``Network
+is unreachable'' error.
+\item \verb|prohibit| --- the rule prescribes to generate a
+``Communication is administratively prohibited'' error.
+\item \verb|nat| --- the rule prescribes to translate the source address
+of the IP packet into some other value. More about NAT is
+in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}.
+\end{itemize}
+
+
+\paragraph{Commands:} \verb|add|, \verb|delete| and \verb|show|
+(or \verb|list|).
+
+\subsection{{\tt ip rule add} --- insert a new rule\\
+ {\tt ip rule delete} --- delete a rule}
+\label{IP-RULE-ADD}
+
+\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|,
+ \verb|d|.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+\item \verb|type TYPE| (default)
+
+--- the type of this rule. The list of valid types was given in the previous
+subsection.
+
+\item \verb|from PREFIX|
+
+--- select the source prefix to match.
+
+\item \verb|to PREFIX|
+
+--- select the destination prefix to match.
+
+\item \verb|iif NAME|
+
+--- select the incoming device to match. If the interface is loopback,
+the rule only matches packets originating from this host. This means that you
+may create separate routing tables for forwarded and local packets and,
+hence, completely segregate them.
+
+\item \verb|tos TOS| or \verb|dsfield TOS|
+
+--- select the TOS value to match.
+
+\item \verb|fwmark MARK|
+
+--- select the \verb|fwmark| value to match.
+
+\item \verb|priority PREFERENCE|
+
+--- the priority of this rule. Each rule should have an explicitly
+set {\em unique\/} priority value.
+\begin{NB}
+ Really, for historical reasons \verb|ip rule add| does not require a
+ priority value and allows them to be non-unique.
+ If the user does not supply a priority, it is selected by the kernel.
+ If the user creates a rule with a priority value that
+ already exists, the kernel does not reject the request. It adds
+ the new rule before all old rules of the same priority.
+
+ It is a mistake in design, no more. And it will be fixed one day,
+ so do not rely on this feature. Use explicit priorities.
+\end{NB}
+
+
+\item \verb|table TABLEID|
+
+--- the routing table identifier to lookup if the rule selector matches.
+
+\item \verb|realms FROM/TO|
+
+--- Realms to select if the rule matched and the routing table lookup
+succeeded. Realm \verb|TO| is only used if the route did not select
+any realm.
+
+\item \verb|nat ADDRESS|
+
+--- The base of the IP address block to translate (for source addresses).
+The \verb|ADDRESS| may be either the start of the block of NAT addresses
+(selected by NAT routes) or in linux-2.2 a local host address (or even zero).
+In the last case the router does not translate the packets,
+but masquerades them to this address; this feature disappeared in 2.4.
+More about NAT is in Appendix~\ref{ROUTE-NAT},
+p.\pageref{ROUTE-NAT}.
+
+\end{itemize}
+
+\paragraph{Warning:} Changes to the RPDB made with these commands
+do not become active immediately. It is assumed that after
+a script finishes a batch of updates, it flushes the routing cache
+with \verb|ip route flush cache|.
+
+\paragraph{Examples:}
+\begin{itemize}
+\item Route packets with source addresses from 192.203.80/24
+according to routing table \verb|inr.ruhep|:
+\begin{verbatim}
+ip ru add from 192.203.80.0/24 table inr.ruhep prio 220
+\end{verbatim}
+
+\item Translate packet source address 193.233.7.83 into 192.203.80.144
+and route it according to table \#1 (actually, it is \verb|inr.ruhep|):
+\begin{verbatim}
+ip ru add from 193.233.7.83 nat 192.203.80.144 table 1 prio 320
+\end{verbatim}
+
+\item Delete the unused default rule:
+\begin{verbatim}
+ip ru del prio 32767
+\end{verbatim}
+
+\end{itemize}
+
+
+
+\subsection{{\tt ip rule show} --- list rules}
+\label{IP-RULE-SHOW}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
+
+
+\paragraph{Arguments:} Good news, this is one command that has no arguments.
+
+\paragraph{Output format:}
+
+\begin{verbatim}
+kuznet@amber:~ $ ip ru ls
+0: from all lookup local
+200: from 192.203.80.0/24 to 193.233.7.0/24 lookup main
+210: from 192.203.80.0/24 to 192.203.80.0/24 lookup main
+220: from 192.203.80.0/24 lookup inr.ruhep realms inr.ruhep/radio-msu
+300: from 193.233.7.83 to 193.233.7.0/24 lookup main
+310: from 193.233.7.83 to 192.203.80.0/24 lookup main
+320: from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144
+32766: from all lookup main
+kuznet@amber:~ $
+\end{verbatim}
+
+In the first column is the rule priority value followed
+by a colon. Then the selectors follow. Each key is prefixed
+with the same keyword that was used to create the rule.
+
+The keyword \verb|lookup| is followed by a routing table identifier,
+as it is recorded in the file \verb|/etc/iproute2/rt_tables|.
+
+If the rule does NAT (f.e.\ rule \#320), it is shown by the keyword
+\verb|map-to| followed by the start of the block of addresses to map.
+
+The sense of this example is pretty simple. The prefixes
+192.203.80.0/24 and 193.233.7.0/24 form the internal network, but
+they are routed differently when the packets leave it.
+Besides that, the host 193.233.7.83 is translated into
+another prefix to look like 192.203.80.144 when talking
+to the outer world.
+
+
+
+\section{{\tt ip maddress} --- multicast addresses management}
+\label{IP-MADDR}
+
+\paragraph{Object:} \verb|maddress| objects are multicast addresses.
+
+\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|show| (or \verb|list|).
+
+\subsection{{\tt ip maddress show} --- list multicast addresses}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+
+\item \verb|dev NAME| (default)
+
+--- the device name.
+
+\end{itemize}
+
+\paragraph{Output format:}
+
+\begin{verbatim}
+kuznet@alisa:~ $ ip maddr ls dummy
+2: dummy
+ link 33:33:00:00:00:01
+ link 01:00:5e:00:00:01
+ inet 224.0.0.1 users 2
+ inet6 ff02::1
+kuznet@alisa:~ $
+\end{verbatim}
+
+The first line of the output shows the interface index and its name.
+Then the multicast address list follows. Each line starts with the
+protocol identifier. The word \verb|link| denotes a link layer
+multicast address.
+
+If a multicast address has more than one user, the number
+of users is shown after the \verb|users| keyword.
+
+One additional feature not present in the example above
+is the \verb|static| flag, which indicates that the address was joined
+with \verb|ip maddr add|. See the following subsection.
+
+
+
+\subsection{{\tt ip maddress add} --- add a multicast address\\
+ {\tt ip maddress delete} --- delete a multicast address}
+
+\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|, \verb|d|.
+
+\paragraph{Description:} these commands attach/detach
+a static link layer multicast address to listen on the interface.
+Note that it is impossible to join protocol multicast groups
+statically. This command only manages link layer addresses.
+
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+\item \verb|address LLADDRESS| (default)
+
+--- the link layer multicast address.
+
+\item \verb|dev NAME|
+
+--- the device to join/leave this multicast address.
+
+\end{itemize}
+
+
+\paragraph{Example:} Let us continue with the example from the previous subsection.
+
+\begin{verbatim}
+netadm@alisa:~ # ip maddr add 33:33:00:00:00:01 dev dummy
+netadm@alisa:~ # ip -0 maddr ls dummy
+2: dummy
+ link 33:33:00:00:00:01 users 2 static
+ link 01:00:5e:00:00:01
+netadm@alisa:~ # ip maddr del 33:33:00:00:00:01 dev dummy
+\end{verbatim}
+
+\begin{NB}
+ Neither \verb|ip| nor the kernel checks for multicast address validity.
+ Particularly, this means that you can try to load a unicast address
+ instead of a multicast address. Most drivers will ignore such addresses,
+ but several (f.e.\ Tulip) will load them into their on-board filter.
+ The effects may be strange. Namely, the addresses become additional
+ local link addresses and, if you loaded the address of another host
+ to the router, wait for duplicated packets on the wire.
+ It is not a bug, but rather a hole in the API and intra-kernel interfaces.
+ This feature is really more useful for traffic monitoring, but using it
+ with Linux-2.2 you {\em have to\/} be sure that the host is not
+ a router and, especially, that it is not a transparent proxy or masquerading
+ agent.
+\end{NB}
+
+
+
+\section{{\tt ip mroute} --- multicast routing cache management}
+\label{IP-MROUTE}
+
+\paragraph{Abbreviations:} \verb|mroute|, \verb|mr|.
+
+\paragraph{Object:} \verb|mroute| objects are multicast routing cache
+entries created by a user level mrouting daemon
+(f.e.\ \verb|pimd| or \verb|mrouted|).
+
+Due to the limitations of the current interface to the multicast routing
+engine, it is impossible to change \verb|mroute| objects administratively,
+so we may only display them. This limitation will be removed
+in the future.
+
+\paragraph{Commands:} \verb|show| (or \verb|list|).
+
+
+\subsection{{\tt ip mroute show} --- list mroute cache entries}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+\item \verb|to PREFIX| (default)
+
+--- the prefix selecting the destination multicast addresses to list.
+
+
+\item \verb|iif NAME|
+
+--- the interface on which multicast packets are received.
+
+
+\item \verb|from PREFIX|
+
+--- the prefix selecting the IP source addresses of the multicast route.
+
+
+\end{itemize}
+
+\paragraph{Output format:}
+
+\begin{verbatim}
+kuznet@amber:~ $ ip mroute ls
+(193.232.127.6, 224.0.1.39) Iif: unresolved
+(193.232.244.34, 224.0.1.40) Iif: unresolved
+(193.233.7.65, 224.66.66.66) Iif: eth0 Oifs: pimreg
+kuznet@amber:~ $
+\end{verbatim}
+
+Each line shows one (S,G) entry in the multicast routing cache,
+where S is the source address and G is the multicast group. \verb|Iif| is
+the interface on which multicast packets are expected to arrive.
+If the word \verb|unresolved| is there instead of the interface name,
+it means that the routing daemon still hasn't resolved this entry.
+The keyword \verb|Oifs| is followed by a list of output interfaces, separated
+by spaces. If a multicast routing entry is created with non-trivial
+TTL scope, administrative distances are appended to the device names
+in the \verb|Oifs| list.
+
+\paragraph{Statistics:} The \verb|-statistics| option also prints the
+number of packets and bytes forwarded along this route and
+the number of packets that arrived on the wrong interface, if this number is not zero.
+
+\begin{verbatim}
+kuznet@amber:~ $ ip -s mr ls 224.66/16
+(193.233.7.65, 224.66.66.66) Iif: eth0 Oifs: pimreg
+ 9383 packets, 300256 bytes
+kuznet@amber:~ $
+\end{verbatim}
+
+
+\section{{\tt ip tunnel} --- tunnel configuration}
+\label{IP-TUNNEL}
+
+\paragraph{Abbreviations:} \verb|tunnel|, \verb|tunl|.
+
+\paragraph{Object:} \verb|tunnel| objects are tunnels, encapsulating
+packets in IPv4 packets and then sending them over the IP infrastructure.
+
+\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|change|, \verb|show|
+(or \verb|list|).
+
+\paragraph{See also:} A more informal discussion of tunneling
+over IP and the \verb|ip tunnel| command can be found in~\cite{IP-TUNNELS}.
+
+\subsection{{\tt ip tunnel add} --- add a new tunnel\\
+ {\tt ip tunnel change} --- change an existing tunnel\\
+ {\tt ip tunnel delete} --- destroy a tunnel}
+
+\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|;
+\verb|delete|, \verb|del|, \verb|d|.
+
+
+\paragraph{Arguments:}
+
+\begin{itemize}
+
+\item \verb|name NAME| (default)
+
+--- select the tunnel device name.
+
+\item \verb|mode MODE|
+
+--- set the tunnel mode. Three modes are currently available:
+ \verb|ipip|, \verb|sit| and \verb|gre|.
+
+\item \verb|remote ADDRESS|
+
+--- set the remote endpoint of the tunnel.
+
+\item \verb|local ADDRESS|
+
+--- set the fixed local address for tunneled packets.
+It must be an address on another interface of this host.
+
+\item \verb|ttl N|
+
+--- set a fixed TTL \verb|N| on tunneled packets.
+ \verb|N| is a number in the range 1--255. 0 is a special value
+ meaning that packets inherit the TTL value.
+ The default value is: \verb|inherit|.
+
+\item \verb|tos T| or \verb|dsfield T|
+
+--- set a fixed TOS \verb|T| on tunneled packets.
+ The default value is: \verb|inherit|.
+
+
+
+\item \verb|dev NAME|
+
+--- bind the tunnel to the device \verb|NAME| so that
+ tunneled packets will only be routed via this device and will
+ not be able to escape to another device when the route to endpoint changes.
+
+\item \verb|nopmtudisc|
+
+--- disable Path MTU Discovery on this tunnel.
+ It is enabled by default. Note that a fixed ttl is incompatible
+ with this option: tunnelling with a fixed ttl always performs pmtu discovery.
+
+\item \verb|key K|, \verb|ikey K|, \verb|okey K|
+
+--- (only GRE tunnels) use keyed GRE with key \verb|K|. \verb|K| is
+ either a number or an IP address-like dotted quad.
+ The \verb|key| parameter sets the key to use in both directions.
+ The \verb|ikey| and \verb|okey| parameters set different keys for input and output.
+
+
+\item \verb|csum|, \verb|icsum|, \verb|ocsum|
+
+--- (only GRE tunnels) generate/require checksums for tunneled packets.
+ The \verb|ocsum| flag calculates checksums for outgoing packets.
+ The \verb|icsum| flag requires that all input packets have the correct
+ checksum. The \verb|csum| flag is equivalent to the combination
+ ``\verb|icsum| \verb|ocsum|''.
+
+\item \verb|seq|, \verb|iseq|, \verb|oseq|
+
+--- (only GRE tunnels) serialize packets.
+ The \verb|oseq| flag enables sequencing of outgoing packets.
+ The \verb|iseq| flag requires that all input packets are serialized.
+ The \verb|seq| flag is equivalent to the combination ``\verb|iseq| \verb|oseq|''.
+
+\begin{NB}
+ I think this option does not
+ work. At least, I did not test it, did not debug it and
+ do not even understand how it is supposed to work or for what
+ purpose Cisco planned to use it. Do not use it.
+\end{NB}
+
+
+\end{itemize}
+
+\paragraph{Example:} Create a point-to-point IPv6 tunnel with a maximal TTL of 32.
+\begin{verbatim}
+netadm@amber:~ # ip tunl add Cisco mode sit remote 192.31.7.104 \
+ local 192.203.80.142 ttl 32
+\end{verbatim}
+
+\subsection{{\tt ip tunnel show} --- list tunnels}
+
+\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
+
+
+\paragraph{Arguments:} None.
+
+\paragraph{Output format:}
+\begin{verbatim}
+kuznet@amber:~ $ ip tunl ls Cisco
+Cisco: ipv6/ip remote 192.31.7.104 local 192.203.80.142 ttl 32
+kuznet@amber:~ $
+\end{verbatim}
+The line starts with the tunnel device name followed by a colon.
+Then the tunnel mode follows. The parameters of the tunnel are listed
+with the same keywords that were used when creating the tunnel.
+
+\paragraph{Statistics:}
+
+\begin{verbatim}
+kuznet@amber:~ $ ip -s tunl ls Cisco
+Cisco: ipv6/ip remote 192.31.7.104 local 192.203.80.142 ttl 32
+RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts
+ 12566 1707516 0 0 0 0
+TX: Packets Bytes Errors DeadLoop NoRoute NoBufs
+ 13445 1879677 0 0 0 0
+kuznet@amber:~ $
+\end{verbatim}
+Essentially, these numbers are the same as the numbers
+printed with {\tt ip -s link show}
+(sec.\ref{IP-LINK-SHOW}, p.\pageref{IP-LINK-SHOW}) but the tags are different
+to reflect that they are tunnel specific.
+\begin{itemize}
+\item \verb|CsumErrs| --- the total number of packets dropped
+because of checksum failures for a GRE tunnel with checksumming enabled.
+\item \verb|OutOfSeq| --- the total number of packets dropped
+because they arrived out of sequence for a GRE tunnel with
+serialization enabled.
+\item \verb|Mcasts| --- the total number of multicast packets
+received on a broadcast GRE tunnel.
+\item \verb|DeadLoop| --- the total number of packets which were not
+transmitted because the tunnel is looped back to itself.
+\item \verb|NoRoute| --- the total number of packets which were not
+transmitted because there is no IP route to the remote endpoint.
+\item \verb|NoBufs| --- the total number of packets which were not
+transmitted because the kernel failed to allocate a buffer.
+\end{itemize}
+
+
+\section{{\tt ip monitor} and {\tt rtmon} --- state monitoring}
+\label{IP-MONITOR}
+
+The \verb|ip| utility can monitor the state of devices, addresses
+and routes continuously. This option has a slightly different format.
+Namely,
+the \verb|monitor| command is the first in the command line and then
+the object list follows:
+\begin{verbatim}
+ ip monitor [ file FILE ] [ all | OBJECT-LIST ]
+\end{verbatim}
+\verb|OBJECT-LIST| is the list of object types that we want to monitor.
+It may contain \verb|link|, \verb|address| and \verb|route|.
+If no \verb|file| argument is given, \verb|ip| opens RTNETLINK,
+listens on it and dumps state changes in the format described
+in previous sections.
+
+If a file name is given, it does not listen on RTNETLINK,
+but opens the file containing RTNETLINK messages saved in binary format
+and dumps them. Such a history file can be generated with the
+\verb|rtmon| utility. This utility has a command line syntax similar to
+\verb|ip monitor|.
+Ideally, \verb|rtmon| should be started before
+the first network configuration command is issued. F.e.\ if
+you insert:
+\begin{verbatim}
+ rtmon file /var/log/rtmon.log
+\end{verbatim}
+in a startup script, you will be able to view the full history
+later.
+
+Certainly, it is possible to start \verb|rtmon| at any time.
+It prepends the history with the state snapshot dumped at the moment
+of starting.
+
+
+\section{Route realms and policy propagation, {\tt rtacct}}
+\label{RT-REALMS}
+
+On routers using OSPF ASE or, especially, the BGP protocol, routing
+tables may be huge. If we want to classify or to account for the packets
+per route, we will have to keep lots of information. Even worse, if we
+want to distinguish the packets not only by their destination, but
+also by their source, the task gets quadratic complexity and its solution
+is physically impossible.
+
+One approach to propagating the policy from routing protocols
+to the forwarding engine has been proposed in~\cite{IOS-BGP-PP}.
+Essentially, Cisco Policy Propagation via BGP is based on the fact
+that dedicated routers all have the RIB (Routing Information Base)
+close to the forwarding engine, so policy routing rules can
+check all the route attributes, including ASPATH information
+and community strings.
+
+The Linux architecture, splitting the RIB (maintained by a user level
+daemon) and the kernel based FIB (Forwarding Information Base),
+does not allow such a simple approach.
+
+This is fortunate, because there is another solution
+which allows even more flexible policy and richer semantics.
+
+Namely, routes can be clustered together in user space, based on their
+attributes. F.e.\ a BGP router knows route ASPATH, its community;
+an OSPF router knows the route tag or its area. The administrator, when adding
+routes manually, also knows their nature. Provided that the number of such
+aggregates (we call them {\em realms\/}) is low, the task of full
+classification both by source and destination becomes quite manageable.
+
+So each route may be assigned to a realm. It is assumed that
+this identification is made by a routing daemon, but static routes
+can also be handled manually with \verb|ip route| (see sec.\ref{IP-ROUTE},
+p.\pageref{IP-ROUTE}).
+\begin{NB}
+ There is a patch to \verb|gated|, allowing classification of routes
+ to realms with the full set of policy rules implemented in \verb|gated|:
+ by prefix, by ASPATH, by origin, by tag etc.
+\end{NB}
+
+To facilitate the construction (f.e.\ in case the routing
+daemon is not aware of realms), missing realms may be completed
+with routing policy rules, see sec.~\ref{IP-RULE}, p.\pageref{IP-RULE}.
+
+For each packet the kernel calculates a tuple of realms: source realm
+and destination realm, using the following algorithm:
+
+\begin{enumerate}
+\item If the route has a realm, the destination realm of the packet is set to it.
+\item If the rule has a source realm, the source realm of the packet is set to it.
+If the destination realm was not inherited from the route and the rule has a destination realm,
+it is also set.
+\item If at least one of the realms is still unknown, the kernel finds
+the reversed route to the source of the packet.
+\item If the source realm is still unknown, get it from the reversed route.
+\item If one of the realms is still unknown, swap the realms of reversed
+routes and apply step 2 again.
+\end{enumerate}
+
+After this procedure is completed we know what realm the packet
+arrived from and the realm where it is going to propagate to.
+If some of the realms are unknown, they are initialized to zero
+(or realm \verb|unknown|).
+
+The main application of realms is the TC \verb|route| classifier~\cite{TC-CREF},
+where they are used to help assign packets to traffic classes,
+to account, police and schedule them according to this
+classification.
+
+A much simpler but still very useful application is incoming packet
+accounting by realms. The kernel gathers a packet statistics summary
+which can be viewed with the \verb|rtacct| utility.
+\begin{verbatim}
+kuznet@amber:~ $ rtacct russia
+Realm BytesTo PktsTo BytesFrom PktsFrom
+russia 20576778 169176 47080168 153805
+kuznet@amber:~ $
+\end{verbatim}
+This shows that this router received 153805 packets from
+the realm \verb|russia| and forwarded 169176 packets to \verb|russia|.
+The realm \verb|russia| consists of routes with ASPATHs not leaving
+Russia.
+
+Note that locally originating packets are not accounted here,
+\verb|rtacct| shows incoming packets only. Using the \verb|route|
+classifier (see~\cite{TC-CREF}) you can get even more detailed
+accounting information about outgoing packets, optionally
+summarizing traffic not only by source or destination, but
+by any pair of source and destination realms.
+
+
+\begin{thebibliography}{99}
+\addcontentsline{toc}{section}{References}
+\bibitem{RFC-NDISC} T.~Narten, E.~Nordmark, W.~Simpson.
+``Neighbor Discovery for IP Version 6 (IPv6)'', RFC-2461.
+
+\bibitem{RFC-ADDRCONF} S.~Thomson, T.~Narten.
+``IPv6 Stateless Address Autoconfiguration'', RFC-2462.
+
+\bibitem{RFC1812} F.~Baker.
+``Requirements for IP Version 4 Routers'', RFC-1812.
+
+\bibitem{RFC1122} R.~T.~Braden.
+``Requirements for Internet hosts --- communication layers'', RFC-1122.
+
+\bibitem{IOS} ``Cisco IOS Release 12.0 Network Protocols
+Command Reference, Part 1'' and
+``Cisco IOS Release 12.0 Quality of Service Solutions
+Configuration Guide: Configuring Policy-Based Routing'',\\
+http://www.cisco.com/univercd/cc/td/doc/product/software/ios120.
+
+\bibitem{IP-TUNNELS} A.~N.~Kuznetsov.
+``Tunnels over IP in Linux-2.2'', \\
+In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}.
+
+\bibitem{TC-CREF} A.~N.~Kuznetsov. ``TC Command Reference'',\\
+In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}.
+
+\bibitem{IOS-BGP-PP} ``Cisco IOS Release 12.0 Quality of Service Solutions
+Configuration Guide: Configuring QoS Policy Propagation via
+Border Gateway Protocol'',\\
+http://www.cisco.com/univercd/cc/td/doc/product/software/ios120.
+
+\bibitem{RFC-DHCP} R.~Droms.
+``Dynamic Host Configuration Protocol'', RFC-2131.
+
+\end{thebibliography}
+
+
+
+
+\appendix
+\addcontentsline{toc}{section}{Appendix}
+
+\section{Source address selection}
+\label{ADDR-SEL}
+
+When a host creates an IP packet, it must select some source
+address. Correct source address selection is a critical procedure,
+because it gives the receiver the information needed to deliver a
+reply. If the source is selected incorrectly, in the best case,
+the backward path may appear different from the forward one, which
+is harmful for performance. In the worst case, when the addresses
+are administratively scoped, the reply may be lost entirely.
+
+Linux-2.2 selects source addresses using the following algorithm:
+
+\begin{itemize}
+\item
+The application may select a source address explicitly with the \verb|bind(2)|
+syscall or by supplying it to \verb|sendmsg(2)| via the ancillary data object
+\verb|IP_PKTINFO|. In this case the kernel only checks the validity
+of the address and never tries to ``improve'' an incorrect user choice,
+generating an error instead.
+\begin{NB}
+ Never say ``Never''. The sysctl option \verb|ip_dynaddr| breaks
+ this axiom. It has been made deliberately with the purpose
+ of automatically reselecting the address on hosts with dynamic dial-out interfaces.
+ However, this hack {\em must not\/} be used on multihomed hosts
+ and especially on routers: it would break them.
+\end{NB}
+
+
+\item Otherwise, IP routing tables can contain an explicit source
+address hint for this destination. The hint is set with the \verb|src| parameter
+to the \verb|ip route| command, sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}.
+
+
+\item Otherwise, the kernel searches through the list of addresses
+attached to the interface through which the packets will be routed.
+The search strategies are different for IP and IPv6. Namely:
+
+\begin{itemize}
+\item IPv6 searches for the first valid, not deprecated address
+with the same scope as the destination.
+
+\item IP searches for the first valid address with a scope wider
+than the scope of the destination but it prefers addresses
+which fall into the same subnet as the nexthop of the route
+to the destination. Unlike IPv6, the scopes of IPv4 destinations
+are not encoded in their addresses but are supplied
+in routing tables instead (the \verb|scope| parameter to the \verb|ip route| command,
+sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}).
+
+\end{itemize}
+
+
+\item Otherwise, if the scope of the destination is \verb|link| or \verb|host|,
+the algorithm fails and returns a zero source address.
+
+\item Otherwise, all interfaces are scanned to search for an address
+with an appropriate scope. The loopback device \verb|lo| is always the first
+in the search list, so that if an address with global scope (not 127.0.0.1!)
+is configured on loopback, it is always preferred.
+
+\end{itemize}
+
+
+\section{Proxy ARP/NDISC}
+\label{PROXY-NEIGH}
+
+Routers may answer ARP/NDISC solicitations on behalf of other hosts.
+In Linux-2.2 proxy ARP on an interface may be enabled
+by setting the kernel \verb|sysctl| variable
+\verb|/proc/sys/net/ipv4/conf/<dev>/proxy_arp| to 1. After this, the router
+starts to answer ARP requests on the interface \verb|<dev>|, provided
+the route to the requested destination does {\em not\/} go back via the same
+device.
+
+The variable \verb|/proc/sys/net/ipv4/conf/all/proxy_arp| enables proxy
+ARP on all the IP devices.
+
+However, this approach fails in the case of IPv6 because the router
+must join the solicited node multicast address to listen for the corresponding
+NDISC queries. It means that proxy NDISC is possible only on a per destination
+basis.
+
+Logically, proxy ARP/NDISC is not a kernel task. It can easily be implemented
+in user space. However, similar functionality was present in BSD kernels
+and in Linux-2.0, so we have to preserve it at least to the extent that
+is standardized in BSD.
+\begin{NB}
+ Linux-2.0 ARP had a feature called {\em subnet\/} proxy ARP.
+ It is replaced with the sysctl flag in Linux-2.2.
+\end{NB}
+
+
+The \verb|ip| utility provides a way to manage proxy ARP/NDISC
+with the \verb|ip neigh| command, namely:
+\begin{verbatim}
+ ip neigh add proxy ADDRESS [ dev NAME ]
+\end{verbatim}
+adds a new proxy ARP/NDISC record and
+\begin{verbatim}
+ ip neigh del proxy ADDRESS [ dev NAME ]
+\end{verbatim}
+deletes it.
+
+If the name of the device is not given, the router will answer solicitations
+for address \verb|ADDRESS| on all devices, otherwise it will only serve
+the device \verb|NAME|. Even if the proxy entry is created with
+\verb|ip neigh|, the router {\em will not\/} answer a query if the route
+to the destination goes back via the interface from which the solicitation
+was received.
+
+It is important to emphasize that proxy entries have {\em no\/}
+parameters other than these (IP/IPv6 address and optional device).
+Particularly, the entry does not store any link layer address.
+It always advertises the station address of the interface
+on which it sends advertisements (i.e.\ its own station address).
+
+\section{Route NAT status}
+\label{ROUTE-NAT}
+
+NAT (or ``Network Address Translation'') remaps some parts
+of the IP address space into other ones. Linux-2.2 route NAT is supposed
+to be used to facilitate policy routing by rewriting addresses
+to other routing domains or to help while renumbering sites
+to another prefix.
+
+\paragraph{What it is not:}
+It is necessary to emphasize that {\em it is not supposed\/}
+to be used to compress address space or to split load.
+This is not missing functionality but a design principle.
+Route NAT is {\em stateless\/}. It does not hold any state
+about translated sessions. This means that it handles any number
+of sessions flawlessly. But it also means that it is {\em static\/}.
+It cannot detect the moment when the last TCP client stops
+using an address. For the same reason, it will not help to split
+load between several servers.
+\begin{NB}
+It is a pretty commonly held belief that it is useful to split load between
+several servers with NAT. This is a mistake. All you get from this
+is the requirement that the router keep the state of all the TCP connections
+going via it. Well, if the router is so powerful, run apache on it. 8)
+\end{NB}
+
+The second feature: it does not touch packet payload,
+does not try to ``improve'' broken protocols by looking
+through its data and mangling it. It mangles IP addresses,
+only IP addresses and nothing but IP addresses.
+This, too, is not missing functionality.
+
+To summarize: if you need to compress address space or keep
+active FTP clients happy, your choice is not route NAT but masquerading,
+port forwarding, NAPT etc.
+\begin{NB}
+By the way, you may also want to look at
+http://www.suse.com/\~{}mha/HyperNews/get/linux-ip-nat.html
+\end{NB}
+
+
+\paragraph{How it works.}
+Some part of the address space is reserved for dummy addresses
+which will look for all the world like some host addresses
+inside your network. No other hosts may use these addresses,
+however other routers may also be configured to translate them.
+\begin{NB}
+A great advantage of route NAT is that it may be used not
+only in stub networks but in environments with arbitrarily complicated
+structure. It does not firewall, it {\em forwards.}
+\end{NB}
+These addresses are selected by the \verb|ip route| command
+(sec.\ref{IP-ROUTE-ADD}, p.\pageref{IP-ROUTE-ADD}). F.e.\
+\begin{verbatim}
+ ip route add nat 192.203.80.144 via 193.233.7.83
+\end{verbatim}
+states that the single address 192.203.80.144 is a dummy NAT address.
+For all the world it looks like a host address inside our network.
+For neighbouring hosts and routers it looks like the local address
+of the translating router. The router answers ARP for it, advertises
+this address as routed via it, {\em et al\/}. When the router
+receives a packet destined for 192.203.80.144, it replaces
+this address with 193.233.7.83 which is the address of some real
+host and forwards the packet. If you need to remap
+blocks of addresses, you may use a command like:
+\begin{verbatim}
+ ip route add nat 192.203.80.192/26 via 193.233.7.64
+\end{verbatim}
+This command will map a block of 64 addresses 192.203.80.192-255 to
+193.233.7.64-127.
+
+When an internal host (193.233.7.83 in the example above)
+sends something to the outer world and these packets are forwarded
+by our router, it should translate the source address 193.233.7.83
+into 192.203.80.144. This task is solved by setting a special
+policy rule (sec.\ref{IP-RULE-ADD}, p.\pageref{IP-RULE-ADD}):
+\begin{verbatim}
+ ip rule add prio 320 from 193.233.7.83 nat 192.203.80.144
+\end{verbatim}
+This rule says that the source address 193.233.7.83
+should be translated into 192.203.80.144 before forwarding.
+It is important that the address after the \verb|nat| keyword
+is some NAT address, declared by {\tt ip route add nat}.
+If it is just a random address the router will not map to it.
+\begin{NB}
+The exception is when the address is a local address of this
+router (or 0.0.0.0) and masquerading is configured in the linux-2.2
+kernel. In this case the router will masquerade the packets as this address.
+If 0.0.0.0 is selected, the result is equivalent to one
+obtained with firewalling rules. Otherwise, you have the way
+to order Linux to masquerade to this fixed address.
+The NAT mechanism used in linux-2.4 is more flexible than
+masquerading, so this feature has lost its meaning and is disabled.
+\end{NB}
+
+If the network has non-trivial internal structure, it is
+useful and even necessary to add rules disabling translation
+when a packet does not leave this network. Let us return to the
+example from sec.\ref{IP-RULE-SHOW} (p.\pageref{IP-RULE-SHOW}).
+\begin{verbatim}
+300: from 193.233.7.83 to 193.233.7.0/24 lookup main
+310: from 193.233.7.83 to 192.203.80.0/24 lookup main
+320: from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144
+\end{verbatim}
+This block of rules causes normal forwarding when
+packets from 193.233.7.83 do not leave networks 193.233.7/24
+and 192.203.80/24. Also, if the \verb|inr.ruhep| table does not
+contain a route to the destination (which means that the routing
+domain owning addresses from 192.203.80/24 is dead), no translation
+will occur. Otherwise, the packets are translated.
+
+\paragraph{How to only translate selected ports:}
+If you only want to translate selected ports (f.e.\ http)
+and leave the rest intact, you may use \verb|ipchains|
+to \verb|fwmark| a class of packets.
+Suppose you did and all the packets from 193.233.7.83
+destined for port 80 are marked with marker 0x1234 in input fwchain.
+In this case you may replace rule \#320 with:
+\begin{verbatim}
+320: from 193.233.7.83 fwmark 1234 lookup main map-to 192.203.80.144
+\end{verbatim}
+and translation will only be enabled for outgoing http requests.
+
+\section{Example: minimal host setup}
+\label{EXAMPLE-SETUP}
+
+The following script gives an example of a fault safe
+setup of IP (and IPv6, if it is compiled into the kernel)
+in the common case of a node attached to a single broadcast
+network. A more advanced script, which may be used both on multihomed
+hosts and on routers, is described in the following
+section.
+
+The utilities used in the script may be found in the
+directory ftp://ftp.inr.ac.ru/ip-routing/:
+\begin{enumerate}
+\item \verb|ip| --- package \verb|iproute2|.
+\item \verb|arping| --- package \verb|iputils|.
+\item \verb|rdisc| --- package \verb|iputils|.
+\end{enumerate}
+\begin{NB}
+It also refers to a DHCP client, \verb|dhcpcd|. I should refrain from
+recommending a good DHCP client to use. All that I can
+say is that ISC \verb|dhcp-2.0b1pl6| patched with the patch that
+can be found in the \verb|dhcp.bootp.rarp| subdirectory of
+the same ftp site {\em does\/} work,
+at least on Ethernet and Token Ring.
+\end{NB}
+
+\begin{verbatim}
+#! /bin/bash
+\end{verbatim}
+\begin{flushleft}
+\# {\bf Usage: \verb|ifone ADDRESS[/PREFIX-LENGTH] [DEVICE]|}\\
+\# {\bf Parameters:}\\
+\# \$1 --- Static IP address, optionally followed by prefix length.\\
+\# \$2 --- Device name. If it is missing, \verb|eth0| is assumed.\\
+\# F.e. \verb|ifone 193.233.7.90|
+\end{flushleft}
+\begin{verbatim}
+dev=$2
+: ${dev:=eth0}
+ipaddr=
+\end{verbatim}
+\# Parse IP address, splitting prefix length.
+\begin{verbatim}
+if [ "$1" != "" ]; then
+ ipaddr=${1%/*}
+ if [ "$1" != "$ipaddr" ]; then
+ pfxlen=${1#*/}
+ fi
+ : ${pfxlen:=24}
+fi
+pfx="${ipaddr}/${pfxlen}"
+\end{verbatim}
+
+\begin{flushleft}
+\# {\bf Step 0} --- enable loopback.\\
+\#\\
+\# This step is necessary on any networked box before attempt\\
+\# to configure any other device.\\
+\end{flushleft}
+\begin{verbatim}
+ip link set up dev lo
+ip addr add 127.0.0.1/8 dev lo brd + scope host
+\end{verbatim}
+\begin{flushleft}
+\# IPv6 autoconfigures itself on loopback.\\
+\#\\
+\# If user gave loopback as device, we add the address as alias and exit.
+\end{flushleft}
+\begin{verbatim}
+if [ "$dev" = "lo" ]; then
+ if [ "$ipaddr" != "" -a "$ipaddr" != "127.0.0.1" ]; then
+ ip address add $ipaddr dev $dev
+ exit $?
+ fi
+ exit 0
+fi
+\end{verbatim}
+
+\noindent\# {\bf Step 1} --- enable device \verb|$dev|
+
+\begin{verbatim}
+if ! ip link set up dev $dev ; then
+ echo "Cannot enable interface $dev. Aborting." 1>&2
+ exit 1
+fi
+\end{verbatim}
+\begin{flushleft}
+\# The interface is \verb|UP|. IPv6 started stateless autoconfiguration itself,\\
+\# and its configuration finishes here. However,\\
+\# IP still needs some static preconfigured address.
+\end{flushleft}
+\begin{verbatim}
+if [ "$ipaddr" = "" ]; then
+ echo "No address for $dev is configured, trying DHCP..." 1>&2
+ dhcpcd
+ exit $?
+fi
+\end{verbatim}
+
+\begin{flushleft}
+\# {\bf Step 2} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\
+\# Send two probes and wait for result for 3 seconds.\\
+\# If the interface opens slower f.e.\ due to long media detection,\\
+\# you want to increase the timeout.\\
+\end{flushleft}
+\begin{verbatim}
+if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then
+ echo "Address $ipaddr is busy, trying DHCP..." 1>&2
+ dhcpcd
+ exit $?
+fi
+\end{verbatim}
+\begin{flushleft}
+\# OK, the address is unique, we may add it on the interface.\\
+\#\\
+\# {\bf Step 3} --- Configure the address on the interface.
+\end{flushleft}
+
+\begin{verbatim}
+if ! ip address add $pfx brd + dev $dev; then
+ echo "Failed to add $pfx on $dev, trying DHCP..." 1>&2
+ dhcpcd
+ exit $?
+fi
+\end{verbatim}
+
+\noindent\# {\bf Step 4} --- Announce our presence on the link.
+\begin{verbatim}
+arping -A -c 1 -I $dev $ipaddr
+noarp=$?
+( sleep 2;
+ arping -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null &
+\end{verbatim}
+
+\begin{flushleft}
+\# {\bf Step 5} (optional) --- Add some control routes.\\
+\#\\
+\# 1. Prohibit link local multicast addresses.\\
+\# 2. Prohibit link local (alias, limited) broadcast.\\
+\# 3. Add default multicast route.
+\end{flushleft}
+\begin{verbatim}
+ip route add unreachable 224.0.0.0/24
+ip route add unreachable 255.255.255.255
+if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then
+ ip route add 224.0.0.0/4 dev $dev scope global
+fi
+\end{verbatim}
+
+\begin{flushleft}
+\# {\bf Step 6} --- Add fallback default route with huge metric.\\
+\# If a proxy ARP server is present on the interface, we will be\\
+\# able to talk to all the Internet without further configuration.\\
+\# It is not so cheap though and we still hope that this route\\
+\# will be overridden by more correct one by rdisc.\\
+\# Do not make this step if the device is not ARPable,\\
+\# because dead nexthop detection does not work on them.
+\end{flushleft}
+\begin{verbatim}
+if [ "$noarp" = "0" ]; then
+ ip ro add default dev $dev metric 30000 scope global
+fi
+\end{verbatim}
+
+\begin{flushleft}
+\# {\bf Step 7} --- Restart router discovery and exit.
+\end{flushleft}
+\begin{verbatim}
+killall -HUP rdisc || rdisc -fs
+exit 0
+\end{verbatim}
+
+
+\section{Example: {\protect\tt ifcfg} --- interface address management}
+\label{EXAMPLE-IFCFG}
+
+This is a simplistic script replacing one option of \verb|ifconfig|,
+namely, IP address management. It not only adds
+addresses, but also carries out Duplicate Address Detection~\cite{RFC-DHCP},
+sends unsolicited ARP to update the caches of other hosts sharing
+the interface, adds some control routes and restarts Router Discovery
+when it is necessary.
+
+I strongly recommend using it {\em instead\/} of \verb|ifconfig| both
+on hosts and on routers.
+
+\begin{verbatim}
+#! /bin/bash
+\end{verbatim}
+\begin{flushleft}
+\# {\bf Usage: \verb?ifcfg DEVICE[:ALIAS] [add|del] ADDRESS[/LENGTH] [PEER]?}\\
+\# {\bf Parameters:}\\
+\# ---Device name. It may have alias suffix, separated by colon.\\
+\# ---Command: add, delete or stop.\\
+\# ---IP address, optionally followed by prefix length.\\
+\# ---Optional peer address for pointopoint interfaces.\\
+\# F.e. \verb|ifcfg eth0 193.233.7.90/24|
+
+\noindent\# This function determines, whether it is router or host.\\
+\# It returns 0, if the host is apparently not router.
+\end{flushleft}
+\begin{verbatim}
+CheckForwarding () {
+ local sbase fwd
+ sbase=/proc/sys/net/ipv4/conf
+ fwd=0
+ if [ -d $sbase ]; then
+ for dir in $sbase/*/forwarding; do
+ fwd=$[$fwd + `cat $dir`]
+ done
+ else
+ fwd=2
+ fi
+ return $fwd
+}
+\end{verbatim}
+\begin{flushleft}
+\# This function restarts Router Discovery.\\
+\end{flushleft}
+\begin{verbatim}
+RestartRDISC () {
+ killall -HUP rdisc || rdisc -fs
+}
+\end{verbatim}
+\begin{flushleft}
+\# Calculate ABC "natural" mask length\\
+\# Arg: \$1 = dotquad address
+\end{flushleft}
+\begin{verbatim}
+ABCMaskLen () {
+ local class;
+ class=${1%%.*}
+ if [ $class -eq 0 -o $class -ge 224 ]; then return 0
+ elif [ $class -ge 192 ]; then return 24
+ elif [ $class -ge 128 ]; then return 16
+ else return 8 ; fi
+}
+\end{verbatim}
+
+
+\begin{flushleft}
+\# {\bf MAIN()}\\
+\#\\
+\# Strip alias suffix separated by colon.
+\end{flushleft}
+\begin{verbatim}
+label="label $1"
+ldev=$1
+dev=${1%:*}
+if [ "$dev" = "" -o "$1" = "help" ]; then
+ echo "Usage: ifcfg DEV [[add|del [ADDR[/LEN]] [PEER] | stop]" 1>&2
+ echo " add - add new address" 1>&2
+ echo " del - delete address" 1>&2
+ echo " stop - completely disable IP" 1>&2
+ exit 1
+fi
+shift
+
+CheckForwarding
+fwd=$?
+\end{verbatim}
+\begin{flushleft}
+\# Parse command. If it is ``stop'', flush and exit.
+\end{flushleft}
+\begin{verbatim}
+deleting=0
+case "$1" in
+add) shift ;;
+stop)
+ if [ "$ldev" != "$dev" ]; then
+ echo "Cannot stop alias $ldev" 1>&2
+ exit 1;
+ fi
+ ip -4 addr flush dev $dev $label || exit 1
+ if [ $fwd -eq 0 ]; then RestartRDISC; fi
+ exit 0 ;;
+del*)
+ deleting=1; shift ;;
+*)
+esac
+\end{verbatim}
+\begin{flushleft}
+\# Parse prefix, split prefix length, separated by slash.
+\end{flushleft}
+\begin{verbatim}
+ipaddr=
+pfxlen=
+if [ "$1" != "" ]; then
+ ipaddr=${1%/*}
+ if [ "$1" != "$ipaddr" ]; then
+ pfxlen=${1#*/}
+ fi
+ if [ "$ipaddr" = "" ]; then
+ echo "$1 is bad IP address." 1>&2
+ exit 1
+ fi
+fi
+shift
+\end{verbatim}
+\begin{flushleft}
+\# If peer address is present, prefix length is 32.\\
+\# Otherwise, if prefix length was not given, guess it.
+\end{flushleft}
+\begin{verbatim}
+peer=$1
+if [ "$peer" != "" ]; then
+ if [ "$pfxlen" != "" -a "$pfxlen" != "32" ]; then
+ echo "Peer address with non-trivial netmask." 1>&2
+ exit 1
+ fi
+ pfx="$ipaddr peer $peer"
+else
+ if [ "$pfxlen" = "" ]; then
+ ABCMaskLen $ipaddr
+ pfxlen=$?
+ fi
+ pfx="$ipaddr/$pfxlen"
+fi
+if [ "$ldev" = "$dev" -a "$ipaddr" != "" ]; then
+ label=
+fi
+\end{verbatim}
+\begin{flushleft}
+\# If deletion was requested, delete the address and restart RDISC
+\end{flushleft}
+\begin{verbatim}
+if [ $deleting -ne 0 ]; then
+ ip addr del $pfx dev $dev $label || exit 1
+ if [ $fwd -eq 0 ]; then RestartRDISC; fi
+ exit 0
+fi
+\end{verbatim}
+\begin{flushleft}
+\# Start interface initialization.\\
+\#\\
+\# {\bf Step 0} --- enable device \verb|$dev|
+\end{flushleft}
+\begin{verbatim}
+if ! ip link set up dev $dev ; then
+ echo "Error: cannot enable interface $dev." 1>&2
+ exit 1
+fi
+if [ "$ipaddr" = "" ]; then exit 0; fi
+\end{verbatim}
+\begin{flushleft}
+\# {\bf Step 1} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\
+\# Send two probes and wait for result for 3 seconds.\\
+\# If the interface opens slower f.e.\ due to long media detection,\\
+\# you want to increase the timeout.\\
+\end{flushleft}
+\begin{verbatim}
+if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then
+ echo "Error: some host already uses address $ipaddr on $dev." 1>&2
+ exit 1
+fi
+\end{verbatim}
+\begin{flushleft}
+\# OK, the address is unique. We may add it to the interface.\\
+\#\\
+\# {\bf Step 2} --- Configure the address on the interface.
+\end{flushleft}
+\begin{verbatim}
+if ! ip address add $pfx brd + dev $dev $label; then
+ echo "Error: failed to add $pfx on $dev." 1>&2
+ exit 1
+fi
+\end{verbatim}
+\noindent\# {\bf Step 3} --- Announce our presence on the link
+\begin{verbatim}
+arping -q -A -c 1 -I $dev $ipaddr
+noarp=$?
+( sleep 2 ;
+ arping -q -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null &
+\end{verbatim}
+\begin{flushleft}
+\# {\bf Step 4} (optional) --- Add some control routes.\\
+\#\\
+\# 1. Prohibit link local multicast addresses.\\
+\# 2. Prohibit link local (alias, limited) broadcast.\\
+\# 3. Add default multicast route.
+\end{flushleft}
+\begin{verbatim}
+ip route add unreachable 224.0.0.0/24 >& /dev/null
+ip route add unreachable 255.255.255.255 >& /dev/null
+if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then
+ ip route add 224.0.0.0/4 dev $dev scope global >& /dev/null
+fi
+\end{verbatim}
+\begin{flushleft}
+\# {\bf Step 5} --- Add fallback default route with huge metric.\\
+\# If a proxy ARP server is present on the interface, we will be\\
+\# able to talk to all the Internet without further configuration.\\
+\# Do not make this step on router or if the device is not ARPable,\\
+\# because dead nexthop detection does not work on them.
+\end{flushleft}
+\begin{verbatim}
+if [ $fwd -eq 0 ]; then
+ if [ $noarp -eq 0 ]; then
+ ip ro append default dev $dev metric 30000 scope global
+ elif [ "$peer" != "" ]; then
+ if ping -q -c 2 -w 4 $peer ; then
+ ip ro append default via $peer dev $dev metric 30001
+ fi
+ fi
+ RestartRDISC
+fi
+
+exit 0
+\end{verbatim}
+\begin{flushleft}
+\# End of {\bf MAIN()}
+\end{flushleft}
+
+
+\end{document}
diff --git a/doc/ip-tunnels.tex b/doc/ip-tunnels.tex
index e69de29b..0a8c930c 100644
--- a/doc/ip-tunnels.tex
+++ b/doc/ip-tunnels.tex
@@ -0,0 +1,469 @@
+\documentstyle[12pt,twoside]{article}
+\def\TITLE{Tunnels over IP}
+\input preamble
+\begin{center}
+\Large\bf Tunnels over IP in Linux-2.2
+\end{center}
+
+
+\begin{center}
+{ \large Alexey~N.~Kuznetsov } \\
+\em Institute for Nuclear Research, Moscow \\
+\verb|kuznet@ms2.inr.ac.ru| \\
+\rm March 17, 1999
+\end{center}
+
+\vspace{5mm}
+
+\tableofcontents
+
+
+\section{Instead of introduction: micro-FAQ.}
+
+\begin{itemize}
+
+\item
+Q: In linux-2.0.36 I used:
+\begin{verbatim}
+ ifconfig tunl1 10.0.0.1 pointopoint 193.233.7.65
+\end{verbatim}
+to create tunnel. It does not work in 2.2.0!
+
+A: You are right, it does not work. The command written above is split into two commands.
+\begin{verbatim}
+ ip tunnel add MY-TUNNEL mode ipip remote 193.233.7.65
+\end{verbatim}
+will create tunnel device with name \verb|MY-TUNNEL|. Now you may configure
+it with:
+\begin{verbatim}
+ ifconfig MY-TUNNEL 10.0.0.1
+\end{verbatim}
+Certainly, if you prefer name \verb|tunl1| to \verb|MY-TUNNEL|,
+you still may use it.
+
+\item
+Q: In linux-2.0.36 I used:
+\begin{verbatim}
+ ifconfig tunl0 10.0.0.1
+ route add -net 10.0.0.0 gw 193.233.7.65 dev tunl0
+\end{verbatim}
+to tunnel net 10.0.0.0 via router 193.233.7.65. It does not
+work in 2.2.0! Moreover, \verb|route| prints a funny error sort of
+``network unreachable'' and after this I found a strange direct route
+to 10.0.0.0 via \verb|tunl0| in routing table.
+
+A: Yes, in 2.2 the rule that {\em normal} gateway must reside on directly
+connected network has not any exceptions. You may tell kernel, that
+this particular route is {\em abnormal}:
+\begin{verbatim}
+ ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
+ ip route add 10.0.0.0/8 via 193.233.7.65 dev tunl0 onlink
+\end{verbatim}
+Note keyword \verb|onlink|, it is the magic key that orders kernel
+not to check for consistency of gateway address.
+Probably, after this explanation you have already guessed another method
+to cheat kernel:
+\begin{verbatim}
+ ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
+ route add -host 193.233.7.65 dev tunl0
+ route add -net 10.0.0.0 netmask 255.0.0.0 gw 193.233.7.65
+ route del -host 193.233.7.65 dev tunl0
+\end{verbatim}
+Well, if you like such tricks, nobody may prohibit you to use them.
+Only do not forget
+that between \verb|route add| and \verb|route del| host 193.233.7.65 is
+unreachable.
+
+\item
+Q: In 2.0.36 I used to load \verb|tunnel| device module and \verb|ipip| module.
+I cannot find any \verb|tunnel| in 2.2!
+
+A: Linux-2.2 has single module \verb|ipip| for both directions of tunneling
+and for all IPIP tunnel devices.
+
+\item
+Q: \verb|traceroute| does not work over tunnel! Well, stop... It works,
+ only skips some number of hops.
+
+A: Yes. By default tunnel driver copies \verb|ttl| value from
+inner packet to outer one. It means that path traversed by tunneled
+packets to another endpoint is not hidden. If you dislike this, or if you
+are going to use some routing protocol expecting that packets
+with ttl 1 will reach peering host (f.e.\ RIP, OSPF or EBGP)
+and you are not afraid of
+tunnel loops, you may append option \verb|ttl 64|, when creating tunnel
+with \verb|ip tunnel add|.
+
+\item
+Q: ... Well, list of things, which 2.0 was able to do finishes.
+
+\end{itemize}
+
+\paragraph{Summary of differences between 2.2 and 2.0.}
+
+\begin{itemize}
+
+\item {\bf In 2.0} you could compile tunnel device into kernel
+ and got set of 4 devices \verb|tunl0| ... \verb|tunl3| or,
+ alternatively, compile it as module and load new module
+ for each new tunnel. Also, module \verb|ipip| was necessary
+ to receive tunneled packets.
+
+ {\bf 2.2} has {\em one\/} module \verb|ipip|. Loading it you get base
+ tunnel device \verb|tunl0| and another tunnels may be created with command
+ \verb|ip tunnel add|. These new devices may have arbitrary names.
+
+
+\item {\bf In 2.0} you set remote tunnel endpoint address with
+ the command \verb|ifconfig| ... \verb|pointopoint A|.
+
+ {\bf In 2.2} this command has the same semantics on all
+ the interfaces, namely it sets not tunnel endpoint,
+ but address of peering host, which is directly reachable
+ via this tunnel,
+ rather than via Internet. Actual tunnel endpoint address \verb|A|
+ should be set with \verb|ip tunnel add ... remote A|.
+
+\item {\bf In 2.0} you create tunnel routes with the command:
+\begin{verbatim}
+ route add -net 10.0.0.0 gw A dev tunl0
+\end{verbatim}
+
+ {\bf 2.2} interprets this command equally for all device
+ kinds and gateway is required to be directly reachable via this tunnel,
+ rather than via Internet. You still may use \verb|ip route add ... onlink|
+ to override this behaviour.
+
+\end{itemize}
+
+
+\section{Tunnel setup: basics}
+
+Standard Linux-2.2 kernel supports three flavors of tunnels,
+listed in the following table:
+\vspace{2mm}
+
+\begin{tabular}{lll}
+\vrule depth 0.8ex width 0pt\relax
+Mode & Description & Base device \\
+ipip & IP over IP & tunl0 \\
+sit & IPv6 over IP & sit0 \\
+gre & ANY over GRE over IP & gre0
+\end{tabular}
+
+\vspace{2mm}
+
+\noindent All the kinds of tunnels are created with one command:
+\begin{verbatim}
+ ip tunnel add <NAME> mode <MODE> [ local <S> ] [ remote <D> ]
+\end{verbatim}
+
+This command creates new tunnel device with name \verb|<NAME>|.
+The \verb|<NAME>| is an arbitrary string. Particularly,
+it may be even \verb|eth0|. The rest of parameters set
+different tunnel characteristics.
+
+\begin{itemize}
+
+\item
+\verb|mode <MODE>| sets tunnel mode. Three modes are available now
+ \verb|ipip|, \verb|sit| and \verb|gre|.
+
+\item
+\verb|remote <D>| sets remote endpoint of the tunnel to IP
+ address \verb|<D>|.
+\item
+\verb|local <S>| sets fixed local address for tunneled
+ packets. It must be an address on another interface of this host.
+
+\end{itemize}
+
+\let\thefootnote\oldthefootnote
+
+Both \verb|remote| and \verb|local| may be omitted. In this case we
+say that they are zero or wildcard. Two tunnels of one mode cannot
+have the same \verb|remote| and \verb|local|. Particularly it means
+that base device or fallback tunnel cannot be replicated.\footnote{
+This restriction is relaxed for keyed GRE tunnels.}
+
+Tunnels are divided into two classes: {\bf pointopoint} tunnels, which
+have some not wildcard \verb|remote| address and deliver all the packets
+to this destination, and {\bf NBMA} (i.e. Non-Broadcast Multi-Access) tunnels,
+which have no \verb|remote|. Particularly, base devices (f.e.\ \verb|tunl0|)
+are NBMA, because they have neither \verb|remote| nor
+\verb|local| addresses.
+
+
+After a tunnel device is created you should configure it as you would
+any other device. Certainly, the configuration of tunnels has
+some features related to the fact that they work over existing Internet
+routing infrastructure and simultaneously create new virtual links,
+which changes this infrastructure. The danger that not enough careful
+tunnel setup will result in formation of tunnel loops,
+collapse of routing or flooding network with exponentially
+growing number of tunneled fragments is very real.
+
+
+Protocol setup on pointopoint tunnels does not differ from the configuration
+of other devices. You should set a protocol address with \verb|ifconfig|
+and add routes with \verb|route| utility.
+
+NBMA tunnels are different. To route something via NBMA tunnel
+you have to explain to driver, where it should deliver packets to.
+The only way to make it is to create special routes with gateway
+address pointing to desired endpoint. F.e.\
+\begin{verbatim}
+ ip route add 10.0.0.0/24 via <A> dev tunl0 onlink
+\end{verbatim}
+It is important to use option \verb|onlink|, otherwise
+kernel will refuse request to create route via gateway not directly
+reachable over device \verb|tunl0|. With IPv6 the situation is much simpler:
+when you start device \verb|sit0|, it automatically configures itself
+with all IPv4 addresses mapped to IPv6 space, so that all IPv4
+Internet is {\em really reachable} via \verb|sit0|! Excellent, the command
+\begin{verbatim}
+ ip route add 3FFE::/16 via ::193.233.7.65 dev sit0
+\end{verbatim}
+will route \verb|3FFE::/16| via \verb|sit0|, sending all the packets
+destined to this prefix to 193.233.7.65.
+
+\section{Tunnel setup: options}
+
+Command \verb|ip tunnel add| has several additional options.
+\begin{itemize}
+
+\item \verb|ttl N| --- set fixed TTL \verb|N| on tunneled packets.
+ \verb|N| is number in the range 1--255. 0 is special value,
+ meaning that packets inherit TTL value.
+ Default value is: \verb|inherit|.
+
+\item \verb|tos T| --- set fixed tos \verb|T| on tunneled packets.
+ Default value is: \verb|inherit|.
+
+\item \verb|dev DEV| --- bind tunnel to device \verb|DEV|, so that
+ tunneled packets will be routed only via this device and will
+ not be able to escape to another device, when route to endpoint changes.
+
+\item \verb|nopmtudisc| --- disable Path MTU Discovery on this tunnel.
+ It is enabled by default. Note that fixed ttl is incompatible
+ with this option: tunnels with fixed ttl always make pmtu discovery.
+
+\end{itemize}
+
+\verb|ipip| and \verb|sit| tunnels have no more options. \verb|gre|
+tunnels are more complicated:
+
+\begin{itemize}
+
+\item \verb|key K| --- use keyed GRE with key \verb|K|. \verb|K| is
+ either number or IP address-like dotted quad.
+
+\item \verb|csum| --- checksum tunneled packets.
+
+\item \verb|seq| --- serialize packets.
+\begin{NB}
+ I think this option does not
+ work. At least, I did not test it, did not debug it and
+ even do not understand, how it is supposed to work and for what
+ purpose Cisco planned to use it.
+\end{NB}
+
+\end{itemize}
+
+
+Actually, these GRE options can be set separately for input and
+output directions by prefixing corresponding keywords with letter
+\verb|i| or \verb|o|. F.e.\ \verb|icsum| orders to accept only
+packets with correct checksum and \verb|ocsum| means, that
+our host will calculate and send checksum.
+
+Command \verb|ip tunnel add| is not the only operation,
+which can be made with tunnels. Certainly, you may get short help page
+with:
+\begin{verbatim}
+ ip tunnel help
+\end{verbatim}
+
+Besides that, you may view list of installed tunnels with the help of command:
+\begin{verbatim}
+ ip tunnel ls
+\end{verbatim}
+Also you may look at statistics:
+\begin{verbatim}
+ ip -s tunnel ls Cisco
+\end{verbatim}
+where \verb|Cisco| is name of tunnel device. Command
+\begin{verbatim}
+ ip tunnel del Cisco
+\end{verbatim}
+destroys tunnel \verb|Cisco|. And, finally,
+\begin{verbatim}
+ ip tunnel change Cisco mode sit local ME remote HE ttl 32
+\end{verbatim}
+changes its parameters.
+
+\section{Differences between 2.2 and 2.0 tunnels revisited.}
+
+Now we can discuss more subtle differences between tunneling in 2.0
+and 2.2.
+
+\begin{itemize}
+
+\item In 2.0 all tunneled packets were received promiscuously
+as soon as you loaded module \verb|ipip|. 2.2 tries to select the best
+tunnel device and packet looks as received on this. F.e.\ if host
+received \verb|ipip| packet from host \verb|D| destined to our
+local address \verb|S|, kernel searches for matching tunnels
+in order:
+
+\begin{tabular}{ll}
+1 & \verb|remote| is \verb|D| and \verb|local| is \verb|S| \\
+2 & \verb|remote| is \verb|D| and \verb|local| is wildcard \\
+3 & \verb|remote| is wildcard and \verb|local| is \verb|S| \\
+4 & \verb|tunl0|
+\end{tabular}
+
+If tunnel exists, but it is not in \verb|UP| state, the tunnel is ignored.
+Note, that if \verb|tunl0| is \verb|UP| it receives all the IPIP packets,
+not acknowledged by more specific tunnels.
+Be careful, it means that without carefully installed firewall rules
+anyone on the Internet may inject to your network any packets with
+source addresses indistinguishable from local ones. It is not so bad idea
+to design tunnels in the way enforcing maximal route symmetry
+and to enable reversed path filter (\verb|rp_filter| sysctl option) on
+tunnel devices.
+
+\item In 2.2 you can monitor and debug tunnels with \verb|tcpdump|.
+F.e.\ \verb|tcpdump| \verb|-i Cisco| \verb|-nvv| will dump packets,
+which kernel output, via tunnel \verb|Cisco| and the packets received on it
+from kernel viewpoint.
+
+\end{itemize}
+
+
+\section{Linux and Cisco IOS tunnels.}
+
+Among other tunnels Cisco IOS supports IPIP and GRE.
+Essentially, Cisco setup is subset of options, available for Linux.
+Let us consider the simplest example:
+
+\begin{verbatim}
+interface Tunnel0
+ tunnel mode gre ip
+ tunnel source 10.10.14.1
+ tunnel destination 10.10.13.2
+\end{verbatim}
+
+
+This command set translates to:
+
+\begin{verbatim}
+ ip tunnel add Tunnel0 \
+ mode gre \
+ local 10.10.14.1 \
+ remote 10.10.13.2
+\end{verbatim}
+
+Any questions? No questions.
+
+\section{Interaction of IPIP tunnels and DVMRP.}
+
+DVMRP exploits IPIP tunnels to route multicasts via Internet.
+\verb|mrouted| creates
+IPIP tunnels listed in its configuration file automatically.
+From kernel and user viewpoints there are no differences between
+tunnels, created in this way, and tunnels created by \verb|ip tunnel|.
+I.e.\ if \verb|mrouted| created some tunnel, it may be used to
+route unicast packets, provided appropriate routes are added.
+And vice versa, if administrator has already created a tunnel,
+it will be reused by \verb|mrouted|, if it requests DVMRP
+tunnel with the same local and remote addresses.
+
+Do not wonder, if your manually configured tunnel is
+destroyed, when mrouted exits.
+
+
+\section{Broadcast GRE ``tunnels''.}
+
+It is possible to set \verb|remote| for GRE tunnel to a multicast
+address. Such tunnel becomes {\bf broadcast} tunnel (though word
+tunnel is not quite appropriate in this case, it is rather virtual network).
+\begin{verbatim}
+ ip tunnel add Universe local 193.233.7.65 \
+ remote 224.66.66.66 ttl 16
+ ip addr add 10.0.0.1/16 dev Universe
+ ip link set Universe up
+\end{verbatim}
+This tunnel is true broadcast network and broadcast packets are
+sent to multicast group 224.66.66.66. By default such tunnel starts
+to resolve both IP and IPv6 addresses via ARP/NDISC, so that
+if multicast routing is supported in surrounding network, all GRE nodes
+will find one another automatically and will form virtual Ethernet-like
+broadcast network. If multicast routing does not work, it is unpleasant
+but not fatal flaw. The tunnel becomes NBMA rather than broadcast network.
+You may disable dynamic ARPing by:
+\begin{verbatim}
+ echo 0 > /proc/sys/net/ipv4/neigh/Universe/mcast_solicit
+\end{verbatim}
+and add the required information to the ARP tables manually:
+\begin{verbatim}
+ ip neigh add 10.0.0.2 lladdr 128.6.190.2 dev Universe nud permanent
+\end{verbatim}
+In this case packets sent to 10.0.0.2 will be encapsulated in GRE
+and sent to 128.6.190.2. It is possible to facilitate address resolution
+using methods typical for other NBMA networks, f.e.\ to start a user
+level \verb|arpd| daemon, which will maintain a database of hosts attached
+to the GRE virtual network or ask a dedicated ARP or NHRP server
+for information.
+
+
+Actually, such setup is the most natural for tunneling,
+it is really flexible, scalable and easily manageable, so that
+it is strongly recommended to be used with GRE tunnels instead of ugly
+hack with NBMA mode and \verb|onlink| modifier. Unfortunately,
+for historical reasons broadcast mode is not supported by IPIP tunnels,
+but this probably will change in future.
+
+
+
+\section{Traffic control issues.}
+
+Tunnels are devices, hence all the power of Linux traffic control
+applies to them. The simplest (and the most useful in practice)
+example is limiting tunnel bandwidth. The following command:
+\begin{verbatim}
+ tc qdisc add dev tunl0 root tbf \
+ rate 128Kbit burst 4K limit 10K
+\end{verbatim}
+will limit tunneled traffic to 128Kbit with maximal burst size of 4K
+and queuing not more than 10K.
+
+However, you should remember, that tunnels are {\em virtual} devices
+implemented in software and true queue management is impossible for them
+just because they have no queues. Instead, it is better to create classes
+on real physical interfaces and to map tunneled packets to them.
+In general case of dynamic routing you should create such classes
+on all outgoing interfaces, or, alternatively,
+to use option \verb|dev DEV| to bind tunnel to a fixed physical device.
+In the last case packets will be routed only via specified device
+and you need to setup corresponding classes only on it.
+Though you have to pay for this convenience,
+if routing will change, your tunnel will fail.
+
+Suppose that CBQ class \verb|1:ABC| has been created on device \verb|eth0|
+specially for tunnel \verb|Cisco| with endpoints \verb|S| and \verb|D|.
+Now you can select IPIP packets with addresses \verb|S| and \verb|D|
+with some classifier and map them to class \verb|1:ABC|. F.e.\
+it is easy to make with \verb|rsvp| classifier:
+\begin{verbatim}
+ tc filter add dev eth0 pref 100 proto ip rsvp \
+ session D ipproto ipip filter S \
+ classid 1:ABC
+\end{verbatim}
+
+If you want to make more detailed classification of sub-flows
+transmitted via tunnel, you can build CBQ subtree,
+rooted at \verb|1:ABC| and attach to subroot set of rules parsing
+IPIP packets more deeply.
+
+\end{document}
diff --git a/doc/nstat.sgml b/doc/nstat.sgml
index e69de29b..be9d8bcc 100644
--- a/doc/nstat.sgml
+++ b/doc/nstat.sgml
@@ -0,0 +1,110 @@
+<!doctype linuxdoc system>
+
+<article>
+
+<title>NSTAT, IFSTAT and RTACCT Utilities
+<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
+<date>some_negative_number, 20 Sep 2001
+<abstract>
+<tt/nstat/, <tt/ifstat/ and <tt/rtacct/ are simple tools helping
+to monitor kernel snmp counters and network interface statistics.
+</abstract>
+
+<p> These utilities are very similar, so that I describe
+them simultaneously, using name <tt/Xstat/ in the places which apply
+to all of them.
+
+<p>The format of the command is:
+
+<tscreen><verb>
+ Xstat [ OPTIONS ] [ PATTERN [ PATTERN ... ] ]
+</verb></tscreen>
+
+<p>
+<tt/PATTERN/ is shell style pattern, selecting identifier
+of SNMP variables or interfaces to show. Variable is displayed
+if one of patterns matches its name. If no patterns are given,
+<tt/Xstat/ assumes that user wants to see all the variables.
+
+<p> <tt/OPTIONS/ is list of single letter options, using common unix
+conventions.
+
+<itemize>
+<item><tt/-h/ - show help page
+<item><tt/-?/ - the same, of course
+<item><tt/-v/, <tt/-V/ - print version of <tt/Xstat/ and exit
+<item><tt/-z/ - dump zero counters too. By default they are not shown.
+<item><tt/-a/ - dump absolute values of counters. By default <tt/Xstat/
+ calculates increments since the previous use.
+<item><tt/-s/ - do not update history, so that the next time you will
+ see counters including values accumulated to the moment
+ of this measurement too.
+<item><tt/-n/ - do not display anything, only update history.
+<item><tt/-r/ - reset history.
+<item><tt/-d INTERVAL/ - <tt/Xstat/ is run in daemon mode collecting
+ statistics. <tt/INTERVAL/ is interval between measurements
+ in seconds.
+<item><tt/-t INTERVAL/ - time interval to average rates. Default value
+ is 60 seconds.
+<item><tt/-e/ - display extended information about errors (<tt/ifstat/ only).
+</itemize>
+
+<p>
+History is just dump saved in file <tt>/tmp/.Xstat.uUID</tt>
+or in file given by environment variables <tt/NSTAT_HISTORY/,
+<tt/IFSTAT_HISTORY/ and <tt/RTACCT_HISTORY/.
+Each time when you use <tt/Xstat/ values there are updated.
+If you use patterns, only the values which you _really_ see
+are updated. If you want to skip an uninteresting period,
+use option <tt/-n/, or just output to <tt>/dev/null</tt>.
+
+<p>
+<tt/Xstat/ understands when history is invalidated by system reboot
+or source of information switched between different instances
+of daemonic <tt/Xstat/ and kernel SNMP tables and does not
+use invalid history.
+
+<p> Beware, <tt/Xstat/ will not produce sane output,
+when many processes use it simultaneously. If several processes
+under single user need this utility they should use environment
+variables to put their history in safe places
+or to use it with options <tt/-a -s/.
+
+<p>
+Well, that's all. The utility is very simple, but nevertheless
+very handy.
+
+<p> <bf/Output of XSTAT/
+<p> The first line of output is <tt/#/ followed by identifier
+of source of information, it may be word <tt/kernel/, when <tt/Xstat/
+gets information from kernel or some dotted decimal number followed
+by parameters, when it obtains information from running <tt/Xstat/ daemon.
+
+<p>In the case of <tt/nstat/ the rest of output consists of three columns:
+SNMP MIB identifier,
+its value (or increment since previous measurement) and average
+rate of increase of the counter per second. <tt/ifstat/ outputs
+interface name followed by pairs of counter and rate of its change.
+
+<p> <bf/Daemonic Xstat/
+<p> <tt/Xstat/ may be started as daemon by any user. This makes sense
+to avoid wrapped counters and to obtain reasonable long counters
+for large time. Also <tt/Xstat/ daemon calculates average rates.
+For the first goal sampling interval (option <tt/-d/) may be large enough,
+f.e. for gigabit rates byte counters overflow not more frequently than
+each 40 seconds and you may select interval of 20 seconds.
+On the other hand, when <tt/Xstat/ is used for estimating rates
+interval should be less than averaging period (option <tt/-t/), otherwise
+estimation loses in quality.
+
+Client <tt/Xstat/, before trying to get information from the kernel,
+contacts daemon started by this user, then it tries system wide
+daemon, which is supposed to be started by superuser. And only if
+none of them replied it gets information from kernel.
+
+<p> <bf/Environment/
+<p> <tt/NSTAT_HISTORY/ - name of history file for <tt/nstat/.
+<p> <tt/IFSTAT_HISTORY/ - name of history file for <tt/ifstat/.
+<p> <tt/RTACCT_HISTORY/ - name of history file for <tt/rtacct/.
+
+</article>
diff --git a/doc/preamble.tex b/doc/preamble.tex
index e69de29b..80ca5087 100644
--- a/doc/preamble.tex
+++ b/doc/preamble.tex
@@ -0,0 +1,26 @@
+\textwidth 6.0in
+\textheight 8.5in
+
+\input SNAPSHOT
+
+\pagestyle{myheadings}
+\markboth{\protect\TITLE}{}
+\markright{{\protect\sc iproute2-ss\Draft}}
+
+% To print it in compact form: both sides on one sheet (psnup -2)
+\evensidemargin=\oddsidemargin
+
+\newenvironment{NB}{\bgroup \vskip 1mm\leftskip 1cm \footnotesize \noindent NB.
+}{\par\egroup \vskip 1mm}
+
+\def\threeonly{[2.3.15+ only] }
+
+\begin{document}
+
+\makeatletter
+\renewcommand{\@oddhead}{{\protect\sc iproute2-ss\Draft} \hfill \protect\arabic{page}}
+\makeatother
+\let\oldthefootnote\thefootnote
+\def\thefootnote{}
+\footnotetext{Copyright \copyright~1999 A.N.Kuznetsov}
+
diff --git a/doc/rtstat.sgml b/doc/rtstat.sgml
index e69de29b..07391c39 100644
--- a/doc/rtstat.sgml
+++ b/doc/rtstat.sgml
@@ -0,0 +1,52 @@
+<!doctype linuxdoc system>
+
+<article>
+
+<title>RTSTAT Utility
+<author>Robert Olsson
+<date>some_negative_number, 20 Dec 2001
+
+<p>
+Here is some code for monitoring the route cache. For systems handling high
+network load, servers, routers, firewalls etc the route cache and its garbage
+collection is crucial. Linux has a solid implementation.
+
+<p>
+The kernel patch (not required since linux-2.4.7) adds statistics counters
+from route cache process into
+/proc/net/rt_cache_stat. A companion user mode program presents the statistics
+in a vmstat or iostat manner. The ratio between cache hits and misses gives
+the flow length.
+
+<p>
+Hopefully it can help understanding performance and DoS and other related
+issues.
+
+<p> An URL where newer versions of this utility can be (probably) found
+is ftp://robur.slu.se/pub/Linux/net-development/rt_cache_stat/
+
+
+<p><bf/Description/
+
+<p>The format of the command is:
+
+<tscreen><verb>
+ rtstat [ OPTIONS ]
+</verb></tscreen>
+
+<p> <tt/OPTIONS/ are:
+
+<itemize>
+
+<item><tt/-h/, <tt/-help/ - show help page and version of the utility.
+
+<item><tt/-i INTERVAL/ - interval between snapshots, default value is
+2 seconds.
+
+<item><tt/-s NUMBER/ - whether to print header line. 0 inhibits header line,
+1 prescribes to print it once and 2 (this is default setting) forces header
+line each 20 lines.
+
+</itemize>
+
+</article>
diff --git a/doc/ss.sgml b/doc/ss.sgml
index e69de29b..0b1b5335 100644
--- a/doc/ss.sgml
+++ b/doc/ss.sgml
@@ -0,0 +1,525 @@
+<!doctype linuxdoc system>
+
+<article>
+
+<title>SS Utility: Quick Intro
+<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
+<date>some_negative_number, 20 Sep 2001
+<abstract>
+<tt/ss/ is yet another utility to investigate sockets.
+Functionally it is NOT better than <tt/netstat/ combined
+with some perl/awk scripts and though it is surely faster
+it is not enough to make it much better. :-)
+So, stop reading this now and do not waste your time.
+Well, certainly, it proposes some functionality, which current
+netstat is still not able to do, but surely will soon.
+</abstract>
+
+<sect>Why?
+
+<p> <tt>/proc</tt> interface is inadequate, unfortunately.
+When amount of sockets is large enough, <tt/netstat/ or even
+plain <tt>cat /proc/net/tcp/</tt> cause nothing but pains and curses.
+In linux-2.4 the disease became worse: even if amount
+of sockets is small reading <tt>/proc/net/tcp/</tt> is slow enough.
+
+This utility presents a new approach, which is supposed to scale
+well. I am not going to describe technical details here and
+will concentrate on description of the command.
+The only important thing to say is that it is not so bad idea
+to load module <tt/tcp_diag/, which can be found in directory
+<tt/Modules/ of <tt/iproute2/. If you do not do this, <tt/ss/
+will work, but it falls back to <tt>/proc</tt> and becomes slow
+like <tt/netstat/, well, a bit faster yet (see section "Some numbers").
+
+<sect>Old news
+
+<p>
+In the simplest form <tt/ss/ is equivalent to netstat
+with some small deviations.
+
+<itemize>
+<item><tt/ss -t -a/ dumps all TCP sockets
+<item><tt/ss -u -a/ dumps all UDP sockets
+<item><tt/ss -w -a/ dumps all RAW sockets
+<item><tt/ss -x -a/ dumps all UNIX sockets
+</itemize>
+
+<p>
+Option <tt/-o/ shows TCP timers state.
+Option <tt/-e/ shows some extended information.
+Etc. etc. etc. Seems, all the options of netstat related to sockets
+are supported. Though not AX.25 and other bizarres. :-)
+If someone wants, he can make support for decnet and ipx.
+Some rudimentary support for them is already present in iproute2 libutils,
+and I will be glad to see these new members.
+
+<p>
+However, standard functionality is a bit different:
+
+<p>
+The first: without option <tt/-a/ sockets in states
+<tt/TIME-WAIT/ and <tt/SYN-RECV/ are skipped too.
+It is more reasonable default, I think.
+
+<p>
+The second: format of UNIX sockets is different. It coincides
+with tcp/udp. Though standard kernel still does not allow to
+see write/read queues and peer address of connected UNIX sockets,
+the patch doing this exists.
+
+<p>
+The third: default is to dump only TCP sockets, rather than all of the types.
+
+<p>
+The next: by default it does not resolve numeric host addresses (like <tt/ip/)!
+Resolving is enabled with option <tt/-r/. Service names, usually stored
+in local files, are resolved by default. Also, if service database
+does not contain references to a port, <tt/ss/ queries system
+<tt/rpcbind/. RPC services are prefixed with <tt/rpc./
+Resolution of services may be suppressed with option <tt/-n/.
+
+<p>
+It does not accept "long" options (I dislike them, sorry).
+So, address family is given with family identifier following
+option <tt/-f/ to be aligned to iproute2 conventions.
+Mostly, it is to allow option parser to parse
+addresses correctly, but as side effect it really limits dumping
+to sockets supporting only given family. Option <tt/-A/ followed
+by list of socket tables to dump is also supported.
+Logically, id of socket table is different from _address_ family, which is
+another point of incompatibility. So, id is one of
+<tt/all/, <tt/tcp/, <tt/udp/,
+<tt/raw/, <tt/inet/, <tt/unix/, <tt/packet/, <tt/netlink/. See?
+Well, <tt/inet/ is just abbreviation for <tt/tcp|udp|raw/
+and it is not difficult to guess that <tt/packet/ allows
+to look at packet sockets. Actually, there are also some other abbreviations,
+f.e. <tt/unix_dgram/ selects only datagram UNIX sockets.
+
+<p>
+The next: well, I still do not know. :-)
+
+
+
+
+<sect>Time to talk about new functionality.
+
+<p>It is builtin filtering of socket lists.
+
+<sect1> Filtering by state.
+
+<p>
+<tt/ss/ allows to filter socket states, using keywords
+<tt/state/ and <tt/exclude/, followed by some state
+identifier.
+
+<p>
+State identifier are standard TCP state names (not listed,
+they are useless for you if you already do not know them)
+or abbreviations:
+
+<itemize>
+<item><tt/all/ - for all the states
+<item><tt/bucket/ - for TCP minisockets (<tt/TIME-WAIT|SYN-RECV/)
+<item><tt/big/ - all except for minisockets
+<item><tt/connected/ - not closed and not listening
+<item><tt/synchronized/ - connected and not <tt/SYN-SENT/
+</itemize>
+
+<p>
+ F.e. to dump all tcp sockets except <tt/SYN-RECV/:
+
+<tscreen><verb>
+ ss exclude SYN-RECV
+</verb></tscreen>
+
+<p>
+ If neither <tt/state/ nor <tt/exclude/ directives
+ are present,
+ state filter defaults to <tt/all/ with option <tt/-a/
+ or to <tt/all/,
+ excluding listening, syn-recv, time-wait and closed sockets.
+
+<sect1> Filtering by addresses and ports.
+
+<p>
+Option list may contain address/port filter.
+It is boolean expression which consists of boolean operation
+<tt/or/, <tt/and/, <tt/not/ and predicates.
+Actually, all the flavors of names for boolean operations are eaten:
+<tt/&amp/, <tt/&amp&amp/, <tt/|/, <tt/||/, <tt/!/, but do not forget
+about special sense given to these symbols by unix shells and escape
+them correctly, when used from command line.
+
+<p>
+Predicates may be of the following kinds:
+
+<itemize>
+<item>A. Address/port match, where address is checked against mask
+ and port is either wildcard or exact. It is one of:
+
+<tscreen><verb>
+ dst prefix:port
+ src prefix:port
+ src unix:STRING
+ src link:protocol:ifindex
+ src nl:channel:pid
+</verb></tscreen>
+
+ Both prefix and port may be absent or replaced with <tt/*/,
+ which means wildcard. UNIX socket use more powerful scheme
+ matching to socket names by shell wildcards. Also, prefixes
+ unix: and link: may be omitted, if address family is evident
+ from context (with option <tt/-x/ or with <tt/-f unix/
+ or with <tt/unix/ keyword)
+
+<p>
+ F.e.
+
+<tscreen><verb>
+ dst 10.0.0.1
+ dst 10.0.0.1:
+ dst 10.0.0.1/32:
+ dst 10.0.0.1:*
+</verb></tscreen>
+ are equivalent and mean socket connected to
+ any port on host 10.0.0.1
+
+<tscreen><verb>
+ dst 10.0.0.0/24:22
+</verb></tscreen>
+ sockets connected to port 22 on network
+ 10.0.0.0...255.
+
+<p>
+ Note that port is separated from address with colon, which creates
+ troubles with IPv6 addresses. Generally, we interpret the last
+ colon as splitting port. To allow to give IPv6 addresses,
+ trick like used in IPv6 HTTP URLs may be used:
+
+<tscreen><verb>
+ dst [::1]
+</verb></tscreen>
+ are sockets connected to ::1 on any port
+
+<p>
+ Another way is <tt>dst ::1/128</tt>. The slash helps to understand that
+ colon is part of IPv6 address.
+
+<p>
+ Now we can add another alias for <tt/dst 10.0.0.1/:
+ <tt/dst [10.0.0.1]/. :-)
+
+<p> Address may be a DNS name. In this case all the addresses are looked
+ up (in all the address families, if it is not limited by option <tt/-f/
+ or special address prefix <tt/inet:/, <tt/inet6/) and resulting
+ expression is <tt/or/ over all of them.
+
+<item> B. Port expressions:
+<tscreen><verb>
+ dport &gt= :1024
+ dport != :22
+ sport &lt :32000
+</verb></tscreen>
+ etc.
+
+ All the relations: <tt/&lt/, <tt/&gt/, <tt/&lt=/, <tt/>=/, <tt/=/, <tt/==/,
+ <tt/!=/, <tt/eq/, <tt/ge/, <tt/lt/, <tt/ne/...
+ Use variant which you like more, but not forget to escape special
+ characters when typing them in command line. :-)
+
+ Note that port number syntactically coincides to the case A!
+ You may even add an IP address, but it will not participate
+ in comparison, except for <tt/==/ and <tt/!=/, which are equivalent
+ to corresponding predicates of type A. F.e.
+<p>
+<tt/dst 10.0.0.1:22/
+ is equivalent to <tt/dport eq 10.0.0.1:22/
+ and
+ <tt/not dst 10.0.0.1:22/ is equivalent to
+ <tt/dport neq 10.0.0.1:22/
+
+<item>C. Keyword <tt/autobound/. It matches to sockets bound automatically
+ on local system.
+
+</itemize>
+
+
+<sect> Examples
+
+<p>
+<itemize>
+<item>1. List all the tcp sockets in state <tt/FIN-WAIT-1/ for our apache
+ to network 193.233.7/24 and look at their timers:
+
+<tscreen><verb>
+ ss -o state fin-wait-1 \( sport = :http or sport = :https \) \
+ dst 193.233.7/24
+</verb></tscreen>
+
+ Oops, forgot to say that missing logical operation is
+ equivalent to <tt/and/.
+
+<item> 2. Well, now look at the rest...
+
+<tscreen><verb>
+ ss -o excl fin-wait-1
+ ss state fin-wait-1 \( sport neq :http and sport neq :https \) \
+ or not dst 193.233.7/24
+</verb></tscreen>
+
+ Note that we have to do _two_ calls of ss to do this.
+ State match is always anded to address/port match.
+ The reason for this is purely technical: ss does fast skip of
+ not matching states before parsing addresses and I consider the
+ ability to quickly skip gobs of time-wait and syn-recv sockets
+ as more important than logical generality.
+
+<item> 3. So, let's look at all our sockets using autobound ports:
+
+<tscreen><verb>
+ ss -a -A all autobound
+</verb></tscreen>
+
+
+<item> 4. And eventually find all the local processes connected
+ to local X servers:
+
+<tscreen><verb>
+ ss -xp dst "/tmp/.X11-unix/*"
+</verb></tscreen>
+
+ Pardon, this does not work with current kernel, patching is required.
+ But we still can look at server side:
+
+<tscreen><verb>
+ ss -x src "/tmp/.X11-unix/*"
+</verb></tscreen>
+
+</itemize>
+
+
+<sect> Returning to ground: real manual
+
+<p>
+<sect1> Command arguments
+
+<p> General format of arguments to <tt/ss/ is:
+
+<tscreen><verb>
+ ss [ OPTIONS ] [ STATE-FILTER ] [ ADDRESS-FILTER ]
+</verb></tscreen>
+
+<sect2><tt/OPTIONS/
+<p> <tt/OPTIONS/ is list of single letter options, using common unix
+conventions.
+
+<itemize>
+<item><tt/-h/ - show help page
+<item><tt/-?/ - the same, of course
+<item><tt/-v/, <tt/-V/ - print version of <tt/ss/ and exit
+<item><tt/-s/ - print summary statistics. This option does not parse
+socket lists obtaining summary from various sources. It is useful
+when amount of sockets is so huge that parsing <tt>/proc/net/tcp</tt>
+is painful.
+<item><tt/-D FILE/ - do not display anything, just dump raw information
+about TCP sockets to <tt/FILE/ after applying filters. If <tt/FILE/ is <tt/-/
+<tt/stdout/ is used.
+<item><tt/-F FILE/ - read continuation of filter from <tt/FILE/.
+Each line of <tt/FILE/ is interpreted like single command line option.
+If <tt/FILE/ is <tt/-/ <tt/stdin/ is used.
+<item><tt/-r/ - try to resolve numeric address/ports
+<item><tt/-n/ - do not try to resolve ports
+<item><tt/-o/ - show some optional information, f.e. TCP timers
+<item><tt/-i/ - show some information specific to TCP (RTO, congestion
+window, slow start threshold etc.)
+<item><tt/-e/ - show even more optional information
+<item><tt/-m/ - show extended information on memory used by the socket.
+It is available only with <tt/tcp_diag/ enabled.
+<item><tt/-p/ - show list of processes owning the socket
+<item><tt/-f FAMILY/ - default address family used for parsing addresses.
+ Also this option limits listing to sockets supporting
+ given address family. Currently the following families
+ are supported: <tt/unix/, <tt/inet/, <tt/inet6/, <tt/link/,
+ <tt/netlink/.
+<item><tt/-4/ - alias for <tt/-f inet/
+<item><tt/-6/ - alias for <tt/-f inet6/
+<item><tt/-0/ - alias for <tt/-f link/
+<item><tt/-A LIST-OF-TABLES/ - list of socket tables to dump, separated
+ by commas. The following identifiers are understood:
+ <tt/all/, <tt/inet/, <tt/tcp/, <tt/udp/, <tt/raw/,
+ <tt/unix/, <tt/packet/, <tt/netlink/, <tt/unix_dgram/,
+ <tt/unix_stream/, <tt/packet_raw/, <tt/packet_dgram/.
+<item><tt/-x/ - alias for <tt/-A unix/
+<item><tt/-t/ - alias for <tt/-A tcp/
+<item><tt/-u/ - alias for <tt/-A udp/
+<item><tt/-w/ - alias for <tt/-A raw/
+<item><tt/-a/ - show sockets of all the states. By default sockets
+ in states <tt/LISTEN/, <tt/TIME-WAIT/, <tt/SYN-RECV/
+ and <tt/CLOSE/ are skipped.
+<item><tt/-l/ - show only sockets in state <tt/LISTEN/
+</itemize>
+
+<sect2><tt/STATE-FILTER/
+
+<p><tt/STATE-FILTER/ allows to construct arbitrary set of
+states to match. Its syntax is sequence of keywords <tt/state/
+and <tt/exclude/ followed by identifier of state.
+Available identifiers are:
+
+<p>
+<itemize>
+<item> All standard TCP states: <tt/established/, <tt/syn-sent/,
+<tt/syn-recv/, <tt/fin-wait-1/, <tt/fin-wait-2/, <tt/time-wait/,
+<tt/closed/, <tt/close-wait/, <tt/last-ack/, <tt/listen/ and <tt/closing/.
+
+<item><tt/all/ - for all the states
+<item><tt/connected/ - all the states except for <tt/listen/ and <tt/closed/
+<item><tt/synchronized/ - all the <tt/connected/ states except for
+<tt/syn-sent/
+<item><tt/bucket/ - states, which are maintained as minisockets, i.e.
+<tt/time-wait/ and <tt/syn-recv/.
+<item><tt/big/ - opposite to <tt/bucket/
+</itemize>
+
+<sect2><tt/ADDRESS_FILTER/
+
+<p><tt/ADDRESS_FILTER/ is boolean expression with operations <tt/and/, <tt/or/
+and <tt/not/, which can be abbreviated in C style f.e. as <tt/&amp/,
+<tt/&amp&amp/.
+
+<p>
+Predicates check socket addresses, both local and remote.
+There are the following kinds of predicates:
+
+<itemize>
+<item> <tt/dst ADDRESS_PATTERN/ - matches remote address and port
+<item> <tt/src ADDRESS_PATTERN/ - matches local address and port
+<item> <tt/dport RELOP PORT/ - compares remote port to a number
+<item> <tt/sport RELOP PORT/ - compares local port to a number
+<item> <tt/autobound/ - checks that socket is bound to an ephemeral
+ port
+</itemize>
+
+<p><tt/RELOP/ is some of <tt/&lt=/, <tt/&gt=/, <tt/==/ etc.
+To make this more convenient for use in unix shell, alphabetic
+FORTRAN-like notations <tt/le/, <tt/gt/ etc. are accepted as well.
+
+<p>The format and semantics of <tt/ADDRESS_PATTERN/ depends on address
+family.
+
+<itemize>
+<item><tt/inet/ - <tt/ADDRESS_PATTERN/ consists of IP prefix, optionally
+followed by colon and port. If prefix or port part is absent or replaced
+with <tt/*/, this means wildcard match.
+<item><tt/inet6/ - The same as <tt/inet/, only prefix refers to an IPv6
+address. Unlike <tt/inet/ colon becomes ambiguous, so that <tt/ss/ allows
+to use scheme, like used in URLs, where address is surrounded with
+<tt/[/ ... <tt/]/.
+<item><tt/unix/ - <tt/ADDRESS_PATTERN/ is shell-style wildcard.
+<item><tt/packet/ - format looks like <tt/inet/, only interface index
+stays instead of port and link layer protocol id instead of address.
+<item><tt/netlink/ - format looks like <tt/inet/, only socket pid
+stays instead of port and netlink channel instead of address.
+</itemize>
+
+<p><tt/PORT/ is syntactically <tt/ADDRESS_PATTERN/ with wildcard
+address part. Certainly, it is undefined for UNIX sockets.
+
+<sect1> Environment variables
+
+<p>
+<tt/ss/ allows to change source of information using various
+environment variables:
+
+<p>
+<itemize>
+<item> <tt/PROC_SLABINFO/ to override <tt>/proc/slabinfo</tt>
+<item> <tt/PROC_NET_TCP/ to override <tt>/proc/net/tcp</tt>
+<item> <tt/PROC_NET_UDP/ to override <tt>/proc/net/udp</tt>
+<item> etc.
+</itemize>
+
+<p>
+Variable <tt/PROC_ROOT/ allows to change root of all the <tt>/proc/</tt>
+hierarchy.
+
+<p>
+Variable <tt/TCPDIAG_FILE/ prescribes to open a file instead of
+requesting kernel to dump information about TCP sockets.
+
+
+<p> This option is used mainly to investigate bug reports,
+when dumps of files usually found in <tt>/proc/</tt> are received
+by e-mail.
+
+<sect1> Output format
+
+<p>Six columns. The first is <tt/Netid/, it denotes socket type and
+transport protocol, when it is ambiguous: <tt/tcp/, <tt/udp/, <tt/raw/,
+<tt/u_str/ is abbreviation for <tt/unix_stream/, <tt/u_dgr/ for UNIX
+datagram sockets, <tt/nl/ for netlink, <tt/p_raw/ and <tt/p_dgr/ for
+raw and datagram packet sockets. This column is optional, it will
+be hidden, if filter selects a unique netid.
+
+<p>
+The second column is <tt/State/. Socket state is displayed here.
+The names are standard TCP names, except for <tt/UNCONN/, which
+cannot happen for TCP, but normal for not connected sockets
+of other types. Again, this column can be hidden.
+
+<p>
+Then two columns (<tt/Recv-Q/ and <tt/Send-Q/) showing amount of data
+queued for receive and transmit.
+
+<p>
+And the last two columns display local address and port of the socket
+and its peer address, if the socket is connected.
+
+<p>
+If options <tt/-o/, <tt/-e/ or <tt/-p/ were given, options are
+displayed not in fixed positions but separated by spaces pairs:
+<tt/option:value/. If value is not a single number, it is presented
+as list of values, enclosed to <tt/(/ ... <tt/)/ and separated with
+commas. F.e.
+
+<tscreen><verb>
+ timer:(keepalive,111min,0)
+</verb></tscreen>
+is typical format for TCP timer (option <tt/-o/).
+
+<tscreen><verb>
+ users:((X,113,3))
+</verb></tscreen>
+is typical for list of users (option <tt/-p/).
+
+
+<sect>Some numbers
+
+<p>
+Well, let us use <tt/pidentd/ and a tool <tt/ibench/ to measure
+its performance. It is 30 requests per second here. Nothing to test,
+it is too slow. OK, let us patch pidentd with patch from directory
+Patches. After this it handles about 4300 requests per second
+and becomes handy tool to pollute socket tables with lots of timewait
+buckets.
+
+<p>
+So, each test starts from polluting tables with 30000 sockets
+and then doing full dump of the table piped to wc and measuring
+timings with time:
+
+<p>Results:
+
+<itemize>
+<item> <tt/netstat -at/ - 15.6 seconds
+<item> <tt/ss -atr/, but without <tt/tcp_diag/ - 5.4 seconds
+<item> <tt/ss -atr/ with <tt/tcp_diag/ - 0.47 seconds
+</itemize>
+
+No comments. Though one comment is necessary, most of time
+without <tt/tcp_diag/ is wasted inside kernel with completely
+blocked networking. More than 10 seconds, yes. <tt/tcp_diag/
+does the same work for 100 milliseconds of system time.
+
+</article>
diff --git a/etc/iproute2/rt_dsfield b/etc/iproute2/rt_dsfield
index e69de29b..110061a8 100644
--- a/etc/iproute2/rt_dsfield
+++ b/etc/iproute2/rt_dsfield
@@ -0,0 +1,13 @@
+0x10 lowdelay
+0x08 throughput
+0x04 reliability
+# This value overlaps with ECT, do not use it!
+0x02 mincost
+# These values seem not to want to die; Cisco likes them for some strange reason.
+0x20 priority
+0x40 immediate
+0x60 flash
+0x80 flash-override
+0xa0 critical
+0xc0 internet
+0xe0 network
diff --git a/etc/iproute2/rt_protos b/etc/iproute2/rt_protos
index e69de29b..8c985d79 100644
--- a/etc/iproute2/rt_protos
+++ b/etc/iproute2/rt_protos
@@ -0,0 +1,25 @@
+#
+# Reserved protocols.
+#
+0 unspec
+1 redirect
+2 kernel
+3 boot
+4 static
+8 gated
+9 ra
+10 mrt
+11 zebra
+12 bird
+#
+# Used by me for gated
+#
+254 gated/aggr
+253 gated/bgp
+252 gated/ospf
+251 gated/ospfase
+250 gated/rip
+249 gated/static
+248 gated/conn
+247 gated/inet
+246 gated/default
diff --git a/etc/iproute2/rt_realms b/etc/iproute2/rt_realms
index e69de29b..eedd76d2 100644
--- a/etc/iproute2/rt_realms
+++ b/etc/iproute2/rt_realms
@@ -0,0 +1,13 @@
+#
+# reserved values
+#
+0 cosmos
+#
+# local
+#
+#1 inr.ac
+#2 inr.ruhep
+#3 freenet
+#4 radio-msu
+#5 russia
+#6 internet
diff --git a/etc/iproute2/rt_scopes b/etc/iproute2/rt_scopes
index e69de29b..8514bc11 100644
--- a/etc/iproute2/rt_scopes
+++ b/etc/iproute2/rt_scopes
@@ -0,0 +1,11 @@
+#
+# reserved values
+#
+0 global
+255 nowhere
+254 host
+253 link
+#
+# pseudo-reserved
+#
+200 site
diff --git a/etc/iproute2/rt_tables b/etc/iproute2/rt_tables
index e69de29b..541abfd2 100644
--- a/etc/iproute2/rt_tables
+++ b/etc/iproute2/rt_tables
@@ -0,0 +1,11 @@
+#
+# reserved values
+#
+255 local
+254 main
+253 default
+0 unspec
+#
+# local
+#
+#1 inr.ruhep
diff --git a/examples/SYN-DoS.rate.limit b/examples/SYN-DoS.rate.limit
index e69de29b..8766b679 100644
--- a/examples/SYN-DoS.rate.limit
+++ b/examples/SYN-DoS.rate.limit
@@ -0,0 +1,49 @@
+#! /bin/sh -x
+#
+# sample script on using the ingress capabilities
+# this script shows how one can rate limit incoming SYNs
+# Useful for TCP-SYN attack protection. You can use
+# IPchains to have more powerful additions to the SYN (eg
+# in addition the subnet)
+#
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
+INDEV=eth2
+#
+# tag all incoming SYN packets through $INDEV as mark value 1
+############################################################
+$IPCHAINS -A input -i $INDEV -y -m 1
+############################################################
+#
+# install the ingress qdisc on the ingress interface
+############################################################
+$TC qdisc add dev $INDEV handle ffff: ingress
+############################################################
+
+#
+#
+# SYN packets are 40 bytes (320 bits) so three SYNs equals
+# 960 bits (approximately 1kbit); so we rate limit below
+# the incoming SYNs to 3/sec (not very useful really; but
+#serves to show the point) - JHS
+############################################################
+$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 1 fw \
+police rate 1kbit burst 40 mtu 9k drop flowid :1
+############################################################
+
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent ffff:
+
+#deleting the ingress qdisc
+#$TC qdisc del dev $INDEV ingress
diff --git a/examples/cbqinit.eth1 b/examples/cbqinit.eth1
index e69de29b..226ec1c5 100644
--- a/examples/cbqinit.eth1
+++ b/examples/cbqinit.eth1
@@ -0,0 +1,76 @@
+#! /bin/sh
+
+TC=/home/root/tc
+IP=/home/root/ip
+DEVICE=eth1
+BANDWIDTH="bandwidth 10Mbit"
+
+# Attach CBQ on $DEVICE. It will have handle 1:.
+# $BANDWIDTH is real $DEVICE bandwidth (10Mbit).
+# avpkt is average packet size.
+# mpu is minimal packet size.
+
+$TC qdisc add dev $DEVICE root handle 1: cbq \
+$BANDWIDTH avpkt 1000 mpu 64
+
+# Create root class with classid 1:1. This step is not necessary.
+# bandwidth is the same as on CBQ itself.
+# rate == all the bandwidth
+# allot is MTU + MAC header
+# maxburst measure allowed class burstiness (please,read S.Floyd and VJ papers)
+# est 1sec 8sec means, that kernel will evaluate average rate
+# on this class with period 1sec and time constant 8sec.
+# This rate is viewed with "tc -s class ls dev $DEVICE"
+
+$TC class add dev $DEVICE parent 1:0 classid :1 est 1sec 8sec cbq \
+$BANDWIDTH rate 10Mbit allot 1514 maxburst 50 avpkt 1000
+
+# Bulk.
+# New parameters are:
+# weight, which is set to be proportional to
+# "rate". It is not necessary, weight=1 will work as well.
+# defmap and split say that best effort traffic, not classified
+# by another means will fall to this class.
+
+$TC class add dev $DEVICE parent 1:1 classid :2 est 1sec 8sec cbq \
+$BANDWIDTH rate 4Mbit allot 1514 weight 500Kbit \
+prio 6 maxburst 50 avpkt 1000 split 1:0 defmap ff3d
+
+# OPTIONAL.
+# Attach "sfq" qdisc to this class, quantum is MTU, perturb
+# gives period of hash function perturbation in seconds.
+#
+$TC qdisc add dev $DEVICE parent 1:2 sfq quantum 1514b perturb 15
+
+# Interactive-burst class
+
+$TC class add dev $DEVICE parent 1:1 classid :3 est 2sec 16sec cbq \
+$BANDWIDTH rate 1Mbit allot 1514 weight 100Kbit \
+prio 2 maxburst 100 avpkt 1000 split 1:0 defmap c0
+
+$TC qdisc add dev $DEVICE parent 1:3 sfq quantum 1514b perturb 15
+
+# Background.
+
+$TC class add dev $DEVICE parent 1:1 classid :4 est 1sec 8sec cbq \
+ $BANDWIDTH rate 100Kbit allot 1514 weight 10Mbit \
+ prio 7 maxburst 10 avpkt 1000 split 1:0 defmap 2
+
+$TC qdisc add dev $DEVICE parent 1:4 sfq quantum 1514b perturb 15
+
+# Realtime class for RSVP
+
+$TC class add dev $DEVICE parent 1:1 classid 1:7FFE cbq \
+rate 5Mbit $BANDWIDTH allot 1514b avpkt 1000 \
+maxburst 20
+
+# Reclassified realtime traffic
+#
+# New element: split is not 1:0, but 1:7FFE. It means,
+# that only real-time packets, which violated policing filters
+# or exceeded reshaping buffers will fall to it.
+
+$TC class add dev $DEVICE parent 1:7FFE classid 1:7FFF est 4sec 32sec cbq \
+rate 1Mbit $BANDWIDTH allot 1514b avpkt 1000 weight 10Kbit \
+prio 6 maxburst 10 split 1:7FFE defmap ffff
+
diff --git a/examples/dhcp-client-script b/examples/dhcp-client-script
index e69de29b..7207b57d 100644
--- a/examples/dhcp-client-script
+++ b/examples/dhcp-client-script
@@ -0,0 +1,446 @@
+#!/bin/bash
+#
+# dhclient-script for Linux.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version
+# 2 of the License, or (at your option) any later version.
+#
+# Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+#
+# Probably, I did not understand, what this funny feature as "alias"
+# means exactly. For now I suppose, that it is a static address, which
+# we should install and preserve.
+#
+
+exec >> /tmp/DHS.log 2>&1
+
+echo dhc-script $* reason=$reason
+set | grep "^\(old_\|new_\|check_\)"
+
+LOG () {
+ echo LOG $* ;
+}
+
+# convert 8bit mask to length
+# arg: $1 = mask
+#
+Mask8ToLen() {
+ local l=0;
+
+ while [ $l -le 7 ]; do
+ if [ $[ ( 1 << $l ) + $1 ] -eq 256 ]; then
+ return $[ 8 - $l ]
+ fi
+ l=$[ $l + 1 ]
+ done
+ return 0;
+}
+
+# convert inet dotted quad mask to length
+# arg: $1 = dotquad mask
+#
+MaskToLen() {
+ local masklen=0
+ local mask8=$1
+
+ case $1 in
+ 0.0.0.0)
+ return 0;
+ ;;
+ 255.*.0.0)
+ masklen=8
+ mask8=${mask8#255.}
+ mask8=${mask8%.0.0}
+ ;;
+ 255.255.*.0)
+ masklen=16
+ mask8=${mask8#255.255.}
+ mask8=${mask8%.0}
+ ;;
+ 255.255.255.*)
+ masklen=24
+ mask8=${mask8#255.255.255.}
+ ;;
+ *)
+ return 255
+ ;;
+ esac
+ Mask8ToLen $mask8
+ return $[ $? + $masklen ]
+}
+
+# calculate ABC "natural" mask
+# arg: $1 = dotquad address
+#
+ABCMask () {
+ local class;
+
+ class=${1%%.*}
+
+ if [ "$1" = "255.255.255.255" ]; then
+ echo $1
+ elif [ "$1" = "0.0.0.0" ]; then
+ echo $1
+ elif [ $class -ge 224 ]; then
+ echo 240.0.0.0
+ elif [ $class -ge 192 ]; then
+ echo 255.255.255.0
+ elif [ $class -ge 128 ]; then
+ echo 255.255.0.0
+ else
+ echo 255.0.0.0
+ fi
+}
+
+# calculate ABC "natural" mask length
+# arg: $1 = dotquad address
+#
+ABCMaskLen () {
+ local class;
+
+ class=${1%%.*}
+
+ if [ "$1" = "255.255.255.255" ]; then
+ return 32
+ elif [ "$1" = "0.0.0.0" ]; then
+ return 0
+ elif [ $class -ge 224 ]; then
+ return 4;
+ elif [ $class -ge 192 ]; then
+ return 24;
+ elif [ $class -ge 128 ]; then
+ return 16;
+ else
+ return 8;
+ fi
+}
+
+# Delete IP address
+# args: $1 = interface
+# $2 = address
+# $3 = mask
+# $4 = broadcast
+# $5 = label
+#
+DelINETAddr () {
+ local masklen=32
+ local addrid=$1
+
+ LOG DelINETAddr $*
+
+ if [ "$5" ]; then
+ addrid=$addrid:$5
+ fi
+ LOG ifconfig $addrid down
+ ifconfig $addrid down
+}
+
+# Add IP address
+# args: $1 = interface
+# $2 = address
+# $3 = mask
+# $4 = broadcast
+# $5 = label
+#
+AddINETAddr () {
+ local mask_arg
+ local brd_arg
+ local addrid=$1
+
+ LOG AddINETAddr $*
+
+ if [ "$5" ]; then
+ addrid=$addrid:$5
+ fi
+ if [ "$3" ]; then
+ mask_arg="netmask $3"
+ fi
+ if [ "$4" ]; then
+ brd_arg="broadcast $4"
+ fi
+
+ LOG ifconfig $addrid $2 $mask_arg $brd_arg up
+ ifconfig $addrid $2 $mask_arg $brd_arg up
+}
+
+# Add default routes
+# args: $1 = routers list
+#
+AddDefaultRoutes() {
+ local router
+
+ if [ "$1" ]; then
+ LOG AddDefaultRoutes $*
+ for router in $1; do
+ LOG route add default gw $router
+ route add default gw $router
+ done ;
+ fi
+}
+
+# Delete default routes
+# args: $1 = routers list
+#
+DelDefaultRoutes() {
+ local router
+
+ if [ "$1" ]; then
+ LOG DelDefaultRoutes $*
+
+ for router in $1; do
+ LOG route del default gw $router
+ route del default gw $router
+ done
+ fi
+}
+
+# ping a host
+# args: $1 = dotquad address of the host
+#
+PingNode() {
+ LOG PingNode $*
+ if ping -q -c 1 -w 2 $1 ; then
+ return 0;
+ fi
+ return 1;
+}
+
+# Check (and add route, if alive) default routers
+# args: $1 = routers list
+# returns: 0 if at least one router is alive.
+#
+CheckRouterList() {
+ local router
+ local succeed=1
+
+ LOG CheckRouterList $*
+
+ for router in $1; do
+ if PingNode $router ; then
+ succeed=0
+ route add default gw $router
+ fi
+ done
+ return $succeed
+}
+
+# Delete/create static routes.
+# args: $1 = operation (del/add)
+# $2 = routes list in format "dst1 nexthop1 dst2 ..."
+#
+# BEWARE: this feature of DHCP is obsolete, because does not
+# support subnetting.
+#
+X-StaticRouteList() {
+ local op=$1
+ local lst="$2"
+ local masklen
+
+ LOG X-StaticRouteList $*
+
+ if [ "$lst" ]; then
+ set $lst
+ while [ $# -gt 1 ]; do
+ route $op -net $1 netmask `ABCMask "$1"` gw $2
+ shift; shift;
+ done
+ fi
+}
+
+# Create static routes.
+# arg: $1 = routes list in format "dst1 nexthop1 dst2 ..."
+#
+AddStaticRouteList() {
+ LOG AddStaticRouteList $*
+ X-StaticRouteList add "$1"
+}
+
+# Delete static routes.
+# arg: $1 = routes list in format "dst1 nexthop1 dst2 ..."
+#
+DelStaticRouteList() {
+ LOG DelStaticRouteList $*
+ X-StaticRouteList del "$1"
+}
+
+# Broadcast unsolicited ARP to update neighbours' caches.
+# args: $1 = interface
+# $2 = address
+#
+UnsolicitedARP() {
+ if [ -f /sbin/arping ]; then
+ /sbin/arping -A -c 1 -I "$1" "$2" &
+ (sleep 2 ; /sbin/arping -U -c 1 -I "$1" "$2" ) &
+ fi
+}
+
+# Duplicate address detection.
+# args: $1 = interface
+# $2 = test address
+# returns: 0, if DAD succeeded.
+DAD() {
+ if [ -f /sbin/arping ]; then
+ /sbin/arping -c 2 -w 3 -D -I "$1" "$2"
+ return $?
+ fi
+ return 0
+}
+
+
+# Setup resolver.
+# args: NO
+# domain and nameserver list are passed in global variables.
+#
+# NOTE: we try to be careful and not to break user supplied resolv.conf.
+# The script mangles it, only if it has dhcp magic signature.
+#
+UpdateDNS() {
+ local nameserver
+ local idstring="#### Generated by DHCPCD"
+
+ LOG UpdateDNS $*
+
+ if [ "$new_domain_name" = "" -a "$new_domain_name_servers" = "" ]; then
+ return 0; # lease carries no resolver data: nothing to do
+ fi
+
+ echo $idstring > /etc/resolv.conf.dhcp # first line is the signature checked below
+ if [ "$new_domain_name" ]; then
+ echo search $new_domain_name >> /etc/resolv.conf.dhcp
+ fi
+ echo options ndots:1 >> /etc/resolv.conf.dhcp
+
+ if [ "$new_domain_name_servers" ]; then
+ for nameserver in $new_domain_name_servers; do
+ echo nameserver $nameserver >> /etc/resolv.conf.dhcp
+ done
+ else
+ echo nameserver 127.0.0.1 >> /etc/resolv.conf.dhcp
+ fi
+
+ if [ -f /etc/resolv.conf ]; then
+ if [ "`head -1 /etc/resolv.conf`" != "$idstring" ]; then
+ return 0 # user supplied resolv.conf (no signature): do not touch it
+ fi
+ if [ "$old_domain_name" = "$new_domain_name" -a \
+ "$new_domain_name_servers" = "$old_domain_name_servers" ]; then
+ return 0 # same domain and servers as the old lease: keep current file
+ fi
+ fi
+ mv /etc/resolv.conf.dhcp /etc/resolv.conf
+}
+
+case $reason in
+NBI)
+ exit 1
+ ;;
+
+MEDIUM)
+ exit 0
+ ;;
+
+PREINIT)
+ ifconfig $interface:dhcp down
+ ifconfig $interface:dhcp1 down
+ if [ -d /proc/sys/net/ipv4/conf/$interface ]; then
+ ifconfig $interface:dhcp 10.10.10.10 netmask 255.255.255.255
+ ifconfig $interface:dhcp down
+ if [ -d /proc/sys/net/ipv4/conf/$interface ]; then
+ LOG The interface $interface already configured.
+ fi
+ fi
+ ifconfig $interface:dhcp up
+ exit 0
+ ;;
+
+ARPSEND)
+ exit 0
+ ;;
+
+ARPCHECK)
+ if DAD "$interface" "$check_ip_address" ; then
+ exit 0
+ fi
+ exit 1
+ ;;
+
+BOUND|RENEW|REBIND|REBOOT)
+ if [ "$old_ip_address" -a "$alias_ip_address" -a \
+ "$alias_ip_address" != "$old_ip_address" ]; then
+ DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ fi
+ if [ "$old_ip_address" -a "$old_ip_address" != "$new_ip_address" ]; then
+ DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp
+ DelDefaultRoutes "$old_routers"
+ DelStaticRouteList "$old_static_routes"
+ fi
+ if [ "$old_ip_address" = "" -o "$old_ip_address" != "$new_ip_address" -o \
+ "$reason" = "BOUND" -o "$reason" = "REBOOT" ]; then
+ AddINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp
+ AddStaticRouteList "$new_static_routes"
+ AddDefaultRoutes "$new_routers"
+ UnsolicitedARP "$interface" "$new_ip_address"
+ fi
+ if [ "$new_ip_address" != "$alias_ip_address" -a "$alias_ip_address" ]; then
+ AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ fi
+ UpdateDNS
+ exit 0
+ ;;
+
+EXPIRE|FAIL)
+ if [ "$alias_ip_address" ]; then
+ DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ fi
+ if [ "$old_ip_address" ]; then
+ DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp
+ DelDefaultRoutes "$old_routers"
+ DelStaticRouteList "$old_static_routes"
+ fi
+ if [ "$alias_ip_address" ]; then
+ AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ fi
+ exit 0
+ ;;
+
+TIMEOUT)
+ if [ "$alias_ip_address" ]; then
+ DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ fi
+# Seems, <null address> means, that no more old leases found.
+# Or does it mean bug in dhcpcd? 8) Fail for now.
+ if [ "$new_ip_address" = "<null address>" ]; then
+ if [ "$old_ip_address" ]; then
+ DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp
+ fi
+ if [ "$alias_ip_address" ]; then
+ AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ fi
+ exit 1
+ fi
+ if DAD "$interface" "$new_ip_address" ; then
+ AddINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp
+ UnsolicitedARP "$interface" "$new_ip_address"
+ if [ "$alias_ip_address" -a "$alias_ip_address" != "$new_ip_address" ]; then
+ AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ UnsolicitedARP "$interface" "$alias_ip_address"
+ fi
+ if CheckRouterList "$new_routers" ; then
+ AddStaticRouteList "$new_static_routes"
+ UpdateDNS
+ exit 0
+ fi
+ fi
+ DelINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp
+ DelDefaultRoutes "$old_routers"
+ DelStaticRouteList "$old_static_routes"
+ if [ "$alias_ip_address" ]; then
+ AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
+ fi
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/examples/diffserv/Edge1 b/examples/diffserv/Edge1
index e69de29b..4ddffdd1 100644
--- a/examples/diffserv/Edge1
+++ b/examples/diffserv/Edge1
@@ -0,0 +1,68 @@
+#! /bin/sh -x
+#
+# sample script on using the ingress capabilities
+# This script just tags on the ingress interface using Ipchains
+# the result is used for fast classification and re-marking
+# on the egress interface
+#
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
+INDEV=eth2
+EGDEV="dev eth1"
+#
+# tag all incoming packets from host 10.2.0.24 to value 1
+# tag all incoming packets from host 10.2.0.3 to value 2
+# tag the rest of incoming packets from subnet 10.2.0.0/24 to value 3
+#These values are used in the egress
+#
+############################################################
+$IPCHAINS -A input -s 10.2.0.0/24 -m 3
+$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1
+$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2
+
+######################## Egress side ########################
+
+
+# attach a dsmarker
+#
+$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 set_tc_index
+#
+# values of the DSCP to change depending on the class
+#
+#becomes EF
+$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
+ value 0xb8
+#becomes AF11
+$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
+ value 0x28
+#becomes AF21
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x48
+#
+#
+# The class mapping
+#
+$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 1 fw classid 1:1
+$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 2 fw classid 1:2
+$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 3 fw classid 1:3
+#
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent 1:0
+
+echo "---- qdisc parameters Egress ----------"
+$TC qdisc ls $EGDEV
+echo "---- Class parameters Egress ----------"
+$TC class ls $EGDEV
+echo "---- filter parameters Egress ----------"
+$TC filter ls $EGDEV parent 1:0
diff --git a/examples/diffserv/Edge2 b/examples/diffserv/Edge2
index e69de29b..2f78da24 100644
--- a/examples/diffserv/Edge2
+++ b/examples/diffserv/Edge2
@@ -0,0 +1,87 @@
+#! /bin/sh -x
+#
+# sample script on using the ingress capabilities
+# This script tags the fwmark on the ingress interface using IPchains
+# the result is used first for policing on the Ingress interface then
+# for fast classification and re-marking
+# on the egress interface
+#
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
+INDEV=eth2
+EGDEV="dev eth1"
+#
+# tag all incoming packets from host 10.2.0.24 to value 1
+# tag all incoming packets from host 10.2.0.3 to value 2
+# tag the rest of incoming packets from subnet 10.2.0.0/24 to value 3
+#These values are used in the egress
+############################################################
+$IPCHAINS -A input -s 10.2.0.0/24 -m 3
+$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1
+$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2
+############################################################
+#
+# install the ingress qdisc on the ingress interface
+############################################################
+$TC qdisc add dev $INDEV handle ffff: ingress
+############################################################
+
+#
+# attach a fw classifier to the ingress which polices anything marked
+# by ipchains to tag value 3 (The rest of the subnet packets -- not
+# tag 1 or 2) to not go beyond 1.5Mbps
+# Allow up to at least 60 packets to burst (assuming maximum packet
+# size of 1.5 KB) in the long run and up to about 6 packets in the
+# short run
+
+############################################################
+$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 3 fw \
+police rate 1500kbit burst 90k mtu 9k drop flowid :1
+############################################################
+
+######################## Egress side ########################
+
+
+# attach a dsmarker
+#
+$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
+#
+# values of the DSCP to change depending on the class
+#
+$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
+ value 0xb8
+$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
+ value 0x28
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x48
+#
+#
+# The class mapping
+#
+$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 1 fw classid 1:1
+$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 2 fw classid 1:2
+$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 3 fw classid 1:3
+#
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent ffff:
+
+echo "---- qdisc parameters Egress ----------"
+$TC qdisc ls $EGDEV
+echo "---- Class parameters Egress ----------"
+$TC class ls $EGDEV
+echo "---- filter parameters Egress ----------"
+$TC filter ls $EGDEV parent 1:0
+#
+#deleting the ingress qdisc
+#$TC qdisc del dev $INDEV ingress
diff --git a/examples/diffserv/Edge31-ca-u32 b/examples/diffserv/Edge31-ca-u32
index e69de29b..25e6c0b1 100644
--- a/examples/diffserv/Edge31-ca-u32
+++ b/examples/diffserv/Edge31-ca-u32
@@ -0,0 +1,170 @@
+#! /bin/sh -x
+#
+# sample script on using the ingress capabilities using u32 classifier
+# This script tags tcindex based on metering on the ingress
+# interface the result is used for fast classification and re-marking
+# on the egress interface
+# This is an example of a color aware mode marker with PIR configured
+# based on draft-wahjak-mcm-00.txt (section 3.1)
+#
+# The colors are defined using the Diffserv Fields
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/usr/src/iproute2-current
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+INDEV=eth0
+EGDEV="dev eth1"
+CIR1=1500kbit
+CIR2=1000kbit
+
+#The CBS is about 60 MTU sized packets
+CBS1=90k
+CBS2=90k
+
+############################################################
+#
+# install the ingress qdisc on the ingress interface
+$TC qdisc add dev $INDEV handle ffff: ingress
+############################################################
+#
+# Create u32 filters
+$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1: u32 \
+divisor 1
+############################################################
+
+# The meters: Note that we have shared meters in this case as identified
+# by the index parameter
+meter1=" police index 1 rate $CIR1 burst $CBS1 "
+meter2=" police index 2 rate $CIR2 burst $CBS1 "
+meter3=" police index 3 rate $CIR2 burst $CBS2 "
+meter4=" police index 4 rate $CIR1 burst $CBS2 "
+meter5=" police index 5 rate $CIR1 burst $CBS2 "
+
+# All packets are marked with a tcindex value which is used on the egress
+# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
+
+# *********************** AF41 ***************************
+#AF41 (DSCP 0x22) is passed on with a tcindex value 1
+#if it doesnt exceed its CIR/CBS
+#policer 1 is used.
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \
+match ip tos 0x88 0xfc \
+$meter1 \
+continue flowid :1
+#
+# if it exceeds the above but not the extra rate/burst below, it gets a
+# tcindex value of 2
+# policer 2 is used
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
+match ip tos 0x88 0xfc \
+$meter2 \
+continue flowid :2
+#
+# if it exceeds the above but not the rule below, it gets a tcindex value
+# of 3 (policer 3)
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
+match ip tos 0x88 0xfc \
+$meter3 \
+drop flowid :3
+#
+
+# *********************** AF42 ***************************
+#AF42 (DSCP 0x24) is passed on with a tcindex value 2
+#if it doesnt exceed its CIR/CBS
+#policer 2 is used. Note that this is shared with the AF41
+#
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
+match ip tos 0x90 0xfc \
+$meter2 \
+continue flowid :2
+#
+# if it exceeds the above but not the rule below, it gets a tcindex value
+# of 3 (policer 3)
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
+match ip tos 0x90 0xfc \
+$meter3 \
+drop flowid :3
+#
+# *********************** AF43 ***************************
+#
+#AF43 (DSCP 0x26) is passed on with a tcindex value 3
+#if it doesnt exceed its CIR/CBS
+#policer 3 is used. Note that this is shared with the AF41 and AF42
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
+match ip tos 0x98 0xfc \
+$meter3 \
+drop flowid :3
+#
+# *********************** BE ***************************
+#
+# Anything else (not from the AF4*) gets discarded if it
+# exceeds 1Mbps and by default goes to BE if it doesnt
+# Note that the BE class is also used by the AF4* in the worst
+# case
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 7 u32 \
+match ip src 0/0\
+$meter4 \
+drop flowid :4
+
+######################## Egress side ########################
+
+# attach a dsmarker
+#
+$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
+#
+# values of the DSCP to change depending on the class
+#note that the ECN bits are masked out
+#
+#AF41 (0x88 is 0x22 shifted to the left by two bits)
+#
+$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
+ value 0x88
+#AF42
+$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
+ value 0x90
+#AF43
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x98
+#BE
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x0
+#
+#
+# The class mapping
+#
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 1 tcindex classid 1:1
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 2 tcindex classid 1:2
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 3 tcindex classid 1:3
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 4 tcindex classid 1:4
+#
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent ffff:
+
+echo "---- qdisc parameters Egress ----------"
+$TC qdisc ls $EGDEV
+echo "---- Class parameters Egress ----------"
+$TC class ls $EGDEV
+echo "---- filter parameters Egress ----------"
+$TC filter ls $EGDEV parent 1:0
+#
+#deleting the ingress qdisc
+#$TC qdisc del dev $INDEV ingress
diff --git a/examples/diffserv/Edge31-cb-chains b/examples/diffserv/Edge31-cb-chains
index e69de29b..d7faae98 100644
--- a/examples/diffserv/Edge31-cb-chains
+++ b/examples/diffserv/Edge31-cb-chains
@@ -0,0 +1,132 @@
+#! /bin/sh -x
+#
+# sample script on using the ingress capabilities
+# This script fwmark tags(IPchains) based on metering on the ingress
+# interface the result is used for fast classification and re-marking
+# on the egress interface
+# This is an example of a color blind mode marker with no PIR configured
+# based on draft-wahjak-mcm-00.txt (section 3.1)
+#
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
+INDEV=eth2
+EGDEV="dev eth1"
+CIR1=1500kbit
+CIR2=1000kbit
+
+#The CBS is about 60 MTU sized packets
+CBS1=90k
+CBS2=90k
+
+meter1="police rate $CIR1 burst $CBS1 "
+meter2="police rate $CIR1 burst $CBS2 "
+meter3="police rate $CIR2 burst $CBS1 "
+meter4="police rate $CIR2 burst $CBS2 "
+meter5="police rate $CIR2 burst $CBS2 "
+#
+# tag the rest of incoming packets from subnet 10.2.0.0/24 to fw value 1
+# tag all incoming packets from any other subnet to fw tag 2
+############################################################
+$IPCHAINS -A input -i $INDEV -s 0/0 -m 2
+$IPCHAINS -A input -i $INDEV -s 10.2.0.0/24 -m 1
+#
+############################################################
+# install the ingress qdisc on the ingress interface
+$TC qdisc add dev $INDEV handle ffff: ingress
+#
+############################################################
+
+# All packets are marked with a tcindex value which is used on the egress
+# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
+#
+############################################################
+#
+# anything with fw tag of 1 is passed on with a tcindex value 1
+#if it doesnt exceed its allocated rate (CIR/CBS)
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1 fw \
+$meter1 \
+continue flowid 4:1
+#
+# if it exceeds the above but not the extra rate/burst below, it gets a
+#tcindex value of 2
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 5 handle 1 fw \
+$meter2 \
+continue flowid 4:2
+#
+# if it exceeds the above but not the rule below, it gets a tcindex value
+# of 3
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 1 fw \
+$meter3 \
+drop flowid 4:3
+#
+# Anything else (not from the subnet 10.2.0.0/24) gets discarded if it
+# exceeds 1Mbps and by default goes to BE if it doesnt
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 2 fw \
+$meter5 \
+drop flowid 4:4
+
+
+######################## Egress side ########################
+
+
+# attach a dsmarker
+#
+$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
+#
+# values of the DSCP to change depending on the class
+#note that the ECN bits are masked out
+#
+#AF41 (0x88 is 0x22 shifted to the left by two bits)
+#
+$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
+ value 0x88
+#AF42
+$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
+ value 0x90
+#AF43
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x98
+#BE
+$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
+ value 0x0
+#
+#
+# The class mapping (using tcindex; could easily have
+# replaced it with the fw classifier instead)
+#
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 1 tcindex classid 1:1
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 2 tcindex classid 1:2
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 3 tcindex classid 1:3
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 4 tcindex classid 1:4
+#
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent ffff:
+
+echo "---- qdisc parameters Egress ----------"
+$TC qdisc ls $EGDEV
+echo "---- Class parameters Egress ----------"
+$TC class ls $EGDEV
+echo "---- filter parameters Egress ----------"
+$TC filter ls $EGDEV parent 1:0
+#
+#deleting the ingress qdisc
+#$TC qdisc del dev $INDEV ingress
diff --git a/examples/diffserv/Edge32-ca-u32 b/examples/diffserv/Edge32-ca-u32
index e69de29b..edf21e43 100644
--- a/examples/diffserv/Edge32-ca-u32
+++ b/examples/diffserv/Edge32-ca-u32
@@ -0,0 +1,198 @@
+#! /bin/sh -x
+#
+# sample script on using the ingress capabilities using u32 classifier
+# This script tags tcindex based on metering on the ingress
+# interface the result is used for fast classification and re-marking
+# on the egress interface
+# This is an example of a color aware mode marker with PIR configured
+# based on draft-wahjak-mcm-00.txt (section 3.2)
+#
+# The colors are defined using the Diffserv Fields
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
+INDEV=eth2
+EGDEV="dev eth1"
+CIR1=1000kbit
+CIR2=500kbit
+# the PIR is what is in excess of the CIR
+PIR1=1000kbit
+PIR2=500kbit
+
+#The CBS is about 60 MTU sized packets
+CBS1=90k
+CBS2=90k
+#the EBS is about 20 max sized packets
+EBS1=30k
+EBS2=30k
+
+# The meters: Note that we have shared meters in this case as identified
+# by the index parameter
+meter1=" police index 1 rate $CIR1 burst $CBS1 "
+meter1a=" police index 2 rate $PIR1 burst $EBS1 "
+meter2=" police index 3 rate $CIR2 burst $CBS1 "
+meter2a=" police index 4 rate $PIR2 burst $EBS1 "
+meter3=" police index 5 rate $CIR2 burst $CBS2 "
+meter3a=" police index 6 rate $PIR2 burst $EBS2 "
+meter4=" police index 7 rate $CIR1 burst $CBS2 "
+
+############################################################
+#
+# install the ingress qdisc on the ingress interface
+$TC qdisc add dev $INDEV handle ffff: ingress
+############################################################
+#
+# All packets are marked with a tcindex value which is used on the egress
+# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
+#
+# *********************** AF41 ***************************
+#AF41 (DSCP 0x22) is passed on with a tcindex value 1
+#if it doesnt exceed its CIR/CBS + PIR/EBS
+#policer 1 is used.
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 1 u32 \
+match ip tos 0x88 0xfc \
+$meter1 \
+continue flowid :1
+$TC filter add dev $INDEV parent ffff: protocol ip prio 2 u32 \
+match ip tos 0x88 0xfc \
+$meter1a \
+continue flowid :1
+#
+# if it exceeds the above but not the extra rate/burst below, it gets a
+# tcindex value of 2
+# policer 2 is used
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 3 u32 \
+match ip tos 0x88 0xfc \
+$meter2 \
+continue flowid :2
+$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \
+match ip tos 0x88 0xfc \
+$meter2a \
+continue flowid :2
+#
+# if it exceeds the above but not the rule below, it gets a tcindex value
+# of 3 (policer 3)
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
+match ip tos 0x88 0xfc \
+$meter3 \
+continue flowid :3
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
+match ip tos 0x88 0xfc \
+$meter3a \
+drop flowid :3
+#
+# *********************** AF42 ***************************
+#AF42 (DSCP 0x24) is passed on with a tcindex value 2
+#if it doesnt exceed its CIR/CBS + PIR/EBS
+#policer 2 is used. Note that this is shared with the AF41
+#
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 8 u32 \
+match ip tos 0x90 0xfc \
+$meter2 \
+continue flowid :2
+$TC filter add dev $INDEV parent ffff: protocol ip prio 9 u32 \
+match ip tos 0x90 0xfc \
+$meter2a \
+continue flowid :2
+#
+# if it exceeds the above but not the rule below, it gets a tcindex value
+# of 3 (policer 3)
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 10 u32 \
+match ip tos 0x90 0xfc \
+$meter3 \
+continue flowid :3
+$TC filter add dev $INDEV parent ffff: protocol ip prio 11 u32 \
+match ip tos 0x90 0xfc \
+$meter3a \
+drop flowid :3
+
+#
+# *********************** AF43 ***************************
+#
+#AF43 (DSCP 0x26) is passed on with a tcindex value 3
+#if it doesnt exceed its CIR/CBS + PIR/EBS
+#policer 3 is used. Note that this is shared with the AF41 and AF42
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 13 u32 \
+match ip tos 0x98 0xfc \
+$meter3 \
+continue flowid :3
+$TC filter add dev $INDEV parent ffff: protocol ip prio 14 u32 \
+match ip tos 0x98 0xfc \
+$meter3a \
+drop flowid :3
+#
+## *********************** BE ***************************
+##
+## Anything else (not from the AF4*) gets discarded if it
+## exceeds 1Mbps and by default goes to BE if it doesnt
+## Note that the BE class is also used by the AF4* in the worst
+## case
+##
+$TC filter add dev $INDEV parent ffff: protocol ip prio 16 u32 \
+match ip src 0/0\
+$meter4 \
+drop flowid :4
+
+######################## Egress side ########################
+
+# attach a dsmarker
+#
+$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
+#
+# values of the DSCP to change depending on the class
+#note that the ECN bits are masked out
+#
+#AF41 (0x88 is 0x22 shifted to the left by two bits)
+#
+$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
+ value 0x88
+#AF42
+$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
+ value 0x90
+#AF43
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x98
+#BE
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x0
+#
+#
+# The class mapping
+#
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 1 tcindex classid 1:1
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 2 tcindex classid 1:2
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 3 tcindex classid 1:3
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 4 tcindex classid 1:4
+#
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent ffff:
+
+echo "---- qdisc parameters Egress ----------"
+$TC qdisc ls $EGDEV
+echo "---- Class parameters Egress ----------"
+$TC class ls $EGDEV
+echo "---- filter parameters Egress ----------"
+$TC filter ls $EGDEV parent 1:0
+#
+#deleting the ingress qdisc
+#$TC qdisc del dev $INDEV ingress
diff --git a/examples/diffserv/Edge32-cb-chains b/examples/diffserv/Edge32-cb-chains
index e69de29b..804fad19 100644
--- a/examples/diffserv/Edge32-cb-chains
+++ b/examples/diffserv/Edge32-cb-chains
@@ -0,0 +1,144 @@
+#! /bin/sh -x
+#
+# sample script on using the ingress capabilities
+# This script fwmark tags(IPchains) based on metering on the ingress
+# interface the result is used for fast classification and re-marking
+# on the egress interface
+# This is an example of a color blind mode marker with no PIR configured
+# based on draft-wahjak-mcm-00.txt (section 3.1)
+#
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
+INDEV=eth2
+EGDEV="dev eth1"
+CIR1=1500kbit
+CIR2=500kbit
+
+#The CBS is about 60 MTU sized packets
+CBS1=90k
+CBS2=90k
+
+meter1="police rate $CIR1 burst $CBS1 "
+meter1a="police rate $CIR2 burst $CBS1 "
+meter2="police rate $CIR1 burst $CBS2 "
+meter2a="police rate $CIR2 burst $CBS2 "
+meter3="police rate $CIR2 burst $CBS1 "
+meter3a="police rate $CIR2 burst $CBS1 "
+meter4="police rate $CIR2 burst $CBS2 "
+meter5="police rate $CIR1 burst $CBS2 "
+#
+# tag the rest of incoming packets from subnet 10.2.0.0/24 to fw value 1
+# tag all incoming packets from any other subnet to fw tag 2
+############################################################
+$IPCHAINS -A input -i $INDEV -s 0/0 -m 2
+$IPCHAINS -A input -i $INDEV -s 10.2.0.0/24 -m 1
+#
+############################################################
+# install the ingress qdisc on the ingress interface
+$TC qdisc add dev $INDEV handle ffff: ingress
+#
+############################################################
+
+# All packets are marked with a tcindex value which is used on the egress
+# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
+#
+############################################################
+#
+# anything with fw tag of 1 is passed on with a tcindex value 1
+#if it doesnt exceed its allocated rate (CIR/CBS)
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 1 handle 1 fw \
+$meter1 \
+continue flowid 4:1
+$TC filter add dev $INDEV parent ffff: protocol ip prio 2 handle 1 fw \
+$meter1a \
+continue flowid 4:1
+#
+# if it exceeds the above but not the extra rate/burst below, it gets a
+#tcindex value of 2
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 3 handle 1 fw \
+$meter2 \
+continue flowid 4:2
+$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1 fw \
+$meter2a \
+continue flowid 4:2
+#
+# if it exceeds the above but not the rule below, it gets a tcindex value
+# of 3
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 5 handle 1 fw \
+$meter3 \
+continue flowid 4:3
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 1 fw \
+$meter3a \
+drop flowid 4:3
+#
+# Anything else (not from the subnet 10.2.0.0/24) gets discarded if it
+# exceeds CIR1 (1500kbit) and by default goes to BE if it doesn't
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 7 handle 2 fw \
+$meter5 \
+drop flowid 4:4
+
+
+######################## Egress side ########################
+
+
+# attach a dsmarker
+#
+$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
+#
+# values of the DSCP to change depending on the class
+#note that the ECN bits are masked out
+#
+#AF41 (0x88 is 0x22 shifted to the left by two bits)
+#
+$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
+ value 0x88
+#AF42
+$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
+ value 0x90
+#AF43
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x98
+#BE
+$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
+ value 0x0
+#
+#
+# The class mapping (using tcindex; could easily have
+# replaced it with the fw classifier instead)
+#
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 1 tcindex classid 1:1
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 2 tcindex classid 1:2
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 3 tcindex classid 1:3
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 4 tcindex classid 1:4
+#
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent ffff:
+
+echo "---- qdisc parameters Egress ----------"
+$TC qdisc ls $EGDEV
+echo "---- Class parameters Egress ----------"
+$TC class ls $EGDEV
+echo "---- filter parameters Egress ----------"
+$TC filter ls $EGDEV parent 1:0
+#
+#deleting the ingress qdisc
+#$TC qdisc del $INDEV ingress
diff --git a/examples/diffserv/Edge32-cb-u32 b/examples/diffserv/Edge32-cb-u32
index e69de29b..cc2ebb40 100644
--- a/examples/diffserv/Edge32-cb-u32
+++ b/examples/diffserv/Edge32-cb-u32
@@ -0,0 +1,145 @@
+#! /bin/sh
+#
+# sample script on using the ingress capabilities using u32 classifier
+# This script tags tcindex based on metering on the ingress
+# interface the result is used for fast classification and re-marking
+# on the egress interface
+# This is an example of a color blind mode marker with PIR configured
+# based on draft-wahjak-mcm-00.txt (section 3.2)
+#
+#path to various utilities;
+#change to reflect yours.
+#
+IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
+TC=$IPROUTE/tc/tc
+IP=$IPROUTE/ip/ip
+INDEV=eth2
+EGDEV="dev eth1"
+CIR1=1000kbit
+CIR2=1000kbit
+# The PIR is the excess (in addition to the CIR i.e if always
+# going to the PIR --> average rate is CIR+PIR)
+PIR1=1000kbit
+PIR2=500kbit
+
+#The CBS is about 60 MTU sized packets
+CBS1=90k
+CBS2=90k
+#the EBS is about 10 max sized packets
+EBS1=15k
+EBS2=15k
+# The meters
+meter1=" police rate $CIR1 burst $CBS1 "
+meter1a=" police rate $PIR1 burst $EBS1 "
+meter2=" police rate $CIR2 burst $CBS1 "
+meter2a="police rate $PIR2 burst $CBS1 "
+meter3=" police rate $CIR2 burst $CBS2 "
+meter3a=" police rate $PIR2 burst $EBS2 "
+meter4=" police rate $CIR1 burst $CBS2 "
+meter5=" police rate $CIR1 burst $CBS2 "
+
+
+# install the ingress qdisc on the ingress interface
+############################################################
+$TC qdisc add dev $INDEV handle ffff: ingress
+############################################################
+#
+############################################################
+
+# All packets are marked with a tcindex value which is used on the egress
+# NOTE: tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
+#
+#anything from subnet 10.2.0.0/24 is passed on with a tcindex value 1
+#if it doesnt exceed its CIR/CBS + PIR/EBS
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 1 u32 \
+match ip src 10.2.0.0/24 $meter1 \
+continue flowid :1
+$TC filter add dev $INDEV parent ffff: protocol ip prio 2 u32 \
+match ip src 10.2.0.0/24 $meter1a \
+continue flowid :1
+
+#
+# if it exceeds the above but not the extra rate/burst below, it gets a
+#tcindex value of 2
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 3 u32 \
+match ip src 10.2.0.0/24 $meter2 \
+continue flowid :2
+$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \
+match ip src 10.2.0.0/24 $meter2a \
+continue flowid :2
+#
+# if it exceeds the above but not the rule below, it gets a tcindex value
+# of 3
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
+match ip src 10.2.0.0/24 $meter3 \
+continue flowid :3
+$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
+match ip src 10.2.0.0/24 $meter3a \
+drop flowid :3
+#
+#
+# Anything else (not from the subnet 10.2.0.0/24) gets discarded if it
+# exceeds 1Mbps and by default goes to BE if it doesnt
+#
+$TC filter add dev $INDEV parent ffff: protocol ip prio 7 u32 \
+match ip src 0/0 $meter5 \
+drop flowid :4
+
+
+######################## Egress side ########################
+
+
+# attach a dsmarker
+#
+$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
+#
+# values of the DSCP to change depending on the class
+#note that the ECN bits are masked out
+#
+#AF41 (0x88 is 0x22 shifted to the left by two bits)
+#
+$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
+ value 0x88
+#AF42
+$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
+ value 0x90
+#AF43
+$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
+ value 0x98
+#BE (tcindex 4 -> class 1:4; 1:3 is AF43 and must not be overwritten)
+$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
+ value 0x0
+#
+#
+# The class mapping
+#
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 1 tcindex classid 1:1
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 2 tcindex classid 1:2
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 3 tcindex classid 1:3
+$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
+ handle 4 tcindex classid 1:4
+#
+
+#
+echo "---- qdisc parameters Ingress ----------"
+$TC qdisc ls dev $INDEV
+echo "---- Class parameters Ingress ----------"
+$TC class ls dev $INDEV
+echo "---- filter parameters Ingress ----------"
+$TC filter ls dev $INDEV parent ffff:
+
+echo "---- qdisc parameters Egress ----------"
+$TC qdisc ls $EGDEV
+echo "---- Class parameters Egress ----------"
+$TC class ls $EGDEV
+echo "---- filter parameters Egress ----------"
+$TC filter ls $EGDEV parent 1:0
+#
+#deleting the ingress qdisc
+#$TC qdisc del $INDEV ingress
diff --git a/examples/diffserv/README b/examples/diffserv/README
index e69de29b..ec91d632 100644
--- a/examples/diffserv/README
+++ b/examples/diffserv/README
@@ -0,0 +1,98 @@
+
+Note all these are mere examples which can be customized to your needs
+
+AFCBQ
+-----
+AF PHB built using CBQ, DSMARK, GRED (default in GRIO mode), RED for BE,
+and the tcindex classifier with some algorithmic mapping
+
+EFCBQ
+-----
+EF PHB built using CBQ (for rate control and prioritization),
+DSMARK( to remark DSCPs), tcindex classifier and RED for the BE
+traffic.
+
+EFPRIO
+------
+EF PHB using the PRIO scheduler, Token Bucket to rate control EF,
+tcindex classifier, DSMARK to remark, and RED for the BE traffic
+
+EDGE scripts
+==============
+
+CB-3(1|2)-(u32/chains)
+======================
+
+
+The major differences are that the classifier is u32 on -u32 extension
+and IPchains on the chains extension. CB stands for color Blind
+and 31 is for the mode where only a CIR and CBS are defined whereas
+32 stands for a mode where a CIR/CBS + PIR/EBS are defined.
+
+Color Blind (CB)
+================
+We look at one special subnet that we are interested in for simplicity
+reasons to demonstrate the capability. We send the packets from that
+subnet to AF4*, BE or end up dropping depending on the metering results.
+
+
+The algorithm overview is as follows:
+
+*classify:
+
+**case: subnet X
+----------------
+ if !exceed meter1 tag as AF41
+ else
+ if !exceed meter2 tag as AF42
+ else
+ if !exceed meter 3 tag as AF43
+ else
+ drop
+
+default case: Any other subnet
+-------------------------------
+ if !exceed meter 5 tag as BE
+ else
+ drop
+
+
+On the Egress side, change the DSCPs of the packets to reflect AF4* and BE
+based on the tags from the ingress.
+
+-------------------------------------------------------------
+
+Color Aware
+===========
+
+Define some meters with + policing and give them IDs eg
+
+meter1=police index 1 rate $CIR1 burst $CBS1
+meter2=police index 2 rate $CIR2 burst $CBS2 etc
+
+General overview:
+classify based on the DSCPs and use the policer ids to decide tagging
+
+
+*classify on ingress:
+
+switch (dscp) {
+ case AF41: /* tos&0xfc == 0x88 */
+ if (!exceed meter1) break;
+ case AF42: /* tos&0xfc == 0x90 */
+ if (!exceed meter2) {
+ tag as AF42;
+ break;
+ }
+ case AF43: /* tos&0xfc == 0x98 */
+ if (!exceed meter3) {
+ tag as AF43;
+ break;
+ } else
+ drop;
+ default:
+ if (!exceed meter4) tag as BE;
+ else drop;
+}
+
+On the Egress side mark the proper AF tags
diff --git a/examples/diffserv/afcbq b/examples/diffserv/afcbq
index e69de29b..10d6d934 100644
--- a/examples/diffserv/afcbq
+++ b/examples/diffserv/afcbq
@@ -0,0 +1,105 @@
+#!/usr/bin/perl
+#
+#
+# AF using CBQ for a single interface eth0
+# 4 AF classes using GRED and one BE using RED
+# Things you might want to change:
+# - the device bandwidth (set at 10Mbits)
+# - the bandwidth allocated for each AF class and the BE class
+# - the drop probability associated with each AF virtual queue
+#
+# AF DSCP values used (based on AF draft 04)
+# -----------------------------------------
+# AF DSCP values
+# AF1 1. 0x0a 2. 0x0c 3. 0x0e
+# AF2 1. 0x12 2. 0x14 3. 0x16
+# AF3 1. 0x1a 2. 0x1c 3. 0x1e
+# AF4 1. 0x22 2. 0x24 3. 0x26
+
+#
+#
+# A simple DSCP-class relationship formula used to generate
+# values in the for loop of this script; $drop stands for the
+# DP
+# $dscp = ($class*8+$drop*2)
+#
+# if you use GRIO buffer sharing, then GRED priority is set as follows:
+# $gprio=$drop+1;
+#
+
+$TC = "/usr/src/iproute2-current/tc/tc";
+$DEV = "dev lo";
+$DEV = "dev eth1";
+$DEV = "dev eth0";
+# the BE-class number
+$beclass = "5";
+
+#GRIO buffer sharing on or off?
+$GRIO = "";
+$GRIO = "grio";
+# The bandwidth of your device
+$linerate="10Mbit";
+# The BE and AF rates
+%rate_table=();
+$berate="1500Kbit";
+$rate_table{"AF1rate"}="1500Kbit";
+$rate_table{"AF2rate"}="1500Kbit";
+$rate_table{"AF3rate"}="1500Kbit";
+$rate_table{"AF4rate"}="1500Kbit";
+#
+#
+#
+print "\n# --- General setup ---\n";
+print "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n";
+print "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex mask 0xfc " .
+ "shift 2 pass_on\n";
+ #"shift 2\n";
+print "$TC qdisc add $DEV parent 1:0 handle 2:0 cbq bandwidth $linerate ".
+ "cell 8 avpkt 1000 mpu 64\n";
+print "$TC filter add $DEV parent 2:0 protocol ip prio 1 tcindex ".
+ "mask 0xf0 shift 4 pass_on\n";
+for $class (1..4) {
+ print "\n# --- AF Class $class specific setup---\n";
+ $AFrate=sprintf("AF%drate",$class);
+ print "$TC class add $DEV parent 2:0 classid 2:$class cbq ".
+ "bandwidth $linerate rate $rate_table{$AFrate} avpkt 1000 prio ".
+ (6-$class)." bounded allot 1514 weight 1 maxburst 21\n";
+ print "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle $class ".
+ "tcindex classid 2:$class\n";
+ print "$TC qdisc add $DEV parent 2:$class gred setup DPs 3 default 2 ".
+ "$GRIO\n";
+#
+# per DP setup
+#
+ for $drop (1..3) {
+ print "\n# --- AF Class $class DP $drop---\n";
+ $dscp = $class*8+$drop*2;
+ $tcindex = sprintf("1%x%x",$class,$drop);
+ print "$TC filter add $DEV parent 1:0 protocol ip prio 1 ".
+ "handle $dscp tcindex classid 1:$tcindex\n";
+ $prob = $drop*0.02;
+ if ($GRIO) {
+ $gprio = $drop+1;
+ print "$TC qdisc change $DEV parent 2:$class gred limit 60KB min 15KB ".
+ "max 45KB burst 20 avpkt 1000 bandwidth $linerate DP $drop ".
+ "probability $prob ".
+ "prio $gprio\n";
+ } else {
+ print "$TC qdisc change $DEV parent 2:$class gred limit 60KB min 15KB ".
+ "max 45KB burst 20 avpkt 1000 bandwidth $linerate DP $drop ".
+ "probability $prob \n";
+ }
+ }
+}
+#
+#
+print "\n#------BE Queue setup------\n";
+print "$TC filter add $DEV parent 1:0 protocol ip prio 2 ".
+ "handle 0 tcindex mask 0 classid 1:1\n";
+print "$TC class add $DEV parent 2:0 classid 2:$beclass cbq ".
+ "bandwidth $linerate rate $berate avpkt 1000 prio 6 " .
+ "bounded allot 1514 weight 1 maxburst 21 \n";
+print "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle 0 tcindex ".
+ "classid 2:$beclass\n";
+print "$TC qdisc add $DEV parent 2:$beclass red limit 60KB min 15KB max 45KB ".
+ "burst 20 avpkt 1000 bandwidth $linerate probability 0.4\n";
diff --git a/examples/diffserv/ef-prio b/examples/diffserv/ef-prio
index e69de29b..48611bdd 100644
--- a/examples/diffserv/ef-prio
+++ b/examples/diffserv/ef-prio
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc";
+$DEV = "dev eth1";
+$efrate="1.5Mbit";
+$MTU="1.5kB";
+print "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n";
+print "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex ".
+ "mask 0xfc shift 2\n";
+print "$TC qdisc add $DEV parent 1:0 handle 2:0 prio\n";
+#
+# EF class: Maximum about one MTU sized packet allowed on the queue
+#
+print "$TC qdisc add $DEV parent 2:1 tbf rate $efrate burst $MTU limit 1.6kB\n";
+print "$TC filter add $DEV parent 2:0 protocol ip prio 1 ".
+ "handle 0x2e tcindex classid 2:1 pass_on\n";
+#
+# BE class
+#
+print "#BE class(2:2) \n";
+print "$TC qdisc add $DEV parent 2:2 red limit 60KB ".
+ "min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit ".
+ "probability 0.4\n";
+#
+print "$TC filter add $DEV parent 2:0 protocol ip prio 2 ".
+ "handle 0 tcindex mask 0 classid 2:2 pass_on\n";
diff --git a/examples/diffserv/efcbq b/examples/diffserv/efcbq
index e69de29b..bcc437b3 100644
--- a/examples/diffserv/efcbq
+++ b/examples/diffserv/efcbq
@@ -0,0 +1,31 @@
+#!/usr/bin/perl
+#
+$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc";
+$DEV = "dev eth1";
+print "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n";
+print "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex ".
+ "mask 0xfc shift 2\n";
+print "$TC qdisc add $DEV parent 1:0 handle 2:0 cbq bandwidth ".
+ "10Mbit cell 8 avpkt 1000 mpu 64\n";
+#
+# EF class
+#
+print "$TC class add $DEV parent 2:0 classid 2:1 cbq bandwidth ".
+ "10Mbit rate 1500Kbit avpkt 1000 prio 1 bounded isolated ".
+ "allot 1514 weight 1 maxburst 10 \n";
+# packet fifo for EF?
+print "$TC qdisc add $DEV parent 2:1 pfifo limit 5\n";
+print "$TC filter add $DEV parent 2:0 protocol ip prio 1 ".
+ "handle 0x2e tcindex classid 2:1 pass_on\n";
+#
+# BE class
+#
+print "#BE class(2:2) \n";
+print "$TC class add $DEV parent 2:0 classid 2:2 cbq bandwidth ".
+ "10Mbit rate 5Mbit avpkt 1000 prio 7 allot 1514 weight 1 ".
+ "maxburst 21 borrow split 2:0 defmap 0xffff \n";
+print "$TC qdisc add $DEV parent 2:2 red limit 60KB ".
+ "min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit ".
+ "probability 0.4\n";
+print "$TC filter add $DEV parent 2:0 protocol ip prio 2 ".
+ "handle 0 tcindex mask 0 classid 2:2 pass_on\n";
diff --git a/examples/diffserv/regression-testing b/examples/diffserv/regression-testing
index e69de29b..0ec705c0 100644
--- a/examples/diffserv/regression-testing
+++ b/examples/diffserv/regression-testing
@@ -0,0 +1,125 @@
+
+These were the tests done to validate the Diffserv scripts.
+This document will be updated continuously. If you do more
+thorough validation testing please post the details to the
+diffserv mailing list.
+Nevertheless, these tests should serve for basic validation.
+
+AFCBQ, EFCBQ, EFPRIO
+----------------------
+
+generate all possible DSCPs and observe that they
+get sent to the proper classes. In the case of AF also
+to the correct Virtual Queues.
+
+Edge1
+-----
+generate TOS values 0x0,0x10,0xbb each with IP addresses
+10.2.0.24 (mark 1), 10.2.0.3 (mark2) and 10.2.0.30 (mark 3)
+and observe that they get marked as expected.
+
+Edge2
+-----
+
+-Repeat the tests in Edge1
+-ftp with data direction from 10.2.0.2
+ *observe that the metering/policing works correctly (and the marking
+ as well). In this case the mark used will be 3
+
+Edge31-cb-chains
+----------------
+
+-ftp with data direction from 10.2.0.2
+
+ *observe that the metering/policing works correctly (and the marking
+ as well). In this case the mark used will be 1.
+
+ Metering: The data throughput should not exceed 2*CIR1 + 2*CIR2
+ which is roughly: 5mbps
+
+ Marking: there should be a variation of marked packets:
+ AF41(TOS=0x88) AF42(0x90) AF43(0x98) and BE (0x0)
+
+More tests required to see the interaction of several sources (other
+than subnet 10.2.0.0/24).
+
+Edge31-ca-u32
+--------------
+
+Generate data using modified tcpblast from 10.2.0.2 (behind eth2) to the
+discard port of 10.1.0.2 (behind eth1)
+
+1) generate with src tos = 0x88
+ Metering: Allocated throughput should not exceed 2*CIR1 + 2*CIR2
+ approximately 5mbps
+ Marking: Should vary between 0x88,0x90,0x98 and 0x0
+
+2) generate with src tos = 0x90
+ Metering: Allocated throughput should not exceed CIR1 + 2*CIR2
+ approximately 3.5mbps
+ Marking: Should vary between 0x90,0x98 and 0x0
+
+3) generate with src tos = 0x98
+ Metering: Allocated throughput should not exceed CIR1 + CIR2
+ approximately 2.5mbps
+ Marking: Should vary between 0x98 and 0x0
+
+4) generate with src tos any other than the above
+ Metering: Allocated throughput should not exceed CIR1
+ approximately 1.5mbps
+ Marking: Should be consistent at 0x0
+
+TODO: Testing on how each color shares when all 4 types of packets
+are going through the edge device
+
+Edge32-cb-u32, Edge32-cb-chains
+-------------------------------
+
+-ftp with data direction from 10.2.0.2
+
+ *observe that the metering/policing works correctly (and the marking
+ as well).
+
+ Metering:
+ The data throughput should not exceed 2*CIR1 + 2*CIR2
+ + 2*PIR2 + PIR1 for u32 which is roughly: 6mbps
+ The data throughput should not exceed 2*CIR1 + 5*CIR2
+ for chains which is roughly: 6mbps
+
+ Marking: there should be a variation of marked packets:
+ AF41(TOS=0x88) AF42(0x90) AF43(0x98) and BE (0x0)
+
+TODO:
+-More tests required to see the interaction of several sources (other
+than subnet 10.2.0.0/24).
+-More tests needed to capture stats on how many times the CIR was exceeded
+but the data was not remarked etc.
+
+Edge32-ca-u32
+--------------
+
+Generate data using modified tcpblast from 10.2.0.2 (behind eth2) to the
+discard port of 10.1.0.2 (behind eth1)
+
+1) generate with src tos = 0x88
+ Metering: Allocated throughput should not exceed 2*CIR1 + 2*CIR2
+ +PIR1 -- approximately 4mbps
+ Marking: Should vary between 0x88,0x90,0x98 and 0x0
+
+2) generate with src tos = 0x90
+ Metering: Allocated throughput should not exceed CIR1 + 2*CIR2
+ + 2* PIR2 approximately 3mbps
+ Marking: Should vary between 0x90,0x98 and 0x0
+
+3) generate with src tos = 0x98
+ Metering: Allocated throughput should not exceed PIR1+ CIR1 + CIR2
+ approximately 2.5mbps
+ Marking: Should vary between 0x98 and 0x0
+
+4) generate with src tos any other than the above
+ Metering: Allocated throughput should not exceed CIR1
+ approximately 1mbps
+ Marking: Should be consistent at 0x0
+
+TODO: Testing on how each color shares when all 4 types of packets
+are going through the edge device
diff --git a/include-glibc/bits/sockunion.h b/include-glibc/bits/sockunion.h
index e69de29b..b83add82 100644
--- a/include-glibc/bits/sockunion.h
+++ b/include-glibc/bits/sockunion.h
@@ -0,0 +1,25 @@
+
+/* I cannot describe, how I laughed, when saw, that now sys/socket.h
+ includes ALL OF networking include files. 8)8)8)
+
+ Bravo! Aah, they forgot sockaddr_ll, sockaddr_pkt and sockaddr_nl...
+ Not a big problem, we only start the way to single UNIVERSAL include file:
+
+ #include <GNU-Gnu_is_Not_Unix.h>.
+
+ Jokes apart, it is full crap. Removed.
+ --ANK
+
+ */
+
+/* Union of all sockaddr types (required by IPv6 Basic API). This is
+ somewhat evil. */
+/* 8)8) Well, ipngwg really does strange things sometimes, but
+ not in such extent! It is removed long ago --ANK
+ */
+
+union sockaddr_union
+ {
+ struct sockaddr sa;
+ char __maxsize[128];
+ };
diff --git a/include-glibc/db.h b/include-glibc/db.h
index e69de29b..296584c2 100644
--- a/include-glibc/db.h
+++ b/include-glibc/db.h
@@ -0,0 +1,10 @@
+/* Mess with various libdb in various glibcs is something...
+ * Crooked hands of hackers can result in amazing results making
+ * incompatibility at all the levels without any reasons.
+ *
+ * The simplest trick which I was able to invent is to write fake
+ * db.h including db_185.h and adding -I/usr/include/db3 to CFLAGS.
+ * Looks ugly but compiles everywhere.
+ */
+
+#include <db_185.h>
diff --git a/include-glibc/glibc-bugs.h b/include-glibc/glibc-bugs.h
index e69de29b..65e3d8ad 100644
--- a/include-glibc/glibc-bugs.h
+++ b/include-glibc/glibc-bugs.h
@@ -0,0 +1,20 @@
+#ifndef __GLIBC_BUGS_H__
+#define __GLIBC_BUGS_H__ 1
+
+#include <features.h>
+#include <sys/types.h>
+
+#if defined(__GLIBC__) && __GLIBC__ >= 2
+
+#ifndef __KERNEL_STRICT_NAMES
+#define __KERNEL_STRICT_NAMES 1
+#endif
+
+#include <linux/types.h>
+
+typedef __u16 in_port_t;
+typedef __u32 in_addr_t;
+
+#endif
+
+#endif
diff --git a/include-glibc/netinet/in.h b/include-glibc/netinet/in.h
index e69de29b..784a66ca 100644
--- a/include-glibc/netinet/in.h
+++ b/include-glibc/netinet/in.h
@@ -0,0 +1,11 @@
+#ifndef _NETINET_IN_H
+#define _NETINET_IN_H 1
+
+#include "glibc-bugs.h"
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <linux/in.h>
+
+#define SOL_IP 0
+
+#endif /* netinet/in.h */
diff --git a/include-glibc/netinet/ip.h b/include-glibc/netinet/ip.h
index e69de29b..8812e676 100644
--- a/include-glibc/netinet/ip.h
+++ b/include-glibc/netinet/ip.h
@@ -0,0 +1,9 @@
+#ifndef __NETINET_IP_H
+#define __NETINET_IP_H 1
+
+#include <glibc-bugs.h>
+#include <netinet/in.h>
+
+#include <linux/ip.h>
+
+#endif /* netinet/ip.h */
diff --git a/include-glibc/socketbits.h b/include-glibc/socketbits.h
index e69de29b..5421d6b8 100644
--- a/include-glibc/socketbits.h
+++ b/include-glibc/socketbits.h
@@ -0,0 +1,270 @@
+/* System-specific socket constants and types. Linux version.
+ Copyright (C) 1991, 92, 94, 95, 96, 97, 98 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+#ifndef _SOCKETBITS_H
+
+#define _SOCKETBITS_H 1
+#include <features.h>
+
+#define __need_size_t
+#define __need_NULL
+#include <stddef.h>
+
+
+__BEGIN_DECLS
+
+/* Type for length arguments in socket calls. */
+#ifndef __socklen_t_defined
+typedef unsigned int socklen_t;
+# define __socklen_t_defined
+#endif
+
+/* Types of sockets. */
+enum __socket_type
+{
+ SOCK_STREAM = 1, /* Sequenced, reliable, connection-based
+ byte streams. */
+#define SOCK_STREAM SOCK_STREAM
+ SOCK_DGRAM = 2, /* Connectionless, unreliable datagrams
+ of fixed maximum length. */
+#define SOCK_DGRAM SOCK_DGRAM
+ SOCK_RAW = 3, /* Raw protocol interface. */
+#define SOCK_RAW SOCK_RAW
+ SOCK_RDM = 4, /* Reliably-delivered messages. */
+#define SOCK_RDM SOCK_RDM
+ SOCK_SEQPACKET = 5, /* Sequenced, reliable, connection-based,
+ datagrams of fixed maximum length. */
+#define SOCK_SEQPACKET SOCK_SEQPACKET
+ SOCK_PACKET = 10 /* Linux specific way of getting packets
+ at the dev level. For writing rarp and
+ other similar things on the user level. */
+#define SOCK_PACKET SOCK_PACKET
+};
+
+/* Protocol families. */
+#define PF_UNSPEC 0 /* Unspecified. */
+#define PF_LOCAL 1 /* Local to host (pipes and file-domain). */
+#define PF_UNIX PF_LOCAL /* Old BSD name for PF_LOCAL. */
+#define PF_FILE PF_LOCAL /* POSIX name for PF_LOCAL. */
+#define PF_INET 2 /* IP protocol family. */
+#define PF_AX25 3 /* Amateur Radio AX.25. */
+#define PF_IPX 4 /* Novell Internet Protocol. */
+#define PF_APPLETALK 5 /* Don't use this. */
+#define PF_NETROM 6 /* Amateur radio NetROM. */
+#define PF_BRIDGE 7 /* Multiprotocol bridge. */
+#define PF_AAL5 8 /* Reserved for Werner's ATM. */
+#define PF_X25 9 /* Reserved for X.25 project. */
+#define PF_INET6 10 /* IP version 6. */
+#define PF_ROSE 11 /* Amateur Radio X.25 PLP */
+#define PF_DECnet 12 /* Reserved for DECnet project */
+#define PF_NETBEUI 13 /* Reserved for 802.2LLC project*/
+#define PF_SECURITY 14 /* Security callback pseudo AF */
+#define PF_KEY 15 /* PF_KEY key management API */
+#define PF_NETLINK 16
+#define PF_ROUTE PF_NETLINK /* Alias to emulate 4.4BSD */
+#define PF_PACKET 17 /* Packet family */
+#define PF_MAX 32 /* For now.. */
+
+/* Address families. */
+#define AF_UNSPEC PF_UNSPEC
+#define AF_LOCAL PF_LOCAL
+#define AF_UNIX PF_UNIX
+#define AF_FILE PF_FILE
+#define AF_INET PF_INET
+#define AF_AX25 PF_AX25
+#define AF_IPX PF_IPX
+#define AF_APPLETALK PF_APPLETALK
+#define AF_NETROM PF_NETROM
+#define AF_BRIDGE PF_BRIDGE
+#define AF_AAL5 PF_AAL5
+#define AF_X25 PF_X25
+#define AF_INET6 PF_INET6
+#define AF_ROSE PF_ROSE
+#define AF_DECnet PF_DECnet
+#define AF_NETBEUI PF_NETBEUI
+#define AF_SECURITY PF_SECURITY
+#define AF_KEY PF_KEY
+#define AF_NETLINK PF_NETLINK
+#define AF_ROUTE PF_ROUTE
+#define AF_PACKET PF_PACKET
+#define AF_MAX PF_MAX
+
+/* Socket level values. Others are defined in the appropriate headers.
+
+ XXX These definitions also should go into the appropriate headers as
+ far as they are available. */
+#define SOL_IPV6 41
+#define SOL_ICMPV6 58
+#define SOL_RAW 255
+#define SOL_AX25 257
+#define SOL_ATALK 258
+#define SOL_NETROM 259
+#define SOL_ROSE 260
+#define SOL_DECNET 261
+#define SOL_X25 262
+
+/* Maximum queue length specifiable by listen. */
+#define SOMAXCONN 128
+
+/* Get the definition of the macro to define the common sockaddr members. */
+#if __GLIBC_MINOR__ >= 1
+#include <bits/sockaddr.h>
+#else
+#include <sockaddrcom.h>
+#endif
+
+/* Structure describing a generic socket address. */
+struct sockaddr
+ {
+ __SOCKADDR_COMMON (sa_); /* Common data: address family and length. */
+ char sa_data[14]; /* Address data. */
+ };
+
+
+/* Bits in the FLAGS argument to `send', `recv', et al. */
+enum
+ {
+ MSG_OOB = 0x01, /* Process out-of-band data. */
+#define MSG_OOB MSG_OOB
+ MSG_PEEK = 0x02, /* Peek at incoming messages. */
+#define MSG_PEEK MSG_PEEK
+ MSG_DONTROUTE = 0x04, /* Don't use local routing. */
+#define MSG_DONTROUTE MSG_DONTROUTE
+ MSG_CTRUNC = 0x08, /* Control data lost before delivery. */
+#define MSG_CTRUNC MSG_CTRUNC
+ MSG_PROXY = 0x10, /* Supply or ask second address. */
+#define MSG_PROXY MSG_PROXY
+ MSG_TRUNC = 0x20,
+#define MSG_TRUNC MSG_TRUNC
+ MSG_DONTWAIT = 0x40,
+#define MSG_DONTWAIT MSG_DONTWAIT
+ MSG_WAITALL = 0x100,
+#define MSG_WAITALL MSG_WAITALL
+ MSG_ERRQUEUE = 0x2000,
+#define MSG_ERRQUEUE MSG_ERRQUEUE
+ MSG_NOSIGNAL = 0x4000,
+#define MSG_NOSIGNAL MSG_NOSIGNAL
+ };
+
+
+/* Structure describing messages sent by
+ `sendmsg' and received by `recvmsg'. */
+struct msghdr
+ {
+ __ptr_t msg_name; /* Address to send to/receive from. */
+ socklen_t msg_namelen; /* Length of address data. */
+
+ struct iovec *msg_iov; /* Vector of data to send/receive into. */
+ size_t msg_iovlen; /* Number of elements in the vector. */
+
+ __ptr_t msg_control; /* Ancillary data (eg BSD filedesc passing). */
+ size_t msg_controllen; /* Ancillary data buffer length. */
+
+ int msg_flags; /* Flags on received message. */
+ };
+
+/* Structure used for storage of ancillary data object information. */
+struct cmsghdr
+ {
+ size_t cmsg_len; /* Length of data in cmsg_data plus length
+ of cmsghdr structure. */
+ int cmsg_level; /* Originating protocol. */
+ int cmsg_type; /* Protocol specific type. */
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && __GNUC__ >= 2
+ unsigned char __cmsg_data[0]; /* Ancillary data. */
+#endif
+ };
+
+/* Ancillary data object manipulation macros. */
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && __GNUC__ >= 2
+# define CMSG_DATA(cmsg) ((cmsg)->__cmsg_data)
+#else
+# define CMSG_DATA(cmsg) ((unsigned char *) ((struct cmsghdr *) (cmsg) + 1))
+#endif
+#define CMSG_NXTHDR(mhdr, cmsg) __cmsg_nxthdr (mhdr, cmsg)
+#define CMSG_FIRSTHDR(mhdr) \
+ ((size_t) (mhdr)->msg_controllen >= sizeof (struct cmsghdr) \
+ ? (struct cmsghdr *) (mhdr)->msg_control : (struct cmsghdr *) NULL)
+#define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) )
+#define CMSG_SPACE(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + CMSG_ALIGN(len))
+#define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + (len))
+
+
+#ifndef _EXTERN_INLINE
+# define _EXTERN_INLINE extern __inline
+#endif
+extern struct cmsghdr *__cmsg_nxthdr __P ((struct msghdr *__mhdr,
+ struct cmsghdr *__cmsg));
+_EXTERN_INLINE struct cmsghdr *
+__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg)
+{
+ if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr))
+ /* Malformed length: stop iterating (the kernel header does the same). */
+ return NULL;
+
+ __cmsg = (struct cmsghdr *)
+ ((unsigned char *) __cmsg + CMSG_ALIGN(__cmsg->cmsg_len));
+
+ if ( (unsigned char *) (__cmsg + 1) >
+ (unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen)
+ /* Next header would run past the control buffer: no more entries. */
+ return NULL;
+ return __cmsg;
+}
+
+/* Socket level message types. This must match the definitions in
+ <linux/socket.h>. */
+enum
+ {
+ SCM_RIGHTS = 0x01, /* Data array contains access rights. */
+#define SCM_RIGHTS SCM_RIGHTS
+ SCM_CREDENTIALS = 0x02, /* Data array is `struct ucred'. */
+#define SCM_CREDENTIALS SCM_CREDENTIALS
+ };
+
+
+
+/* Get socket manipulation related informations from kernel headers. */
+#ifdef THIS_IS_CRAP
+#ifndef _LINUX_TYPES_H
+# define _LINUX_TYPES_H
+#endif
+#endif
+
+#include <asm/socket.h>
+#include <asm/types.h>
+
+struct ucred
+{
+ __u32 pid;
+ __u32 uid;
+ __u32 gid;
+};
+
+
+/* Structure used to manipulate the SO_LINGER option. */
+struct linger
+ {
+ int l_onoff; /* Nonzero to linger on close. */
+ int l_linger; /* Time to linger. */
+ };
+
+__END_DECLS
+
+#endif /* socketbits.h */
diff --git a/include/SNAPSHOT.h b/include/SNAPSHOT.h
index e69de29b..e8107edf 100644
--- a/include/SNAPSHOT.h
+++ b/include/SNAPSHOT.h
@@ -0,0 +1 @@
+static char SNAPSHOT[] = "020116";
diff --git a/include/libnetlink.h b/include/libnetlink.h
index e69de29b..45d3ad2b 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -0,0 +1,46 @@
+#ifndef __LIBNETLINK_H__
+#define __LIBNETLINK_H__ 1
+
+#include <asm/types.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+struct rtnl_handle
+{
+ int fd;
+ struct sockaddr_nl local;
+ struct sockaddr_nl peer;
+ __u32 seq;
+ __u32 dump;
+};
+
+extern int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions);
+extern void rtnl_close(struct rtnl_handle *rth);
+extern int rtnl_wilddump_request(struct rtnl_handle *rth, int fam, int type);
+extern int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len);
+extern int rtnl_dump_filter(struct rtnl_handle *rth,
+ int (*filter)(struct sockaddr_nl *, struct nlmsghdr *n, void *),
+ void *arg1,
+ int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+ void *arg2);
+extern int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer,
+ unsigned groups, struct nlmsghdr *answer,
+ int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+ void *jarg);
+extern int rtnl_send(struct rtnl_handle *rth, char *buf, int);
+
+
+extern int addattr32(struct nlmsghdr *n, int maxlen, int type, __u32 data);
+extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, void *data, int alen);
+extern int rta_addattr32(struct rtattr *rta, int maxlen, int type, __u32 data);
+extern int rta_addattr_l(struct rtattr *rta, int maxlen, int type, void *data, int alen);
+
+extern int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len);
+
+extern int rtnl_listen(struct rtnl_handle *, int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+ void *jarg);
+extern int rtnl_from_file(FILE *, int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+ void *jarg);
+
+#endif /* __LIBNETLINK_H__ */
+
diff --git a/include/ll_map.h b/include/ll_map.h
index e69de29b..739f157e 100644
--- a/include/ll_map.h
+++ b/include/ll_map.h
@@ -0,0 +1,12 @@
+#ifndef __LL_MAP_H__
+#define __LL_MAP_H__ 1
+
+extern int ll_remember_index(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg);
+extern int ll_init_map(struct rtnl_handle *rth);
+extern int ll_name_to_index(char *name);
+extern const char *ll_index_to_name(int idx);
+extern const char *ll_idx_n2a(int idx, char *buf);
+extern int ll_index_to_type(int idx);
+extern unsigned ll_index_to_flags(int idx);
+
+#endif /* __LL_MAP_H__ */
diff --git a/include/rt_names.h b/include/rt_names.h
index e69de29b..6ac29204 100644
--- a/include/rt_names.h
+++ b/include/rt_names.h
@@ -0,0 +1,28 @@
+#ifndef RT_NAMES_H_
+#define RT_NAMES_H_ 1
+
+const char* rtnl_rtprot_n2a(int id, char *buf, int len);
+const char* rtnl_rtscope_n2a(int id, char *buf, int len);
+const char* rtnl_rttable_n2a(int id, char *buf, int len);
+const char* rtnl_rtrealm_n2a(int id, char *buf, int len);
+const char* rtnl_dsfield_n2a(int id, char *buf, int len);
+int rtnl_rtprot_a2n(int *id, char *arg);
+int rtnl_rtscope_a2n(int *id, char *arg);
+int rtnl_rttable_a2n(int *id, char *arg);
+int rtnl_rtrealm_a2n(__u32 *id, char *arg);
+int rtnl_dsfield_a2n(__u32 *id, char *arg);
+
+const char *inet_proto_n2a(int proto, char *buf, int len);
+int inet_proto_a2n(char *buf);
+
+
+const char * ll_type_n2a(int type, char *buf, int len);
+
+const char *ll_addr_n2a(unsigned char *addr, int alen, int type, char *buf, int blen);
+int ll_addr_a2n(unsigned char *lladdr, int len, char *arg);
+
+const char * ll_proto_n2a(unsigned short id, char *buf, int len);
+int ll_proto_a2n(unsigned short *id, char *buf);
+
+
+#endif
diff --git a/include/rtm_map.h b/include/rtm_map.h
index e69de29b..70bda7d0 100644
--- a/include/rtm_map.h
+++ b/include/rtm_map.h
@@ -0,0 +1,10 @@
+#ifndef __RTM_MAP_H__
+#define __RTM_MAP_H__ 1
+
+char *rtnl_rtntype_n2a(int id, char *buf, int len);
+int rtnl_rtntype_a2n(int *id, char *arg);
+
+int get_rt_realms(__u32 *realms, char *arg);
+
+
+#endif /* __RTM_MAP_H__ */
diff --git a/include/tcp_diag.h b/include/tcp_diag.h
index e69de29b..23014df3 100644
--- a/include/tcp_diag.h
+++ b/include/tcp_diag.h
@@ -0,0 +1,119 @@
+#ifndef _TCP_DIAG_H_
+#define _TCP_DIAG_H_ 1
+
+/* Replace with dynamically allocated value */
+#define NETLINK_TCPDIAG 4
+
+/* Just some random number */
+#define TCPDIAG_GETSOCK 18
+
+/* Socket identity */
+struct tcpdiag_sockid
+{
+ __u16 tcpdiag_sport;
+ __u16 tcpdiag_dport;
+ __u32 tcpdiag_src[4];
+ __u32 tcpdiag_dst[4];
+ __u32 tcpdiag_if;
+ __u32 tcpdiag_cookie[2];
+#define TCPDIAG_NOCOOKIE (~0U)
+};
+
+/* Request structure */
+
+struct tcpdiagreq
+{
+ __u8 tcpdiag_family; /* Family of addresses. */
+ __u8 tcpdiag_src_len;
+ __u8 tcpdiag_dst_len;
+ __u8 tcpdiag_ext; /* Query extended information */
+
+ struct tcpdiag_sockid id;
+
+ __u32 tcpdiag_states; /* States to dump */
+ __u32 tcpdiag_dbs; /* Tables to dump (NI) */
+};
+
+enum
+{
+ TCPDIAG_REQ_NONE,
+ TCPDIAG_REQ_BYTECODE,
+};
+
+#define TCPDIAG_REQ_MAX TCPDIAG_REQ_BYTECODE
+
+/* Bytecode is sequence of 4 byte commands followed by variable arguments.
+ * All the commands identified by "code" are conditional jumps forward:
+ * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be
+ * length of the command and its arguments.
+ */
+
+struct tcpdiag_bc_op
+{
+ unsigned char code;
+ unsigned char yes;
+ unsigned short no;
+};
+
+enum
+{
+ TCPDIAG_BC_NOP,
+ TCPDIAG_BC_JMP,
+ TCPDIAG_BC_S_GE,
+ TCPDIAG_BC_S_LE,
+ TCPDIAG_BC_D_GE,
+ TCPDIAG_BC_D_LE,
+ TCPDIAG_BC_AUTO,
+ TCPDIAG_BC_S_COND,
+ TCPDIAG_BC_D_COND,
+};
+
+struct tcpdiag_hostcond
+{
+ __u8 family;
+ __u8 prefix_len;
+ int port;
+ __u32 addr[0];
+};
+
+/* Base info structure. It contains socket identity (addrs/ports/cookie)
+ * and, alas, the information shown by netstat. */
+struct tcpdiagmsg
+{
+ __u8 tcpdiag_family;
+ __u8 tcpdiag_state;
+ __u8 tcpdiag_timer;
+ __u8 tcpdiag_retrans;
+
+ struct tcpdiag_sockid id;
+
+ __u32 tcpdiag_expires;
+ __u32 tcpdiag_rqueue;
+ __u32 tcpdiag_wqueue;
+ __u32 tcpdiag_uid;
+ __u32 tcpdiag_inode;
+};
+
+/* Extensions */
+
+enum
+{
+ TCPDIAG_NONE,
+ TCPDIAG_MEMINFO,
+ TCPDIAG_INFO,
+};
+
+#define TCPDIAG_MAX TCPDIAG_INFO
+
+
+/* TCPDIAG_MEMINFO */
+
+struct tcpdiag_meminfo
+{
+ __u32 tcpdiag_rmem;
+ __u32 tcpdiag_wmem;
+ __u32 tcpdiag_fmem;
+ __u32 tcpdiag_tmem;
+};
+
+#endif /* _TCP_DIAG_H_ */
diff --git a/include/utils.h b/include/utils.h
index e69de29b..e9ba5a38 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -0,0 +1,104 @@
+#ifndef __UTILS_H__
+#define __UTILS_H__ 1
+
+#include <asm/types.h>
+#include <resolv.h>
+
+#include "libnetlink.h"
+#include "ll_map.h"
+#include "rtm_map.h"
+
+extern int preferred_family;
+extern int show_stats;
+extern int show_details;
+extern int show_raw;
+extern int resolve_hosts;
+extern int oneline;
+extern char * _SL_;
+
+#ifndef IPPROTO_ESP
+#define IPPROTO_ESP 50
+#endif
+#ifndef IPPROTO_AH
+#define IPPROTO_AH 51
+#endif
+
+#define SPRINT_BSIZE 64
+#define SPRINT_BUF(x) char x[SPRINT_BSIZE]
+
+extern void incomplete_command(void) __attribute__((noreturn));
+
+#define NEXT_ARG() do { argv++; if (--argc <= 0) incomplete_command(); } while(0)
+
+typedef struct
+{
+ __u8 family;
+ __u8 bytelen;
+ __s16 bitlen;
+ __u32 data[4];
+} inet_prefix;
+
+#define DN_MAXADDL 20
+#ifndef AF_DECnet
+#define AF_DECnet 12
+#endif
+
+struct dn_naddr
+{
+ unsigned short a_len;
+ unsigned char a_addr[DN_MAXADDL];
+};
+
+#define IPX_NODE_LEN 6
+
+struct ipx_addr {
+ u_int32_t ipx_net;
+ u_int8_t ipx_node[IPX_NODE_LEN];
+};
+
+extern __u32 get_addr32(char *name);
+extern int get_addr_1(inet_prefix *dst, char *arg, int family);
+extern int get_prefix_1(inet_prefix *dst, char *arg, int family);
+extern int get_addr(inet_prefix *dst, char *arg, int family);
+extern int get_prefix(inet_prefix *dst, char *arg, int family);
+
+extern int get_integer(int *val, char *arg, int base);
+extern int get_unsigned(unsigned *val, char *arg, int base);
+#define get_byte get_u8
+#define get_ushort get_u16
+#define get_short get_s16
+extern int get_u32(__u32 *val, char *arg, int base);
+extern int get_u16(__u16 *val, char *arg, int base);
+extern int get_s16(__s16 *val, char *arg, int base);
+extern int get_u8(__u8 *val, char *arg, int base);
+extern int get_s8(__s8 *val, char *arg, int base);
+
+extern __u8* hexstring_n2a(const __u8 *str, int len, __u8 *buf, int blen);
+extern __u8* hexstring_a2n(const __u8 *str, __u8 *buf, int blen);
+
+extern const char *format_host(int af, int len, void *addr, char *buf, int buflen);
+extern const char *rt_addr_n2a(int af, int len, void *addr, char *buf, int buflen);
+
+void invarg(char *, char *) __attribute__((noreturn));
+void duparg(char *, char *) __attribute__((noreturn));
+void duparg2(char *, char *) __attribute__((noreturn));
+int matches(char *arg, char *pattern);
+extern int inet_addr_match(inet_prefix *a, inet_prefix *b, int bits);
+
+const char *dnet_ntop(int af, const void *addr, char *str, size_t len);
+int dnet_pton(int af, const char *src, void *addr);
+
+const char *ipx_ntop(int af, const void *addr, char *str, size_t len);
+int ipx_pton(int af, const char *src, void *addr);
+
+extern int __iproute2_hz_internal;
+extern int __get_hz(void);
+
+/*
+ * Return the cached HZ value, calling __get_hz() to fill the cache
+ * while it still holds 0.
+ */
+static __inline__ int get_hz(void)
+{
+	if (!__iproute2_hz_internal)
+		__iproute2_hz_internal = __get_hz();
+
+	return __iproute2_hz_internal;
+}
+
+#endif /* __UTILS_H__ */
diff --git a/ip/Makefile b/ip/Makefile
index e69de29b..2aa00518 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -0,0 +1,22 @@
+IPOBJ=ip.o ipaddress.o iproute.o iprule.o \
+ rtm_map.o iptunnel.o ipneigh.o iplink.o ipmaddr.o \
+ ipmonitor.o ipmroute.o
+
+RTMONOBJ=rtmon.o
+
+ALLOBJ=$(IPOBJ) $(RTMONOBJ)
+TARGETS=ip rtmon
+
+all: $(TARGETS)
+
+ip: $(IPOBJ) $(LIBNETLINK) $(LIBUTIL)
+
+rtmon: $(RTMONOBJ) $(LIBNETLINK)
+
+install: all
+ install -m 0755 -s $(TARGETS) $(DESTDIR)$(SBINDIR)
+ install -m 0755 routel routef $(DESTDIR)$(SBINDIR)
+
+clean:
+ rm -f $(ALLOBJ) $(TARGETS)
+
diff --git a/ip/ifcfg b/ip/ifcfg
index e69de29b..ed6960f7 100644
--- a/ip/ifcfg
+++ b/ip/ifcfg
@@ -0,0 +1,145 @@
+#! /bin/bash
+
+# Exit status is the sum of all /proc/sys/net/ipv4/conf/*/forwarding
+# values, or 2 when that directory does not exist.
+CheckForwarding () {
+	local sbase fwd
+	sbase=/proc/sys/net/ipv4/conf
+	if [ ! -d $sbase ]; then
+		return 2
+	fi
+	fwd=0
+	for dir in $sbase/*/forwarding; do
+		fwd=$(( $fwd + $(cat $dir) ))
+	done
+	return $fwd
+}
+
+# Send SIGHUP to rdisc via killall; if that fails, run "rdisc -fs".
+RestartRDISC () {
+ killall -HUP rdisc || rdisc -fs
+}
+
+# Exit status is the classful prefix length for the dotted address in $1,
+# chosen from its first octet: 24 for 192..223, 16 for 128..191, 8 below
+# that, and 0 when $1 is empty or the first octet is 0 or >= 224.
+ABCMaskLen () {
+	local class;
+
+	class=${1%%.*}
+	# Handle the empty argument on its own: the numeric tests below
+	# would otherwise run with a missing operand and fall through to
+	# "return 8".
+	if [ -z "$1" ]; then return 0; fi
+	if [ "$class" -eq 0 -o "$class" -ge 224 ]; then return 0
+	elif [ "$class" -ge 192 ]; then return 24
+	elif [ "$class" -ge 128 ]; then return 16
+	else return 8; fi
+}
+
+label="label $1"
+ldev="$1"
+dev=${1%:*}
+if [ "$dev" = "" -o "$1" = "help" ]; then
+ echo "Usage: ifcfg DEV [[add|del [ADDR[/LEN]] [PEER] | stop]" 1>&2
+ echo " add - add new address" 1>&2
+ echo " del - delete address" 1>&2
+ echo " stop - completely disable IP" 1>&2
+ exit 1
+fi
+shift
+
+CheckForwarding
+fwd=$?
+if [ $fwd -ne 0 ]; then
+ echo "Forwarding is ON or its state is unknown ($fwd). OK, No RDISC." 1>&2
+fi
+
+
+deleting=0
+case "$1" in
+add) shift ;;
+stop)
+ if [ "$ldev" != "$dev" ]; then
+ echo "Cannot stop alias $ldev" 1>&2
+ exit 1;
+ fi
+ ip -4 addr flush dev $dev $label || exit 1
+ if [ $fwd -eq 0 ]; then RestartRDISC; fi
+ exit 0 ;;
+del*)
+ deleting=1; shift ;;
+*)
+esac
+
+ipaddr=
+pfxlen=
+if [ "$1" != "" ]; then
+ ipaddr=${1%/*}
+ if [ "$1" != "$ipaddr" ]; then
+ pfxlen=${1#*/}
+ fi
+ if [ "$ipaddr" = "" ]; then
+ echo "$1 is bad IP address." 1>&2
+ exit 1
+ fi
+fi
+shift
+
+peer=$1
+if [ "$peer" != "" ]; then
+ if [ "$pfxlen" != "" -a "$pfxlen" != "32" ]; then
+ echo "Peer address with non-trivial netmask." 1>&2
+ exit 1
+ fi
+ pfx="$ipaddr peer $peer"
+else
+ if [ "$pfxlen" = "" ]; then
+ ABCMaskLen $ipaddr
+ pfxlen=$?
+ fi
+ pfx="$ipaddr/$pfxlen"
+fi
+
+if [ "$ldev" = "$dev" -a "$ipaddr" != "" ]; then
+ label=
+fi
+
+if [ $deleting -ne 0 ]; then
+ ip addr del $pfx dev $dev $label || exit 1
+ if [ $fwd -eq 0 ]; then RestartRDISC; fi
+ exit 0
+fi
+
+
+if ! ip link set up dev $dev ; then
+ echo "Error: cannot enable interface $dev." 1>&2
+ exit 1
+fi
+if [ "$ipaddr" = "" ]; then exit 0; fi
+
+if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then
+ echo "Error: some host already uses address $ipaddr on $dev." 1>&2
+ exit 1
+fi
+
+if ! ip address add $pfx brd + dev $dev $label; then
+ echo "Error: failed to add $pfx on $dev." 1>&2
+ exit 1
+fi
+
+arping -q -A -c 1 -I $dev $ipaddr
+noarp=$?
+( sleep 2 ;
+ arping -q -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null &
+
+ip route add unreachable 224.0.0.0/24 >& /dev/null
+ip route add unreachable 255.255.255.255 >& /dev/null
+if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then
+ ip route add 224.0.0.0/4 dev $dev scope global >& /dev/null
+fi
+
+if [ $fwd -eq 0 ]; then
+ if [ $noarp -eq 0 ]; then
+ ip ro append default dev $dev metric 30000 scope global
+ elif [ "$peer" != "" ]; then
+ if ping -q -c 2 -w 4 $peer ; then
+ ip ro append default via $peer dev $dev metric 30001
+ fi
+ fi
+ RestartRDISC
+fi
+
+exit 0
diff --git a/ip/ip.c b/ip/ip.c
index e69de29b..fe379926 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -0,0 +1,167 @@
+/*
+ * ip.c "ip" utility frontend.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *
+ * Changes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <string.h>
+
+#include "SNAPSHOT.h"
+#include "utils.h"
+#include "ip_common.h"
+
+int preferred_family = AF_UNSPEC;
+int show_stats = 0;
+int resolve_hosts = 0;
+int oneline = 0;
+char * _SL_ = NULL;
+
+static void usage(void) __attribute__((noreturn));
+
+/* Write the top-level "ip" synopsis to stderr and exit with status -1. */
+static void usage(void)
+{
+	static const char synopsis[] =
+		"Usage: ip [ OPTIONS ] OBJECT { COMMAND | help }\n"
+		"where OBJECT := { link | addr | route | rule | neigh | tunnel |\n"
+		" maddr | mroute | monitor }\n"
+		" OPTIONS := { -V[ersion] | -s[tatistics] | -r[esolve] |\n"
+		" -f[amily] { inet | inet6 | ipx | dnet | link } | -o[neline] }\n";
+
+	fputs(synopsis, stderr);
+	exit(-1);
+}
+
+/*
+ * Entry point of the "ip" frontend.
+ *
+ * Consumes the leading options, sets the global option variables, then
+ * hands the remaining arguments to one of the do_*() handlers. The handler
+ * is chosen from the program's own name (e.g. "ipaddr") when it is one of
+ * the names tested below, otherwise from the first non-option argument.
+ * With no object argument, or on "help", usage() is called and does not
+ * return.
+ */
+int main(int argc, char **argv)
+{
+ char *basename;
+
+ /* basename = argv[0] with any leading directory part removed. */
+ basename = strrchr(argv[0], '/');
+ if (basename == NULL)
+ basename = argv[0];
+ else
+ basename++;
+
+ /* Option loop: stops at "--" or at the first argument not starting with '-'. */
+ while (argc > 1) {
+ char *opt = argv[1];
+ if (strcmp(opt,"--") == 0) {
+ argc--; argv++;
+ break;
+ }
+ if (opt[0] != '-')
+ break;
+ /* Treat "--option" the same as "-option". */
+ if (opt[1] == '-')
+ opt++;
+ if (matches(opt, "-family") == 0) {
+ /* -family takes a value: step onto it here, the shift at the loop end consumes it. */
+ argc--;
+ argv++;
+ if (argc <= 1)
+ usage();
+ if (strcmp(argv[1], "inet") == 0)
+ preferred_family = AF_INET;
+ else if (strcmp(argv[1], "inet6") == 0)
+ preferred_family = AF_INET6;
+ else if (strcmp(argv[1], "dnet") == 0)
+ preferred_family = AF_DECnet;
+ else if (strcmp(argv[1], "link") == 0)
+ preferred_family = AF_PACKET;
+ else if (strcmp(argv[1], "ipx") == 0)
+ preferred_family = AF_IPX;
+ else if (strcmp(argv[1], "help") == 0)
+ usage();
+ else
+ invarg(argv[1], "invalid protocol family");
+ } else if (strcmp(opt, "-4") == 0) {
+ preferred_family = AF_INET;
+ } else if (strcmp(opt, "-6") == 0) {
+ preferred_family = AF_INET6;
+ } else if (strcmp(opt, "-0") == 0) {
+ preferred_family = AF_PACKET;
+ } else if (strcmp(opt, "-I") == 0) {
+ preferred_family = AF_IPX;
+ } else if (strcmp(opt, "-D") == 0) {
+ preferred_family = AF_DECnet;
+ } else if (matches(opt, "-stats") == 0 ||
+ matches(opt, "-statistics") == 0) {
+ ++show_stats;
+ } else if (matches(opt, "-resolve") == 0) {
+ ++resolve_hosts;
+ } else if (matches(opt, "-oneline") == 0) {
+ ++oneline;
+#if 0
+ } else if (matches(opt, "-numeric") == 0) {
+ rtnl_names_numeric++;
+#endif
+ } else if (matches(opt, "-Version") == 0) {
+ printf("ip utility, iproute2-ss%s\n", SNAPSHOT);
+ exit(0);
+ } else if (matches(opt, "-help") == 0) {
+ usage();
+ } else {
+ fprintf(stderr, "Option \"%s\" is unknown, try \"ip -help\".\n", opt);
+ exit(-1);
+ }
+ argc--; argv++;
+ }
+
+ /* Record separator used by the printers: a backslash in one-line mode, a newline otherwise. */
+ _SL_ = oneline ? "\\" : "\n" ;
+
+ /* Dispatch on the program name: the object is implied, so only argv[0] is skipped. */
+ if (strcmp(basename, "ipaddr") == 0)
+ return do_ipaddr(argc-1, argv+1);
+ if (strcmp(basename, "ipmaddr") == 0)
+ return do_multiaddr(argc-1, argv+1);
+ if (strcmp(basename, "iproute") == 0)
+ return do_iproute(argc-1, argv+1);
+ if (strcmp(basename, "iprule") == 0)
+ return do_iprule(argc-1, argv+1);
+ if (strcmp(basename, "ipneigh") == 0)
+ return do_ipneigh(argc-1, argv+1);
+ if (strcmp(basename, "iplink") == 0)
+ return do_iplink(argc-1, argv+1);
+ if (strcmp(basename, "iptunnel") == 0)
+ return do_iptunnel(argc-1, argv+1);
+ if (strcmp(basename, "ipmonitor") == 0)
+ return do_ipmonitor(argc-1, argv+1);
+
+ /* Otherwise argv[1] names the object; the handler receives what follows it. */
+ if (argc > 1) {
+ if (matches(argv[1], "address") == 0)
+ return do_ipaddr(argc-2, argv+2);
+ if (matches(argv[1], "maddress") == 0)
+ return do_multiaddr(argc-2, argv+2);
+ if (matches(argv[1], "route") == 0)
+ return do_iproute(argc-2, argv+2);
+ if (matches(argv[1], "rule") == 0)
+ return do_iprule(argc-2, argv+2);
+ if (matches(argv[1], "mroute") == 0)
+ return do_multiroute(argc-2, argv+2);
+ if (matches(argv[1], "neighbor") == 0 ||
+ matches(argv[1], "neighbour") == 0)
+ return do_ipneigh(argc-2, argv+2);
+ if (matches(argv[1], "link") == 0)
+ return do_iplink(argc-2, argv+2);
+ if (matches(argv[1], "tunnel") == 0 ||
+ strcmp(argv[1], "tunl") == 0)
+ return do_iptunnel(argc-2, argv+2);
+ if (matches(argv[1], "monitor") == 0)
+ return do_ipmonitor(argc-2, argv+2);
+ if (matches(argv[1], "help") == 0)
+ usage();
+ fprintf(stderr, "Object \"%s\" is unknown, try \"ip help\".\n", argv[1]);
+ exit(-1);
+ }
+ /* No object given; usage() is declared noreturn, so no return follows. */
+ usage();
+}
diff --git a/ip/ip_common.h b/ip/ip_common.h
index e69de29b..5ac43218 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -0,0 +1,20 @@
+extern int print_linkinfo(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg);
+extern int print_addrinfo(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg);
+extern int print_neigh(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg);
+extern int ipaddr_list(int argc, char **argv);
+extern int ipaddr_list_link(int argc, char **argv);
+extern int iproute_monitor(int argc, char **argv);
+extern void iplink_usage(void) __attribute__((noreturn));
+extern void iproute_reset_filter(void);
+extern void ipaddr_reset_filter(int);
+extern void ipneigh_reset_filter(void);
+extern int print_route(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg);
+extern int do_ipaddr(int argc, char **argv);
+extern int do_iproute(int argc, char **argv);
+extern int do_iprule(int argc, char **argv);
+extern int do_ipneigh(int argc, char **argv);
+extern int do_iptunnel(int argc, char **argv);
+extern int do_iplink(int argc, char **argv);
+extern int do_ipmonitor(int argc, char **argv);
+extern int do_multiaddr(int argc, char **argv);
+extern int do_multiroute(int argc, char **argv);
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index e69de29b..0d00280c 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -0,0 +1,898 @@
+/*
+ * ipaddress.c "ip address".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Laszlo Valko <valko@linux.karinthy.hu> 990223: address label must be zero terminated
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <fnmatch.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ll_map.h"
+#include "ip_common.h"
+
+/* Selection criteria and flush state shared by the printers in this file. */
+static struct
+{
+ int ifindex; /* when non-zero, only this interface index is accepted */
+ int family; /* address family to show; 0 accepts any */
+ int oneline; /* when set, print_addrinfo prefixes each address with "index: name" */
+ int showqueue; /* when set, print_linkinfo also calls print_queuelen() */
+ inet_prefix pfx; /* when pfx.family is set, the local address is tested with inet_addr_match() */
+ int scope, scopemask; /* reject when (scope ^ ifa_scope) & scopemask is non-zero */
+ int flags, flagmask; /* reject when (flags ^ ifa_flags) & flagmask is non-zero */
+ int up; /* when set, links without IFF_UP are skipped */
+ char *label; /* fnmatch() pattern applied to the interface name / address label */
+ int flushed; /* incremented for every address queued for deletion */
+ char *flushb; /* buffer collecting RTM_DELADDR requests; non-NULL selects flush mode */
+ int flushp; /* number of bytes of flushb currently in use */
+ int flushe; /* limit that flushp + message length must not exceed before sending */
+ struct rtnl_handle *rth; /* handle the queued requests are sent on */
+} filter;
+
+static int do_link;
+
+static void usage(void) __attribute__((noreturn));
+
+/*
+ * Write the "ip addr" synopsis to stderr and exit with status -1.
+ * When do_link is set, iplink_usage() is called first.
+ */
+static void usage(void)
+{
+	static const char *const text[] = {
+		"Usage: ip addr {add|del} IFADDR dev STRING\n",
+		" ip addr {show|flush} [ dev STRING ] [ scope SCOPE-ID ]\n",
+		" [ to PREFIX ] [ FLAG-LIST ] [ label PATTERN ]\n",
+		"IFADDR := PREFIX | ADDR peer PREFIX\n",
+		" [ broadcast ADDR ] [ anycast ADDR ]\n",
+		" [ label STRING ] [ scope SCOPE-ID ]\n",
+		"SCOPE-ID := [ host | link | global | NUMBER ]\n",
+		"FLAG-LIST := [ FLAG-LIST ] FLAG\n",
+		"FLAG := [ permanent | dynamic | secondary | primary |\n",
+		" tentative | deprecated ]\n",
+	};
+	unsigned i;
+
+	if (do_link)
+		iplink_usage();
+
+	for (i = 0; i < sizeof(text) / sizeof(text[0]); i++)
+		fputs(text[i], stderr);
+	exit(-1);
+}
+
+/*
+ * Print the interface flag word as "<NAME,NAME,...> " on fp.
+ * IFF_RUNNING is never shown. Known bits are printed by name in the table
+ * order below, any remaining bits as one hex number, and ",M-DOWN" is
+ * appended when mdown is non-zero.
+ */
+void print_link_flags(FILE *fp, unsigned flags, unsigned mdown)
+{
+	static const struct {
+		unsigned bit;
+		const char *name;
+	} known[] = {
+		{ IFF_LOOPBACK,    "LOOPBACK" },
+		{ IFF_BROADCAST,   "BROADCAST" },
+		{ IFF_POINTOPOINT, "POINTOPOINT" },
+		{ IFF_MULTICAST,   "MULTICAST" },
+		{ IFF_NOARP,       "NOARP" },
+		{ IFF_ALLMULTI,    "ALLMULTI" },
+		{ IFF_PROMISC,     "PROMISC" },
+		{ IFF_MASTER,      "MASTER" },
+		{ IFF_SLAVE,       "SLAVE" },
+		{ IFF_DEBUG,       "DEBUG" },
+		{ IFF_DYNAMIC,     "DYNAMIC" },
+		{ IFF_AUTOMEDIA,   "AUTOMEDIA" },
+		{ IFF_PORTSEL,     "PORTSEL" },
+		{ IFF_NOTRAILERS,  "NOTRAILERS" },
+		{ IFF_UP,          "UP" },
+	};
+	unsigned i;
+
+	fprintf(fp, "<");
+	flags &= ~IFF_RUNNING;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!(flags & known[i].bit))
+			continue;
+		/* Clear the bit first so the separator reflects what is left. */
+		flags &= ~known[i].bit;
+		fprintf(fp, "%s%s", known[i].name, flags ? "," : "");
+	}
+
+	if (flags)
+		fprintf(fp, "%x", flags);
+	if (mdown)
+		fprintf(fp, ",M-DOWN");
+	fprintf(fp, "> ");
+}
+
+/*
+ * Query the transmit queue length of interface `name` with the
+ * SIOCGIFTXQLEN ioctl and, when it is non-zero, print "qlen N" to stdout.
+ * Nothing is printed if the socket cannot be created; an ioctl failure is
+ * reported with perror().
+ */
+void print_queuelen(char *name)
+{
+	struct ifreq ifr;
+	int s;
+
+	s = socket(AF_INET, SOCK_STREAM, 0);
+	if (s < 0)
+		return;
+
+	memset(&ifr, 0, sizeof(ifr));
+	/*
+	 * ifr_name is a fixed-size array: copy at most size-1 bytes so an
+	 * over-long name cannot overflow it. The memset above guarantees
+	 * the terminating NUL.
+	 */
+	strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name) - 1);
+	if (ioctl(s, SIOCGIFTXQLEN, &ifr) < 0) {
+		perror("SIOCGIFTXQLEN");
+		close(s);
+		return;
+	}
+	close(s);
+
+	if (ifr.ifr_qlen)
+		printf("qlen %d", ifr.ifr_qlen);
+}
+
+/*
+ * Print one link message (RTM_NEWLINK or RTM_DELLINK) to the FILE passed
+ * in `arg`. Other message types, and links rejected by `filter`, are
+ * skipped with return value 0. Returns -1 when the message is shorter than
+ * its fixed header or carries no IFLA_IFNAME attribute, 0 otherwise.
+ * `who` is not used here.
+ */
+int print_linkinfo(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+ FILE *fp = (FILE*)arg;
+ struct ifinfomsg *ifi = NLMSG_DATA(n);
+ struct rtattr * tb[IFLA_MAX+1];
+ int len = n->nlmsg_len;
+ unsigned m_flag = 0;
+
+ if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
+ return 0;
+
+ /* len becomes the number of attribute bytes after the ifinfomsg header. */
+ len -= NLMSG_LENGTH(sizeof(*ifi));
+ if (len < 0)
+ return -1;
+
+ if (filter.ifindex && ifi->ifi_index != filter.ifindex)
+ return 0;
+ if (filter.up && !(ifi->ifi_flags&IFF_UP))
+ return 0;
+
+ memset(tb, 0, sizeof(tb));
+ parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
+ if (tb[IFLA_IFNAME] == NULL) {
+ fprintf(stderr, "BUG: nil ifname\n");
+ return -1;
+ }
+ /* The label pattern is applied to the interface name only when showing link-level info. */
+ if (filter.label &&
+ (!filter.family || filter.family == AF_PACKET) &&
+ fnmatch(filter.label, RTA_DATA(tb[IFLA_IFNAME]), 0))
+ return 0;
+
+ if (n->nlmsg_type == RTM_DELLINK)
+ fprintf(fp, "Deleted ");
+
+ fprintf(fp, "%d: %s", ifi->ifi_index,
+ tb[IFLA_IFNAME] ? (char*)RTA_DATA(tb[IFLA_IFNAME]) : "<nil>");
+
+ if (tb[IFLA_LINK]) {
+ SPRINT_BUF(b1);
+ int iflink = *(int*)RTA_DATA(tb[IFLA_LINK]);
+ if (iflink == 0)
+ fprintf(fp, "@NONE: ");
+ else {
+ fprintf(fp, "@%s: ", ll_idx_n2a(iflink, b1));
+ /* m_flag ends up non-zero when the linked interface is not IFF_UP. */
+ m_flag = ll_index_to_flags(iflink);
+ m_flag = !(m_flag & IFF_UP);
+ }
+ } else {
+ fprintf(fp, ": ");
+ }
+ print_link_flags(fp, ifi->ifi_flags, m_flag);
+
+ if (tb[IFLA_MTU])
+ fprintf(fp, "mtu %u ", *(int*)RTA_DATA(tb[IFLA_MTU]));
+ if (tb[IFLA_QDISC])
+ fprintf(fp, "qdisc %s ", (char*)RTA_DATA(tb[IFLA_QDISC]));
+#ifdef IFLA_MASTER
+ if (tb[IFLA_MASTER]) {
+ SPRINT_BUF(b1);
+ fprintf(fp, "master %s ", ll_idx_n2a(*(int*)RTA_DATA(tb[IFLA_MASTER]), b1));
+ }
+#endif
+ if (filter.showqueue)
+ print_queuelen((char*)RTA_DATA(tb[IFLA_IFNAME]));
+
+ /* Link-layer type and addresses: shown unless a non-AF_PACKET family was selected. */
+ if (!filter.family || filter.family == AF_PACKET) {
+ SPRINT_BUF(b1);
+ fprintf(fp, "%s", _SL_);
+ fprintf(fp, " link/%s ", ll_type_n2a(ifi->ifi_type, b1, sizeof(b1)));
+
+ if (tb[IFLA_ADDRESS]) {
+ fprintf(fp, "%s", ll_addr_n2a(RTA_DATA(tb[IFLA_ADDRESS]),
+ RTA_PAYLOAD(tb[IFLA_ADDRESS]),
+ ifi->ifi_type,
+ b1, sizeof(b1)));
+ }
+ if (tb[IFLA_BROADCAST]) {
+ if (ifi->ifi_flags&IFF_POINTOPOINT)
+ fprintf(fp, " peer ");
+ else
+ fprintf(fp, " brd ");
+ fprintf(fp, "%s", ll_addr_n2a(RTA_DATA(tb[IFLA_BROADCAST]),
+ RTA_PAYLOAD(tb[IFLA_BROADCAST]),
+ ifi->ifi_type,
+ b1, sizeof(b1)));
+ }
+ }
+ /* Counters are printed only for "ip link" with -s; a second -s adds the error breakdown. */
+ if (do_link && tb[IFLA_STATS] && show_stats) {
+ struct net_device_stats slocal;
+ struct net_device_stats *s = RTA_DATA(tb[IFLA_STATS]);
+ /* Copy to a local when the payload is not aligned for unsigned long access. */
+ if (((unsigned long)s) & (sizeof(unsigned long)-1)) {
+ memcpy(&slocal, s, sizeof(slocal));
+ s = &slocal;
+ }
+ fprintf(fp, "%s", _SL_);
+ fprintf(fp, " RX: bytes packets errors dropped overrun mcast %s%s",
+ s->rx_compressed ? "compressed" : "", _SL_);
+ fprintf(fp, " %-10lu %-8lu %-7lu %-7lu %-7lu %-7lu",
+ s->rx_bytes, s->rx_packets, s->rx_errors,
+ s->rx_dropped, s->rx_over_errors,
+ s->multicast
+ );
+ if (s->rx_compressed)
+ fprintf(fp, " %-7lu", s->rx_compressed);
+ if (show_stats > 1) {
+ fprintf(fp, "%s", _SL_);
+ fprintf(fp, " RX errors: length crc frame fifo missed%s", _SL_);
+ fprintf(fp, " %-7lu %-7lu %-7lu %-7lu %-7lu",
+ s->rx_length_errors,
+ s->rx_crc_errors,
+ s->rx_frame_errors,
+ s->rx_fifo_errors,
+ s->rx_missed_errors
+ );
+ }
+ fprintf(fp, "%s", _SL_);
+ fprintf(fp, " TX: bytes packets errors dropped carrier collsns %s%s",
+ s->tx_compressed ? "compressed" : "", _SL_);
+ fprintf(fp, " %-10lu %-8lu %-7lu %-7lu %-7lu %-7lu",
+ s->tx_bytes, s->tx_packets, s->tx_errors,
+ s->tx_dropped, s->tx_carrier_errors, s->collisions);
+ if (s->tx_compressed)
+ fprintf(fp, " %-7lu", s->tx_compressed);
+ if (show_stats > 1) {
+ fprintf(fp, "%s", _SL_);
+ fprintf(fp, " TX errors: aborted fifo window heartbeat%s", _SL_);
+ fprintf(fp, " %-7lu %-7lu %-7lu %-7lu",
+ s->tx_aborted_errors,
+ s->tx_fifo_errors,
+ s->tx_window_errors,
+ s->tx_heartbeat_errors
+ );
+ }
+ }
+ fprintf(fp, "\n");
+ fflush(fp);
+ return 0;
+}
+
+/*
+ * Send the deletion requests accumulated in filter.flushb (filter.flushp
+ * bytes) with rtnl_send() and mark the buffer empty again.
+ * Returns 0 on success, -1 if the send failed.
+ */
+static int flush_update(void)
+{
+	if (rtnl_send(filter.rth, filter.flushb, filter.flushp) < 0) {
+		/* No trailing newline: perror() appends ": <error>\n" itself. */
+		perror("Failed to send flush request");
+		return -1;
+	}
+	filter.flushp = 0;
+	return 0;
+}
+
+/*
+ * Handle one address message (RTM_NEWADDR or RTM_DELADDR).
+ *
+ * Messages of other types, or rejected by `filter`, are skipped (return 0).
+ * In flush mode (filter.flushb set) each accepted RTM_NEWADDR message is
+ * copied into the flush buffer as an RTM_DELADDR request and is printed
+ * only when show_stats >= 2. Otherwise the address is printed to the FILE
+ * passed in `arg`.
+ *
+ * Returns 0 on success, -1 when the message is shorter than its fixed
+ * header or a pending flush buffer could not be sent. `who` is not used.
+ */
+int print_addrinfo(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct ifaddrmsg *ifa = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * rta_tb[IFA_MAX+1];
+	char abuf[256];
+	SPRINT_BUF(b1);
+
+	if (n->nlmsg_type != RTM_NEWADDR && n->nlmsg_type != RTM_DELADDR)
+		return 0;
+	/* len becomes the number of attribute bytes after the ifaddrmsg header. */
+	len -= NLMSG_LENGTH(sizeof(*ifa));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	if (filter.flushb && n->nlmsg_type != RTM_NEWADDR)
+		return 0;
+
+	memset(rta_tb, 0, sizeof(rta_tb));
+	parse_rtattr(rta_tb, IFA_MAX, IFA_RTA(ifa), len);
+
+	/* If only one of LOCAL/ADDRESS is present, let it stand in for the other. */
+	if (!rta_tb[IFA_LOCAL])
+		rta_tb[IFA_LOCAL] = rta_tb[IFA_ADDRESS];
+	if (!rta_tb[IFA_ADDRESS])
+		rta_tb[IFA_ADDRESS] = rta_tb[IFA_LOCAL];
+
+	if (filter.ifindex && filter.ifindex != ifa->ifa_index)
+		return 0;
+	if ((filter.scope^ifa->ifa_scope)&filter.scopemask)
+		return 0;
+	if ((filter.flags^ifa->ifa_flags)&filter.flagmask)
+		return 0;
+	if (filter.label) {
+		const char *label;
+		/* b1 is free at this point; it is rewritten before its next use. */
+		if (rta_tb[IFA_LABEL])
+			label = RTA_DATA(rta_tb[IFA_LABEL]);
+		else
+			label = ll_idx_n2a(ifa->ifa_index, b1);
+		if (fnmatch(filter.label, label, 0) != 0)
+			return 0;
+	}
+	if (filter.pfx.family) {
+		if (rta_tb[IFA_LOCAL]) {
+			inet_prefix dst;
+			unsigned alen = RTA_PAYLOAD(rta_tb[IFA_LOCAL]);
+
+			/* Never copy more than dst.data can hold. */
+			if (alen > sizeof(dst.data))
+				alen = sizeof(dst.data);
+			memset(&dst, 0, sizeof(dst));
+			dst.family = ifa->ifa_family;
+			memcpy(&dst.data, RTA_DATA(rta_tb[IFA_LOCAL]), alen);
+			if (inet_addr_match(&dst, &filter.pfx, filter.pfx.bitlen))
+				return 0;
+		}
+	}
+
+	if (filter.flushb) {
+		struct nlmsghdr *fn;
+		/* Send what is queued first if this message would not fit. */
+		if (NLMSG_ALIGN(filter.flushp) + n->nlmsg_len > filter.flushe) {
+			if (flush_update())
+				return -1;
+		}
+		fn = (struct nlmsghdr*)(filter.flushb + NLMSG_ALIGN(filter.flushp));
+		memcpy(fn, n, n->nlmsg_len);
+		/* Turn the dumped NEWADDR message into a deletion request. */
+		fn->nlmsg_type = RTM_DELADDR;
+		fn->nlmsg_flags = NLM_F_REQUEST;
+		fn->nlmsg_seq = ++filter.rth->seq;
+		filter.flushp = (((char*)fn) + n->nlmsg_len) - filter.flushb;
+		filter.flushed++;
+		if (show_stats < 2)
+			return 0;
+	}
+
+	if (n->nlmsg_type == RTM_DELADDR)
+		fprintf(fp, "Deleted ");
+
+	if (filter.oneline || filter.flushb)
+		fprintf(fp, "%u: %s", ifa->ifa_index, ll_index_to_name(ifa->ifa_index));
+	if (ifa->ifa_family == AF_INET)
+		fprintf(fp, " inet ");
+	else if (ifa->ifa_family == AF_INET6)
+		fprintf(fp, " inet6 ");
+	else if (ifa->ifa_family == AF_DECnet)
+		fprintf(fp, " dnet ");
+	else if (ifa->ifa_family == AF_IPX)
+		fprintf(fp, " ipx ");
+	else
+		fprintf(fp, " family %d ", ifa->ifa_family);
+
+	if (rta_tb[IFA_LOCAL]) {
+		fprintf(fp, "%s", rt_addr_n2a(ifa->ifa_family,
+					      RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
+					      RTA_DATA(rta_tb[IFA_LOCAL]),
+					      abuf, sizeof(abuf)));
+
+		/*
+		 * Compare the whole address, not a fixed 4 bytes: IPv6
+		 * addresses are longer and DECnet addresses shorter than
+		 * that, so a fixed length either misses a differing peer
+		 * or reads past the payload.
+		 */
+		if (rta_tb[IFA_ADDRESS] == NULL ||
+		    (RTA_PAYLOAD(rta_tb[IFA_ADDRESS]) == RTA_PAYLOAD(rta_tb[IFA_LOCAL]) &&
+		     memcmp(RTA_DATA(rta_tb[IFA_ADDRESS]),
+			    RTA_DATA(rta_tb[IFA_LOCAL]),
+			    RTA_PAYLOAD(rta_tb[IFA_LOCAL])) == 0)) {
+			fprintf(fp, "/%d ", ifa->ifa_prefixlen);
+		} else {
+			fprintf(fp, " peer %s/%d ",
+				rt_addr_n2a(ifa->ifa_family,
+					    RTA_PAYLOAD(rta_tb[IFA_ADDRESS]),
+					    RTA_DATA(rta_tb[IFA_ADDRESS]),
+					    abuf, sizeof(abuf)),
+				ifa->ifa_prefixlen);
+		}
+	}
+
+	if (rta_tb[IFA_BROADCAST]) {
+		fprintf(fp, "brd %s ",
+			rt_addr_n2a(ifa->ifa_family,
+				    RTA_PAYLOAD(rta_tb[IFA_BROADCAST]),
+				    RTA_DATA(rta_tb[IFA_BROADCAST]),
+				    abuf, sizeof(abuf)));
+	}
+	if (rta_tb[IFA_ANYCAST]) {
+		fprintf(fp, "any %s ",
+			rt_addr_n2a(ifa->ifa_family,
+				    RTA_PAYLOAD(rta_tb[IFA_ANYCAST]),
+				    RTA_DATA(rta_tb[IFA_ANYCAST]),
+				    abuf, sizeof(abuf)));
+	}
+	fprintf(fp, "scope %s ", rtnl_rtscope_n2a(ifa->ifa_scope, b1, sizeof(b1)));
+	/* Each named flag is cleared once printed so that only unnamed bits remain. */
+	if (ifa->ifa_flags&IFA_F_SECONDARY) {
+		ifa->ifa_flags &= ~IFA_F_SECONDARY;
+		fprintf(fp, "secondary ");
+	}
+	if (ifa->ifa_flags&IFA_F_TENTATIVE) {
+		ifa->ifa_flags &= ~IFA_F_TENTATIVE;
+		fprintf(fp, "tentative ");
+	}
+	if (ifa->ifa_flags&IFA_F_DEPRECATED) {
+		ifa->ifa_flags &= ~IFA_F_DEPRECATED;
+		fprintf(fp, "deprecated ");
+	}
+	if (!(ifa->ifa_flags&IFA_F_PERMANENT)) {
+		fprintf(fp, "dynamic ");
+	} else
+		ifa->ifa_flags &= ~IFA_F_PERMANENT;
+	if (ifa->ifa_flags)
+		fprintf(fp, "flags %02x ", ifa->ifa_flags);
+	if (rta_tb[IFA_LABEL])
+		fprintf(fp, "%s", (char*)RTA_DATA(rta_tb[IFA_LABEL]));
+	if (rta_tb[IFA_CACHEINFO]) {
+		struct ifa_cacheinfo *ci = RTA_DATA(rta_tb[IFA_CACHEINFO]);
+		char buf[128];
+		fprintf(fp, "%s", _SL_);
+		/* Lifetimes are unsigned 32-bit values; all-ones is shown as "forever". */
+		if (ci->ifa_valid == 0xFFFFFFFFU)
+			sprintf(buf, "valid_lft forever");
+		else
+			sprintf(buf, "valid_lft %usec", ci->ifa_valid);
+		if (ci->ifa_prefered == 0xFFFFFFFFU)
+			sprintf(buf+strlen(buf), " preferred_lft forever");
+		else
+			sprintf(buf+strlen(buf), " preferred_lft %usec", ci->ifa_prefered);
+		fprintf(fp, " %s", buf);
+	}
+	fprintf(fp, "\n");
+	fflush(fp);
+	return 0;
+}
+
+
+/*
+ * Node of a singly linked list of stored netlink messages.
+ * store_nlmsg() allocates each node with room for the whole message, so
+ * the message body follows `h` in the same allocation.
+ */
+struct nlmsg_list
+{
+ struct nlmsg_list *next; /* next node, NULL at the end of the list */
+ struct nlmsghdr h; /* header of the copied message */
+};
+
+/*
+ * Print, via print_addrinfo(), every stored RTM_NEWADDR message in `ainfo`
+ * that belongs to interface `ifindex` and matches filter.family (when set).
+ * Returns 0 after walking the list, or -1 as soon as a message is too
+ * short to hold a struct ifaddrmsg.
+ */
+int print_selected_addrinfo(int ifindex, struct nlmsg_list *ainfo, FILE *fp)
+{
+	for ( ;ainfo ; ainfo = ainfo->next) {
+		struct nlmsghdr *n = &ainfo->h;
+		struct ifaddrmsg *ifa = NLMSG_DATA(n);
+
+		if (n->nlmsg_type != RTM_NEWADDR)
+			continue;
+
+		/* Size of the structure, not of the pointer to it. */
+		if (n->nlmsg_len < NLMSG_LENGTH(sizeof(*ifa)))
+			return -1;
+
+		if (ifa->ifa_index != ifindex ||
+		    (filter.family && filter.family != ifa->ifa_family))
+			continue;
+
+		print_addrinfo(NULL, n, fp);
+	}
+	return 0;
+}
+
+
+/*
+ * Append a copy of message `n` to the list whose head pointer is passed
+ * in `arg`, then hand the message to ll_remember_index().
+ * Returns 0 on success, -1 if memory could not be allocated.
+ */
+int store_nlmsg(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	struct nlmsg_list **tail = (struct nlmsg_list**)arg;
+	struct nlmsg_list *node;
+
+	/* One allocation holds the link pointer followed by the whole message. */
+	node = malloc(n->nlmsg_len+sizeof(void*));
+	if (node == NULL)
+		return -1;
+
+	memcpy(&node->h, n, n->nlmsg_len);
+	node->next = NULL;
+
+	/* Advance to the terminating NULL link and attach the node there. */
+	while (*tail != NULL)
+		tail = &(*tail)->next;
+	*tail = node;
+
+	ll_remember_index(who, n, NULL);
+	return 0;
+}
+
+/*
+ * Shared implementation of address/link listing and of address flushing.
+ *
+ * Parses the filter keywords in argv, dumps all links, and then either
+ *  - flush != 0: repeatedly dumps addresses through print_addrinfo() and
+ *    calls flush_update() until a round marks nothing as flushed, or
+ *  - flush == 0: dumps addresses, removes links that have no address
+ *    passing the filters, and prints what is left.
+ *
+ * Returns -1 for bad flush arguments or an unknown device and 0 when a
+ * flush completes.  The listing path does not return: it ends in exit(0).
+ * Netlink failures terminate the process with exit(1).
+ */
+int ipaddr_list_or_flush(int argc, char **argv, int flush)
+{
+ struct nlmsg_list *linfo = NULL;
+ struct nlmsg_list *ainfo = NULL;
+ struct nlmsg_list *l;
+ struct rtnl_handle rth;
+ char *filter_dev = NULL;
+ int no_link = 0;
+
+ ipaddr_reset_filter(oneline);
+ filter.showqueue = 1;
+
+ if (filter.family == AF_UNSPEC)
+ filter.family = preferred_family;
+
+ if (flush) {
+ if (argc <= 0) {
+ fprintf(stderr, "Flush requires arguments.\n");
+ return -1;
+ }
+ if (filter.family == AF_PACKET) {
+ fprintf(stderr, "Cannot flush link addresses.\n");
+ return -1;
+ }
+ }
+
+ /* Parse filter keywords; any other word is taken as the device name. */
+ while (argc > 0) {
+ if (strcmp(*argv, "to") == 0) {
+ NEXT_ARG();
+ get_prefix(&filter.pfx, *argv, filter.family);
+ if (filter.family == AF_UNSPEC)
+ filter.family = filter.pfx.family;
+ } else if (strcmp(*argv, "scope") == 0) {
+ int scope = 0;
+ NEXT_ARG();
+ filter.scopemask = -1;
+ if (rtnl_rtscope_a2n(&scope, *argv)) {
+ /* "all" is the only accepted non-scope word: it clears the mask. */
+ if (strcmp(*argv, "all") != 0)
+ invarg("invalid \"scope\"\n", *argv);
+ scope = RT_SCOPE_NOWHERE;
+ filter.scopemask = 0;
+ }
+ filter.scope = scope;
+ } else if (strcmp(*argv, "up") == 0) {
+ filter.up = 1;
+ } else if (strcmp(*argv, "dynamic") == 0) {
+ filter.flags &= ~IFA_F_PERMANENT;
+ filter.flagmask |= IFA_F_PERMANENT;
+ } else if (strcmp(*argv, "permanent") == 0) {
+ filter.flags |= IFA_F_PERMANENT;
+ filter.flagmask |= IFA_F_PERMANENT;
+ } else if (strcmp(*argv, "secondary") == 0) {
+ filter.flags |= IFA_F_SECONDARY;
+ filter.flagmask |= IFA_F_SECONDARY;
+ } else if (strcmp(*argv, "primary") == 0) {
+ filter.flags &= ~IFA_F_SECONDARY;
+ filter.flagmask |= IFA_F_SECONDARY;
+ } else if (strcmp(*argv, "tentative") == 0) {
+ filter.flags |= IFA_F_TENTATIVE;
+ filter.flagmask |= IFA_F_TENTATIVE;
+ } else if (strcmp(*argv, "deprecated") == 0) {
+ filter.flags |= IFA_F_DEPRECATED;
+ filter.flagmask |= IFA_F_DEPRECATED;
+ } else if (strcmp(*argv, "label") == 0) {
+ NEXT_ARG();
+ filter.label = *argv;
+ } else {
+ if (strcmp(*argv, "dev") == 0) {
+ NEXT_ARG();
+ }
+ if (matches(*argv, "help") == 0)
+ usage();
+ if (filter_dev)
+ duparg2("dev", *argv);
+ filter_dev = *argv;
+ }
+ argv++; argc--;
+ }
+
+ if (rtnl_open(&rth, 0) < 0)
+ exit(1);
+
+ /* Links are dumped first; store_nlmsg() also passes each one to ll_remember_index(). */
+ if (rtnl_wilddump_request(&rth, preferred_family, RTM_GETLINK) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+
+ if (rtnl_dump_filter(&rth, store_nlmsg, &linfo, NULL, NULL) < 0) {
+ fprintf(stderr, "Dump terminated\n");
+ exit(1);
+ }
+
+ if (filter_dev) {
+ filter.ifindex = ll_name_to_index(filter_dev);
+ if (filter.ifindex <= 0) {
+ fprintf(stderr, "Device \"%s\" does not exist.\n", filter_dev);
+ return -1;
+ }
+ }
+
+ if (flush) {
+ int round = 0;
+ char flushb[4096-512];
+
+ filter.flushb = flushb;
+ filter.flushp = 0;
+ filter.flushe = sizeof(flushb);
+ filter.rth = &rth;
+
+ /* One dump per round; stop when a round leaves filter.flushed at 0. */
+ for (;;) {
+ if (rtnl_wilddump_request(&rth, filter.family, RTM_GETADDR) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+ filter.flushed = 0;
+ if (rtnl_dump_filter(&rth, print_addrinfo, stdout, NULL, NULL) < 0) {
+ fprintf(stderr, "Flush terminated\n");
+ exit(1);
+ }
+ if (filter.flushed == 0) {
+ if (round == 0) {
+ fprintf(stderr, "Nothing to flush.\n");
+ } else if (show_stats)
+ printf("*** Flush is complete after %d round%s ***\n", round, round>1?"s":"");
+ fflush(stdout);
+ return 0;
+ }
+ round++;
+ if (flush_update() < 0)
+ exit(1);
+ if (show_stats) {
+ printf("\n*** Round %d, deleting %d addresses ***\n", round, filter.flushed);
+ fflush(stdout);
+ }
+ }
+ }
+
+ if (filter.family != AF_PACKET) {
+ if (rtnl_wilddump_request(&rth, filter.family, RTM_GETADDR) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+
+ if (rtnl_dump_filter(&rth, store_nlmsg, &ainfo, NULL, NULL) < 0) {
+ fprintf(stderr, "Dump terminated\n");
+ exit(1);
+ }
+ }
+
+
+ /*
+ * With a specific non-link family selected, unlink from linfo every
+ * link that has no stored address passing all of the filters.
+ */
+ if (filter.family && filter.family != AF_PACKET) {
+ struct nlmsg_list **lp;
+ lp=&linfo;
+
+ if (filter.oneline)
+ no_link = 1;
+
+ while ((l=*lp)!=NULL) {
+ int ok = 0;
+ struct ifinfomsg *ifi = NLMSG_DATA(&l->h);
+ struct nlmsg_list *a;
+
+ for (a=ainfo; a; a=a->next) {
+ struct nlmsghdr *n = &a->h;
+ struct ifaddrmsg *ifa = NLMSG_DATA(n);
+
+ if (ifa->ifa_index != ifi->ifi_index ||
+ (filter.family && filter.family != ifa->ifa_family))
+ continue;
+ if ((filter.scope^ifa->ifa_scope)&filter.scopemask)
+ continue;
+ if ((filter.flags^ifa->ifa_flags)&filter.flagmask)
+ continue;
+ if (filter.pfx.family || filter.label) {
+ struct rtattr *tb[IFA_MAX+1];
+ memset(tb, 0, sizeof(tb));
+ parse_rtattr(tb, IFA_MAX, IFA_RTA(ifa), IFA_PAYLOAD(n));
+ /* Fall back to IFA_ADDRESS when no IFA_LOCAL attribute is present. */
+ if (!tb[IFA_LOCAL])
+ tb[IFA_LOCAL] = tb[IFA_ADDRESS];
+
+ if (filter.pfx.family && tb[IFA_LOCAL]) {
+ inet_prefix dst;
+ memset(&dst, 0, sizeof(dst));
+ dst.family = ifa->ifa_family;
+ memcpy(&dst.data, RTA_DATA(tb[IFA_LOCAL]), RTA_PAYLOAD(tb[IFA_LOCAL]));
+ if (inet_addr_match(&dst, &filter.pfx, filter.pfx.bitlen))
+ continue;
+ }
+ if (filter.label) {
+ SPRINT_BUF(b1);
+ const char *label;
+ if (tb[IFA_LABEL])
+ label = RTA_DATA(tb[IFA_LABEL]);
+ else
+ label = ll_idx_n2a(ifa->ifa_index, b1);
+ if (fnmatch(filter.label, label, 0) != 0)
+ continue;
+ }
+ }
+
+ ok = 1;
+ break;
+ }
+ /* The unlinked node is not freed here. */
+ if (!ok)
+ *lp = l->next;
+ else
+ lp = &l->next;
+ }
+ }
+
+ for (l=linfo; l; l = l->next) {
+ if (no_link || print_linkinfo(NULL, &l->h, stdout) == 0) {
+ struct ifinfomsg *ifi = NLMSG_DATA(&l->h);
+ if (filter.family != AF_PACKET)
+ print_selected_addrinfo(ifi->ifi_index, ainfo, stdout);
+ }
+ fflush(stdout);
+ }
+
+ exit(0);
+}
+
+/*
+ * Link listing entry point: selects AF_PACKET as the preferred family,
+ * sets do_link, and runs the shared lister in non-flush mode.
+ */
+int ipaddr_list_link(int argc, char **argv)
+{
+	do_link = 1;
+	preferred_family = AF_PACKET;
+
+	return ipaddr_list_or_flush(argc, argv, 0);
+}
+
+/* Clear the whole address filter, then record the one-line output mode. */
+void ipaddr_reset_filter(int oneline)
+{
+	memset(&filter, 0, sizeof filter);
+
+	filter.oneline = oneline;
+}
+
+/*
+ * Default scope for a local address: RT_SCOPE_HOST for an AF_INET
+ * address whose first byte is 127, otherwise 0.
+ */
+int default_scope(inet_prefix *lcl)
+{
+	if (lcl->family != AF_INET)
+		return 0;
+	if (lcl->bytelen < 1)
+		return 0;
+	if (*(__u8*)&lcl->data != 127)
+		return 0;
+
+	return RT_SCOPE_HOST;
+}
+
+/*
+ * Build an address request of type 'cmd' (RTM_NEWADDR or RTM_DELADDR)
+ * from the command-line arguments and send it with rtnl_talk().
+ *
+ * Returns -1 when "dev" is missing, when the device cannot be found, or
+ * when a "+"/"-" broadcast is requested for a non-IPv4 family.  A label
+ * that does not match the device, a netlink failure, and success all
+ * end the process through exit().
+ */
+int ipaddr_modify(int cmd, int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct {
+		struct nlmsghdr n;
+		struct ifaddrmsg ifa;
+		char buf[256];
+	} req;
+	char *d = NULL;
+	char *l = NULL;
+	inet_prefix lcl;
+	inet_prefix peer;
+	int local_len = 0;
+	int peer_len = 0;
+	int brd_len = 0;
+	int any_len = 0;
+	int scoped = 0;
+
+	memset(&req, 0, sizeof(req));
+	/*
+	 * Both prefixes are read further down even when the user supplied no
+	 * local/peer address, so they must start out zeroed.
+	 */
+	memset(&lcl, 0, sizeof(lcl));
+	memset(&peer, 0, sizeof(peer));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST;
+	req.n.nlmsg_type = cmd;
+	req.ifa.ifa_family = preferred_family;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "peer") == 0 ||
+		    strcmp(*argv, "remote") == 0) {
+			NEXT_ARG();
+
+			if (peer_len)
+				duparg("peer", *argv);
+			get_prefix(&peer, *argv, req.ifa.ifa_family);
+			peer_len = peer.bytelen;
+			if (req.ifa.ifa_family == AF_UNSPEC)
+				req.ifa.ifa_family = peer.family;
+			addattr_l(&req.n, sizeof(req), IFA_ADDRESS, &peer.data, peer.bytelen);
+			req.ifa.ifa_prefixlen = peer.bitlen;
+		} else if (matches(*argv, "broadcast") == 0 ||
+			   strcmp(*argv, "brd") == 0) {
+			inet_prefix addr;
+			NEXT_ARG();
+			if (brd_len)
+				duparg("broadcast", *argv);
+			/* "+" / "-" ask for a computed broadcast; resolved after parsing. */
+			if (strcmp(*argv, "+") == 0)
+				brd_len = -1;
+			else if (strcmp(*argv, "-") == 0)
+				brd_len = -2;
+			else {
+				get_addr(&addr, *argv, req.ifa.ifa_family);
+				if (req.ifa.ifa_family == AF_UNSPEC)
+					req.ifa.ifa_family = addr.family;
+				addattr_l(&req.n, sizeof(req), IFA_BROADCAST, &addr.data, addr.bytelen);
+				brd_len = addr.bytelen;
+			}
+		} else if (strcmp(*argv, "anycast") == 0) {
+			inet_prefix addr;
+			NEXT_ARG();
+			if (any_len)
+				duparg("anycast", *argv);
+			get_addr(&addr, *argv, req.ifa.ifa_family);
+			if (req.ifa.ifa_family == AF_UNSPEC)
+				req.ifa.ifa_family = addr.family;
+			addattr_l(&req.n, sizeof(req), IFA_ANYCAST, &addr.data, addr.bytelen);
+			any_len = addr.bytelen;
+		} else if (strcmp(*argv, "scope") == 0) {
+			int scope = 0;
+			NEXT_ARG();
+			/* Message first, offending argument second, as elsewhere in this file. */
+			if (rtnl_rtscope_a2n(&scope, *argv))
+				invarg("invalid scope value.", *argv);
+			req.ifa.ifa_scope = scope;
+			scoped = 1;
+		} else if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			d = *argv;
+		} else if (strcmp(*argv, "label") == 0) {
+			NEXT_ARG();
+			l = *argv;
+			addattr_l(&req.n, sizeof(req), IFA_LABEL, l, strlen(l)+1);
+		} else {
+			if (strcmp(*argv, "local") == 0) {
+				NEXT_ARG();
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (local_len)
+				duparg2("local", *argv);
+			get_prefix(&lcl, *argv, req.ifa.ifa_family);
+			if (req.ifa.ifa_family == AF_UNSPEC)
+				req.ifa.ifa_family = lcl.family;
+			addattr_l(&req.n, sizeof(req), IFA_LOCAL, &lcl.data, lcl.bytelen);
+			local_len = lcl.bytelen;
+		}
+		argc--; argv++;
+	}
+	if (d == NULL) {
+		fprintf(stderr, "Not enough information: \"dev\" argument is required.\n");
+		return -1;
+	}
+	if (l && matches(d, l) != 0) {
+		fprintf(stderr, "\"dev\" (%s) must match \"label\" (%s).\n", d, l);
+		exit(1);
+	}
+
+	/* Without an explicit peer, the local address doubles as IFA_ADDRESS. */
+	if (peer_len == 0 && local_len && cmd != RTM_DELADDR) {
+		peer = lcl;
+		addattr_l(&req.n, sizeof(req), IFA_ADDRESS, &lcl.data, lcl.bytelen);
+	}
+	if (req.ifa.ifa_prefixlen == 0)
+		req.ifa.ifa_prefixlen = lcl.bitlen;
+
+	if (brd_len < 0 && cmd != RTM_DELADDR) {
+		inet_prefix brd;
+		int i;
+		if (req.ifa.ifa_family != AF_INET) {
+			fprintf(stderr, "Broadcast can be set only for IPv4 addresses\n");
+			return -1;
+		}
+		brd = peer;
+		if (brd.bitlen <= 30) {
+			/* Set ("+") or clear ("-") every bit beyond the prefix length. */
+			for (i=31; i>=brd.bitlen; i--) {
+				if (brd_len == -1)
+					brd.data[0] |= htonl(1<<(31-i));
+				else
+					brd.data[0] &= ~htonl(1<<(31-i));
+			}
+			addattr_l(&req.n, sizeof(req), IFA_BROADCAST, &brd.data, brd.bytelen);
+			brd_len = brd.bytelen;
+		}
+	}
+	if (!scoped && cmd != RTM_DELADDR)
+		req.ifa.ifa_scope = default_scope(&lcl);
+
+	if (rtnl_open(&rth, 0) < 0)
+		exit(1);
+
+	ll_init_map(&rth);
+
+	if ((req.ifa.ifa_index = ll_name_to_index(d)) == 0) {
+		fprintf(stderr, "Cannot find device \"%s\"\n", d);
+		return -1;
+	}
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+		exit(2);
+
+	exit(0);
+}
+
+/*
+ * Address subcommand dispatcher.  With no arguments it lists addresses;
+ * otherwise the first word selects add, delete, list/show/lst, flush or
+ * help.  An unknown word prints an error and exits with -1.
+ */
+int do_ipaddr(int argc, char **argv)
+{
+	char *cmd;
+	int nargs;
+	char **args;
+
+	if (argc < 1)
+		return ipaddr_list_or_flush(0, NULL, 0);
+
+	cmd = *argv;
+	nargs = argc-1;
+	args = argv+1;
+
+	if (matches(cmd, "add") == 0)
+		return ipaddr_modify(RTM_NEWADDR, nargs, args);
+	if (matches(cmd, "delete") == 0)
+		return ipaddr_modify(RTM_DELADDR, nargs, args);
+	if (matches(cmd, "list") == 0 ||
+	    matches(cmd, "show") == 0 ||
+	    matches(cmd, "lst") == 0)
+		return ipaddr_list_or_flush(nargs, args, 0);
+	if (matches(cmd, "flush") == 0)
+		return ipaddr_list_or_flush(nargs, args, 1);
+	if (matches(cmd, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip address help\".\n", cmd);
+	exit(-1);
+}
+
diff --git a/ip/iplink.c b/ip/iplink.c
index e69de29b..1fc3dcfd 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -0,0 +1,397 @@
+/*
+ * iplink.c "ip link".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <linux/if.h>
+#include <linux/if_packet.h>
+#include <linux/if_ether.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/sockios.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the link usage text to stderr and exit with status -1. */
+void iplink_usage(void)
+{
+	static const char *const text[] = {
+		"Usage: ip link set DEVICE { up | down | arp { on | off } |\n",
+		" dynamic { on | off } |\n",
+		" multicast { on | off } | txqueuelen PACKETS |\n",
+		" name NEWNAME |\n",
+		" address LLADDR | broadcast LLADDR |\n",
+		" mtu MTU }\n",
+		" ip link show [ DEVICE ]\n",
+	};
+	unsigned i;
+
+	for (i = 0; i < sizeof(text)/sizeof(text[0]); i++)
+		fputs(text[i], stderr);
+	exit(-1);
+}
+
+/* File-local wrapper around iplink_usage(), which exits the process. */
+static void usage(void)
+{
+ iplink_usage();
+}
+
+/*
+ * Report that the argument of keyword 'msg' was neither "on" nor "off".
+ * Always returns -1 so callers can write "return on_off(...)".
+ */
+static int on_off(char *msg)
+{
+ fprintf(stderr, "Error: argument of \"%s\" must be \"on\" or \"off\"\n", msg);
+ return -1;
+}
+
+/*
+ * Open a datagram socket to issue interface ioctls on.  Tries PF_INET,
+ * then PF_PACKET, then PF_INET6.  If all fail, reports the errno of the
+ * first (PF_INET) attempt and returns -1.
+ */
+static int get_ctl_fd(void)
+{
+	static const int fallback[] = { PF_PACKET, PF_INET6 };
+	int first_errno;
+	int sock;
+	unsigned i;
+
+	sock = socket(PF_INET, SOCK_DGRAM, 0);
+	if (sock >= 0)
+		return sock;
+	first_errno = errno;
+
+	for (i = 0; i < sizeof(fallback)/sizeof(fallback[0]); i++) {
+		sock = socket(fallback[i], SOCK_DGRAM, 0);
+		if (sock >= 0)
+			return sock;
+	}
+
+	errno = first_errno;
+	perror("Cannot create control socket");
+	return -1;
+}
+
+/*
+ * Change the interface flags of 'dev': the bits selected by 'mask' are
+ * set to the corresponding bits of 'flags'.  SIOCSIFFLAGS is only issued
+ * when at least one masked bit actually differs.
+ *
+ * Returns 0 on success, and a non-zero value on failure (-1 for a name
+ * that does not fit, no control socket, or a failed SIOCGIFFLAGS; the
+ * ioctl result for a failed SIOCSIFFLAGS).
+ */
+static int do_chflags(char *dev, __u32 flags, __u32 mask)
+{
+	struct ifreq ifr;
+	int fd;
+	int err;
+
+	/* The name comes from the command line; never overrun ifr_name. */
+	if (strlen(dev) >= sizeof(ifr.ifr_name)) {
+		fprintf(stderr, "Error: device name \"%s\" is too long.\n", dev);
+		return -1;
+	}
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, dev);
+
+	fd = get_ctl_fd();
+	if (fd < 0)
+		return -1;
+	err = ioctl(fd, SIOCGIFFLAGS, &ifr);
+	if (err) {
+		perror("SIOCGIFFLAGS");
+		close(fd);
+		return -1;
+	}
+	if ((ifr.ifr_flags^flags)&mask) {
+		ifr.ifr_flags &= ~mask;
+		ifr.ifr_flags |= mask&flags;
+		err = ioctl(fd, SIOCSIFFLAGS, &ifr);
+		if (err)
+			perror("SIOCSIFFLAGS");
+	}
+	close(fd);
+	return err;
+}
+
+/*
+ * Rename interface 'dev' to 'newdev' with SIOCSIFNAME.
+ * Returns 0 on success, -1 on failure (including a name that does not
+ * fit into struct ifreq).
+ */
+static int do_changename(char *dev, char *newdev)
+{
+	struct ifreq ifr;
+	int fd;
+	int err;
+
+	/* Both names come from the command line; bound them before copying. */
+	if (strlen(dev) >= sizeof(ifr.ifr_name)) {
+		fprintf(stderr, "Error: device name \"%s\" is too long.\n", dev);
+		return -1;
+	}
+	if (strlen(newdev) >= sizeof(ifr.ifr_newname)) {
+		fprintf(stderr, "Error: device name \"%s\" is too long.\n", newdev);
+		return -1;
+	}
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, dev);
+	strcpy(ifr.ifr_newname, newdev);
+
+	fd = get_ctl_fd();
+	if (fd < 0)
+		return -1;
+	err = ioctl(fd, SIOCSIFNAME, &ifr);
+	if (err) {
+		perror("SIOCSIFNAME");
+		close(fd);
+		return -1;
+	}
+	close(fd);
+	return err;
+}
+
+/*
+ * Set the transmit queue length of 'dev' to 'qlen' with SIOCSIFTXQLEN.
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_qlen(char *dev, int qlen)
+{
+	struct ifreq ifr;
+	int s;
+
+	/* The name comes from the command line; never overrun ifr_name. */
+	if (strlen(dev) >= sizeof(ifr.ifr_name)) {
+		fprintf(stderr, "Error: device name \"%s\" is too long.\n", dev);
+		return -1;
+	}
+
+	s = get_ctl_fd();
+	if (s < 0)
+		return -1;
+
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, dev);
+	ifr.ifr_qlen = qlen;
+	if (ioctl(s, SIOCSIFTXQLEN, &ifr) < 0) {
+		/* Name the ioctl that actually failed. */
+		perror("SIOCSIFTXQLEN");
+		close(s);
+		return -1;
+	}
+	close(s);
+
+	return 0;
+}
+
+/*
+ * Set the MTU of 'dev' to 'mtu' with SIOCSIFMTU.
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_mtu(char *dev, int mtu)
+{
+	struct ifreq ifr;
+	int s;
+
+	/* The name comes from the command line; never overrun ifr_name. */
+	if (strlen(dev) >= sizeof(ifr.ifr_name)) {
+		fprintf(stderr, "Error: device name \"%s\" is too long.\n", dev);
+		return -1;
+	}
+
+	s = get_ctl_fd();
+	if (s < 0)
+		return -1;
+
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, dev);
+	ifr.ifr_mtu = mtu;
+	if (ioctl(s, SIOCSIFMTU, &ifr) < 0) {
+		perror("SIOCSIFMTU");
+		close(s);
+		return -1;
+	}
+	close(s);
+
+	return 0;
+}
+
+/*
+ * Look up the link-layer address properties of 'dev' by binding a
+ * PF_PACKET socket to it and reading the bound address back.
+ *
+ * On success stores the hardware type in *htype and returns the
+ * hardware address length; returns -1 on any failure.
+ */
+static int get_address(char *dev, int *htype)
+{
+	struct ifreq ifr;
+	struct sockaddr_ll me;
+	socklen_t alen;	/* getsockname() takes a socklen_t *, not an int * */
+	int s;
+
+	/* The name comes from the command line; never overrun ifr_name. */
+	if (strlen(dev) >= sizeof(ifr.ifr_name)) {
+		fprintf(stderr, "Error: device name \"%s\" is too long.\n", dev);
+		return -1;
+	}
+
+	s = socket(PF_PACKET, SOCK_DGRAM, 0);
+	if (s < 0) {
+		perror("socket(PF_PACKET)");
+		return -1;
+	}
+
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, dev);
+	if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
+		perror("SIOCGIFINDEX");
+		close(s);
+		return -1;
+	}
+
+	memset(&me, 0, sizeof(me));
+	me.sll_family = AF_PACKET;
+	me.sll_ifindex = ifr.ifr_ifindex;
+	me.sll_protocol = htons(ETH_P_LOOP);
+	if (bind(s, (struct sockaddr*)&me, sizeof(me)) == -1) {
+		perror("bind");
+		close(s);
+		return -1;
+	}
+
+	alen = sizeof(me);
+	if (getsockname(s, (struct sockaddr*)&me, &alen) == -1) {
+		perror("getsockname");
+		close(s);
+		return -1;
+	}
+	close(s);
+	*htype = me.sll_hatype;
+	return me.sll_halen;
+}
+
+/*
+ * Fill *ifr with device name 'dev', hardware type 'hatype' and the
+ * link-layer address parsed from the string 'lla'.
+ *
+ * Returns 0 on success; -1 if the name does not fit, the address cannot
+ * be parsed, or its length differs from 'halen'.
+ */
+static int parse_address(char *dev, int hatype, int halen, char *lla, struct ifreq *ifr)
+{
+	int alen;
+
+	/* The name comes from the command line; never overrun ifr_name. */
+	if (strlen(dev) >= sizeof(ifr->ifr_name)) {
+		fprintf(stderr, "Error: device name \"%s\" is too long.\n", dev);
+		return -1;
+	}
+
+	memset(ifr, 0, sizeof(*ifr));
+	strcpy(ifr->ifr_name, dev);
+	ifr->ifr_hwaddr.sa_family = hatype;
+	alen = ll_addr_a2n(ifr->ifr_hwaddr.sa_data, 14, lla);
+	if (alen < 0)
+		return -1;
+	if (alen != halen) {
+		fprintf(stderr, "Wrong address (%s) length: expected %d bytes\n", lla, halen);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Apply a prepared ifreq: sets the hardware broadcast address when
+ * 'brd' is non-zero, the hardware address otherwise.
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_address(struct ifreq *ifr, int brd)
+{
+	int rc = 0;
+	int sock;
+
+	sock = get_ctl_fd();
+	if (sock < 0)
+		return -1;
+
+	if (brd) {
+		if (ioctl(sock, SIOCSIFHWBROADCAST, ifr) < 0) {
+			perror("SIOCSIFHWBROADCAST");
+			rc = -1;
+		}
+	} else {
+		if (ioctl(sock, SIOCSIFHWADDR, ifr) < 0) {
+			perror("SIOCSIFHWADDR");
+			rc = -1;
+		}
+	}
+
+	close(sock);
+	return rc;
+}
+
+
+/*
+ * "ip link set": parse the requested changes, then apply them in this
+ * order: rename, txqueuelen, mtu, broadcast address, address, flags.
+ * New addresses are parsed (against the device's current hardware type
+ * and length) before anything is changed.
+ *
+ * Returns 0 on success and a non-zero value at the first failing step
+ * or for an on/off keyword with a bad value; a missing device name
+ * exits with -1.
+ */
+static int do_set(int argc, char **argv)
+{
+	char *dev = NULL;
+	__u32 mask = 0;
+	__u32 flags = 0;
+	int qlen = -1;
+	int mtu = -1;
+	char *newaddr = NULL;
+	char *newbrd = NULL;
+	struct ifreq ifr0, ifr1;
+	char *newname = NULL;
+	int htype, halen;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "up") == 0) {
+			mask |= IFF_UP;
+			flags |= IFF_UP;
+		} else if (strcmp(*argv, "down") == 0) {
+			mask |= IFF_UP;
+			flags &= ~IFF_UP;
+		} else if (strcmp(*argv, "name") == 0) {
+			NEXT_ARG();
+			newname = *argv;
+		} else if (matches(*argv, "address") == 0) {
+			NEXT_ARG();
+			newaddr = *argv;
+		} else if (matches(*argv, "broadcast") == 0 ||
+			   strcmp(*argv, "brd") == 0) {
+			NEXT_ARG();
+			newbrd = *argv;
+		} else if (matches(*argv, "txqueuelen") == 0 ||
+			   strcmp(*argv, "qlen") == 0 ||
+			   matches(*argv, "txqlen") == 0) {
+			NEXT_ARG();
+			if (qlen != -1)
+				duparg("txqueuelen", *argv);
+			if (get_integer(&qlen, *argv, 0))
+				invarg("Invalid \"txqueuelen\" value\n", *argv);
+		} else if (strcmp(*argv, "mtu") == 0) {
+			NEXT_ARG();
+			if (mtu != -1)
+				duparg("mtu", *argv);
+			if (get_integer(&mtu, *argv, 0))
+				invarg("Invalid \"mtu\" value\n", *argv);
+		} else if (strcmp(*argv, "multicast") == 0) {
+			NEXT_ARG();
+			mask |= IFF_MULTICAST;
+			if (strcmp(*argv, "on") == 0) {
+				flags |= IFF_MULTICAST;
+			} else if (strcmp(*argv, "off") == 0) {
+				flags &= ~IFF_MULTICAST;
+			} else
+				return on_off("multicast");
+		} else if (strcmp(*argv, "arp") == 0) {
+			NEXT_ARG();
+			/* The flag is inverted: "arp on" clears IFF_NOARP. */
+			mask |= IFF_NOARP;
+			if (strcmp(*argv, "on") == 0) {
+				flags &= ~IFF_NOARP;
+			} else if (strcmp(*argv, "off") == 0) {
+				flags |= IFF_NOARP;
+			} else
+				/* Report the keyword the user typed, which is "arp". */
+				return on_off("arp");
+#ifdef IFF_DYNAMIC
+		} else if (matches(*argv, "dynamic") == 0) {
+			NEXT_ARG();
+			mask |= IFF_DYNAMIC;
+			if (strcmp(*argv, "on") == 0) {
+				flags |= IFF_DYNAMIC;
+			} else if (strcmp(*argv, "off") == 0) {
+				flags &= ~IFF_DYNAMIC;
+			} else
+				return on_off("dynamic");
+#endif
+		} else {
+			if (strcmp(*argv, "dev") == 0) {
+				NEXT_ARG();
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (dev)
+				duparg2("dev", *argv);
+			dev = *argv;
+		}
+		argc--; argv++;
+	}
+
+	if (!dev) {
+		fprintf(stderr, "Not enough of information: \"dev\" argument is required.\n");
+		exit(-1);
+	}
+
+	/* Validate new addresses up front, before any change is made. */
+	if (newaddr || newbrd) {
+		halen = get_address(dev, &htype);
+		if (halen < 0)
+			return -1;
+		if (newaddr) {
+			if (parse_address(dev, htype, halen, newaddr, &ifr0) < 0)
+				return -1;
+		}
+		if (newbrd) {
+			if (parse_address(dev, htype, halen, newbrd, &ifr1) < 0)
+				return -1;
+		}
+	}
+
+	if (newname && strcmp(dev, newname)) {
+		if (do_changename(dev, newname) < 0)
+			return -1;
+		/* Later steps must address the device by its new name. */
+		dev = newname;
+	}
+	if (qlen != -1) {
+		if (set_qlen(dev, qlen) < 0)
+			return -1;
+	}
+	if (mtu != -1) {
+		if (set_mtu(dev, mtu) < 0)
+			return -1;
+	}
+	if (newaddr || newbrd) {
+		if (newbrd) {
+			if (set_address(&ifr1, 1) < 0)
+				return -1;
+		}
+		if (newaddr) {
+			if (set_address(&ifr0, 0) < 0)
+				return -1;
+		}
+	}
+	if (mask)
+		return do_chflags(dev, flags, mask);
+	return 0;
+}
+
+/*
+ * "ip link" dispatcher: no arguments lists links; otherwise the first
+ * word selects set, show/lst/list or help.  An unknown word prints an
+ * error and exits with -1.
+ */
+int do_iplink(int argc, char **argv)
+{
+	if (argc <= 0)
+		return ipaddr_list_link(0, NULL);
+
+	if (matches(*argv, "set") == 0)
+		return do_set(argc-1, argv+1);
+
+	if (matches(*argv, "show") == 0 ||
+	    matches(*argv, "lst") == 0 ||
+	    matches(*argv, "list") == 0)
+		return ipaddr_list_link(argc-1, argv+1);
+
+	if (matches(*argv, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip link help\".\n", *argv);
+	exit(-1);
+}
diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c
index e69de29b..b2c4adc0 100644
--- a/ip/ipmaddr.c
+++ b/ip/ipmaddr.c
@@ -0,0 +1,342 @@
+/*
+ * ipmaddr.c "ip maddress".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "rt_names.h"
+#include "utils.h"
+
+/*
+ * Listing filter for this file: 'dev' restricts output to one device
+ * name, 'family' selects which of the /proc readers multiaddr_list() runs.
+ */
+static struct {
+ char *dev;
+ int family;
+} filter;
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the "ip maddr" usage text to stderr and exit with status -1. */
+static void usage(void)
+{
+	fputs("Usage: ip maddr [ add | del ] MULTIADDR dev STRING\n", stderr);
+	fputs(" ip maddr show [ dev STRING ]\n", stderr);
+	exit(-1);
+}
+
+/*
+ * Convert a string of hex digit pairs ("0a1b...") into bytes.
+ *
+ * Writes one byte per digit pair to 'addr' and returns the number of
+ * bytes written (0 for an empty string).  Returns -1 if the string has
+ * an odd length or contains a character that is not a hex digit.  The
+ * caller must make sure 'addr' has room for strlen(str)/2 bytes.
+ */
+static int parse_hex(char *str, unsigned char *addr)
+{
+	static const char hexdigits[] = "0123456789abcdefABCDEF";
+	int len = 0;
+
+	while (*str) {
+		unsigned int byte;	/* %x stores through an unsigned int * */
+
+		if (str[1] == 0)
+			return -1;
+		/* Both characters are non-NUL here, so strchr() cannot match the terminator. */
+		if (!strchr(hexdigits, str[0]) || !strchr(hexdigits, str[1]))
+			return -1;
+		if (sscanf(str, "%2x", &byte) != 1)
+			return -1;
+		addr[len] = byte;
+		len++;
+		str += 2;
+	}
+	return len;
+}
+
+/*
+ * One multicast address entry read from /proc, kept in a list that
+ * maddr_ins() orders by interface index.
+ */
+struct ma_info
+{
+ struct ma_info *next;
+ int index; /* interface index */
+ int users; /* user count as read from /proc */
+ char *features; /* optional extra text printed after the address, or NULL */
+ char name[IFNAMSIZ]; /* interface name */
+ inet_prefix addr; /* the multicast address */
+};
+
+/*
+ * Insert 'm' into the list at *lst, keeping the list ordered by
+ * ascending interface index.  'm' goes in front of the first entry
+ * with a larger index, i.e. after all entries with an equal index.
+ */
+void maddr_ins(struct ma_info **lst, struct ma_info *m)
+{
+	struct ma_info **link = lst;
+
+	while (*link != NULL && (*link)->index <= m->index)
+		link = &(*link)->next;
+
+	m->next = *link;
+	*link = m;
+}
+
+/*
+ * Read link-layer multicast addresses from /proc/net/dev_mcast and
+ * insert them into *result_p, honouring filter.dev.
+ *
+ * Lines that do not parse, whose device name does not fit into
+ * ma_info.name, or whose address is malformed or longer than
+ * inet_prefix.data are skipped.  Reading stops if memory runs out.
+ * A missing /proc file is silently ignored.
+ */
+void read_dev_mcast(struct ma_info **result_p)
+{
+	char buf[256];
+	FILE *fp = fopen("/proc/net/dev_mcast", "r");
+
+	if (!fp)
+		return;
+
+	while (fgets(buf, sizeof(buf), fp)) {
+		char hexa[256];
+		char name[256];	/* same size as buf, so %s cannot overflow it */
+		struct ma_info m;
+		struct ma_info *ma;
+		int len;
+		int st;
+
+		memset(&m, 0, sizeof(m));
+		if (sscanf(buf, "%d%s%d%d%s", &m.index, name, &m.users, &st,
+			   hexa) != 5)
+			continue;
+		if (strlen(name) >= sizeof(m.name))
+			continue;
+		strcpy(m.name, name);
+
+		if (filter.dev && strcmp(filter.dev, m.name))
+			continue;
+
+		/* parse_hex() writes strlen/2 bytes; make sure they fit. */
+		if (strlen(hexa) > 2*sizeof(m.addr.data))
+			continue;
+
+		m.addr.family = AF_PACKET;
+
+		len = parse_hex(hexa, (unsigned char*)&m.addr.data);
+		if (len < 0)
+			continue;
+
+		ma = malloc(sizeof(m));
+		if (ma == NULL)
+			break;
+
+		memcpy(ma, &m, sizeof(m));
+		ma->addr.bytelen = len;
+		ma->addr.bitlen = len<<3;
+		if (st)
+			ma->features = "static";
+		maddr_ins(result_p, ma);
+	}
+	fclose(fp);
+}
+
+/*
+ * Read IPv4 multicast memberships from /proc/net/igmp and insert them
+ * into *result_p, honouring filter.dev.
+ *
+ * The first line is discarded.  After that, a line not starting with a
+ * tab names a device (index and name) and each tab-indented line below
+ * it carries one group address and its user count.  Group lines under
+ * a device line that could not be parsed are skipped.  A missing /proc
+ * file is silently ignored.
+ */
+void read_igmp(struct ma_info **result_p)
+{
+	struct ma_info m;
+	char buf[256];
+	int have_dev = 0;
+	FILE *fp = fopen("/proc/net/igmp", "r");
+
+	if (!fp)
+		return;
+	memset(&m, 0, sizeof(m));
+
+	/* Discard the first line; an empty file has nothing to report. */
+	if (!fgets(buf, sizeof(buf), fp)) {
+		fclose(fp);
+		return;
+	}
+
+	m.addr.family = AF_INET;
+	m.addr.bitlen = 32;
+	m.addr.bytelen = 4;
+
+	while (fgets(buf, sizeof(buf), fp)) {
+		struct ma_info *ma;
+
+		if (buf[0] != '\t') {
+			char name[256];	/* same size as buf, so %s cannot overflow it */
+			int index;
+
+			have_dev = 0;
+			if (sscanf(buf, "%d%s", &index, name) == 2 &&
+			    strlen(name) < sizeof(m.name)) {
+				m.index = index;
+				strcpy(m.name, name);
+				have_dev = 1;
+			}
+			continue;
+		}
+
+		if (!have_dev)
+			continue;
+
+		if (filter.dev && strcmp(filter.dev, m.name))
+			continue;
+
+		if (sscanf(buf, "%08x%d", (__u32*)&m.addr.data, &m.users) != 2)
+			continue;
+
+		/* Allocate only once the entry is known to be kept. */
+		ma = malloc(sizeof(m));
+		if (ma == NULL)
+			break;
+		memcpy(ma, &m, sizeof(m));
+		maddr_ins(result_p, ma);
+	}
+	fclose(fp);
+}
+
+
+/*
+ * Read IPv6 multicast memberships from /proc/net/igmp6 and insert them
+ * into *result_p, honouring filter.dev.
+ *
+ * Lines that do not parse, whose device name does not fit into
+ * ma_info.name, or whose address is malformed or longer than
+ * inet_prefix.data are skipped.  Reading stops if memory runs out.
+ * A missing /proc file is silently ignored.
+ */
+void read_igmp6(struct ma_info **result_p)
+{
+	char buf[256];
+	FILE *fp = fopen("/proc/net/igmp6", "r");
+
+	if (!fp)
+		return;
+
+	while (fgets(buf, sizeof(buf), fp)) {
+		char hexa[256];
+		char name[256];	/* same size as buf, so %s cannot overflow it */
+		struct ma_info m;
+		struct ma_info *ma;
+		int len;
+
+		memset(&m, 0, sizeof(m));
+		if (sscanf(buf, "%d%s%s%d", &m.index, name, hexa, &m.users) != 4)
+			continue;
+		if (strlen(name) >= sizeof(m.name))
+			continue;
+		strcpy(m.name, name);
+
+		if (filter.dev && strcmp(filter.dev, m.name))
+			continue;
+
+		/* parse_hex() writes strlen/2 bytes; make sure they fit. */
+		if (strlen(hexa) > 2*sizeof(m.addr.data))
+			continue;
+
+		m.addr.family = AF_INET6;
+
+		len = parse_hex(hexa, (unsigned char*)&m.addr.data);
+		if (len < 0)
+			continue;
+
+		ma = malloc(sizeof(m));
+		if (ma == NULL)
+			break;
+
+		memcpy(ma, &m, sizeof(m));
+
+		ma->addr.bytelen = len;
+		ma->addr.bitlen = len<<3;
+		maddr_ins(result_p, ma);
+	}
+	fclose(fp);
+}
+
+/*
+ * Print one multicast entry as a tab-indented line: the family keyword
+ * ("link", "inet", "inet6" or "family N"), the formatted address, the
+ * user count when it is not 1, and the feature string if any.
+ */
+static void print_maddr(FILE *fp, struct ma_info *list)
+{
+	int family = list->addr.family;
+
+	fprintf(fp, "\t");
+
+	if (family == AF_PACKET) {
+		SPRINT_BUF(b1);
+		fprintf(fp, "link %s", ll_addr_n2a((unsigned char*)list->addr.data,
+						    list->addr.bytelen, 0,
+						    b1, sizeof(b1)));
+	} else {
+		char abuf[256];
+
+		if (family == AF_INET)
+			fprintf(fp, "inet ");
+		else if (family == AF_INET6)
+			fprintf(fp, "inet6 ");
+		else
+			fprintf(fp, "family %d ", family);
+
+		fprintf(fp, "%s",
+			format_host(family, -1, list->addr.data,
+				    abuf, sizeof(abuf)));
+	}
+
+	if (list->users != 1)
+		fprintf(fp, " users %d", list->users);
+	if (list->features)
+		fprintf(fp, " %s", list->features);
+	fprintf(fp, "\n");
+}
+
+/*
+ * Print the whole list.  In one-line mode every entry is prefixed with
+ * its device; otherwise a device heading is printed each time the
+ * interface index changes from the previous entry.
+ */
+static void print_mlist(FILE *fp, struct ma_info *list)
+{
+	struct ma_info *m;
+	int last_index = 0;
+
+	for (m = list; m != NULL; m = m->next) {
+		int changed = (m->index != last_index);
+
+		last_index = m->index;
+		if (oneline)
+			fprintf(fp, "%d:\t%s%s", last_index, m->name, _SL_);
+		else if (changed)
+			fprintf(fp, "%d:\t%s\n", last_index, m->name);
+
+		print_maddr(fp, m);
+	}
+}
+
+/*
+ * "ip maddr show": every argument (optionally introduced by "dev") is
+ * the device to filter on; "help" prints usage.  Reads the /proc
+ * sources selected by the address family and prints the merged list.
+ * Always returns 0.
+ */
+static int multiaddr_list(int argc, char **argv)
+{
+	struct ma_info *list = NULL;
+	int fam;
+
+	if (!filter.family)
+		filter.family = preferred_family;
+
+	for (; argc > 0; argv++, argc--) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+		}
+		if (matches(*argv, "help") == 0)
+			usage();
+		if (filter.dev)
+			duparg2("dev", *argv);
+		filter.dev = *argv;
+	}
+
+	/* Family 0 means "all sources". */
+	fam = filter.family;
+	if (fam == 0 || fam == AF_PACKET)
+		read_dev_mcast(&list);
+	if (fam == 0 || fam == AF_INET)
+		read_igmp(&list);
+	if (fam == 0 || fam == AF_INET6)
+		read_igmp6(&list);
+
+	print_mlist(stdout, list);
+	return 0;
+}
+
+/*
+ * Add (cmd == RTM_NEWADDR) or delete (any other cmd) a link-layer
+ * multicast address on a device using SIOCADDMULTI / SIOCDELMULTI.
+ *
+ * Does not return: exits with 0 on success and with a non-zero status
+ * on any error.
+ */
+int multiaddr_modify(int cmd, int argc, char **argv)
+{
+	struct ifreq ifr;
+	int fd;
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	if (cmd == RTM_NEWADDR)
+		cmd = SIOCADDMULTI;
+	else
+		cmd = SIOCDELMULTI;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (ifr.ifr_name[0])
+				duparg("dev", *argv);
+			/*
+			 * Reject names that do not fit instead of truncating them
+			 * or leaving ifr_name without a terminating NUL.
+			 */
+			if (strlen(*argv) >= sizeof(ifr.ifr_name)) {
+				fprintf(stderr, "Error: device name \"%s\" is too long.\n", *argv);
+				exit(1);
+			}
+			strcpy(ifr.ifr_name, *argv);
+		} else {
+			if (matches(*argv, "address") == 0) {
+				NEXT_ARG();
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (ifr.ifr_hwaddr.sa_data[0])
+				duparg("address", *argv);
+			if (ll_addr_a2n(ifr.ifr_hwaddr.sa_data, 14, *argv) < 0) {
+				fprintf(stderr, "Error: \"%s\" is not a legal ll address.\n", *argv);
+				exit(1);
+			}
+		}
+		argc--; argv++;
+	}
+	if (ifr.ifr_name[0] == 0) {
+		fprintf(stderr, "Not enough information: \"dev\" is required.\n");
+		exit(-1);
+	}
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		perror("Cannot create socket");
+		exit(1);
+	}
+	if (ioctl(fd, cmd, (char*)&ifr) != 0) {
+		perror("ioctl");
+		exit(1);
+	}
+	close(fd);
+
+	exit(0);
+}
+
+
+/*
+ * "ip maddr" dispatcher: no arguments lists addresses; otherwise the
+ * first word selects add, delete, list/show/lst or help.  An unknown
+ * word prints an error and exits with -1.
+ */
+int do_multiaddr(int argc, char **argv)
+{
+	char *cmd;
+	int nargs;
+	char **args;
+
+	if (argc < 1)
+		return multiaddr_list(0, NULL);
+
+	cmd = *argv;
+	nargs = argc-1;
+	args = argv+1;
+
+	if (matches(cmd, "add") == 0)
+		return multiaddr_modify(RTM_NEWADDR, nargs, args);
+	if (matches(cmd, "delete") == 0)
+		return multiaddr_modify(RTM_DELADDR, nargs, args);
+	if (matches(cmd, "list") == 0 ||
+	    matches(cmd, "show") == 0 ||
+	    matches(cmd, "lst") == 0)
+		return multiaddr_list(nargs, args);
+	if (matches(cmd, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip maddr help\".\n", cmd);
+	exit(-1);
+}
diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index e69de29b..9ed6bbaf 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c
@@ -0,0 +1,152 @@
+/*
+ * ipmonitor.c "ip monitor".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <time.h>
+
+#include "utils.h"
+#include "ip_common.h"
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the "ip monitor" usage line to stderr and exit with status -1. */
+static void usage(void)
+{
+	fputs("Usage: ip monitor [ all | LISTofOBJECTS ]\n", stderr);
+	exit(-1);
+}
+
+
+/*
+ * Monitor callback: print one netlink message to the FILE passed in
+ * 'arg', dispatching on the message type.  Traffic-control messages and
+ * the NLMSG_ERROR/NOOP/DONE control messages are ignored; any other
+ * unrecognised type is reported.  Always returns 0.
+ */
+int accept_msg(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+
+	switch (n->nlmsg_type) {
+	case RTM_NEWROUTE:
+	case RTM_DELROUTE:
+		print_route(who, n, arg);
+		break;
+
+	case RTM_NEWLINK:
+	case RTM_DELLINK:
+		ll_remember_index(who, n, NULL);
+		print_linkinfo(who, n, arg);
+		break;
+
+	case RTM_NEWADDR:
+	case RTM_DELADDR:
+		print_addrinfo(who, n, arg);
+		break;
+
+	case RTM_NEWNEIGH:
+	case RTM_DELNEIGH:
+		print_neigh(who, n, arg);
+		break;
+
+	case 15: {
+		/* Payload is two 32-bit words: seconds, then microseconds. */
+		char *tstr;
+		time_t secs = ((__u32*)NLMSG_DATA(n))[0];
+		long usecs = ((__u32*)NLMSG_DATA(n))[1];
+
+		tstr = asctime(localtime(&secs));
+		/* Drop the last character of the asctime() string. */
+		tstr[strlen(tstr)-1] = 0;
+		fprintf(fp, "Timestamp: %s %lu us\n", tstr, usecs);
+		break;
+	}
+
+	case RTM_NEWQDISC:
+	case RTM_DELQDISC:
+	case RTM_NEWTCLASS:
+	case RTM_DELTCLASS:
+	case RTM_NEWTFILTER:
+	case RTM_DELTFILTER:
+		break;
+
+	case NLMSG_ERROR:
+	case NLMSG_NOOP:
+	case NLMSG_DONE:
+		break;
+
+	default:
+		fprintf(fp, "Unknown message: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * "ip monitor": choose the rtnetlink groups to listen to from the
+ * arguments, then print every message received.  With "file NAME" the
+ * messages are read from that file instead and the result of
+ * rtnl_from_file() is returned; the live path ends in exit().
+ */
+int do_ipmonitor(int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	unsigned groups = ~RTMGRP_TC;
+	char *file = NULL;
+	int want_link = 0;
+	int want_addr = 0;
+	int want_route = 0;
+	int use_v4, use_v6;
+
+	ipaddr_reset_filter(1);
+	iproute_reset_filter();
+	ipneigh_reset_filter();
+
+	for (; argc > 0; argc--, argv++) {
+		if (matches(*argv, "file") == 0) {
+			NEXT_ARG();
+			file = *argv;
+		} else if (matches(*argv, "link") == 0) {
+			want_link = 1;
+			groups = 0;
+		} else if (matches(*argv, "address") == 0) {
+			want_addr = 1;
+			groups = 0;
+		} else if (matches(*argv, "route") == 0) {
+			want_route = 1;
+			groups = 0;
+		} else if (strcmp(*argv, "all") == 0) {
+			groups = ~RTMGRP_TC;
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			fprintf(stderr, "Argument \"%s\" is unknown, try \"ip monitor help\".\n", *argv);
+			exit(-1);
+		}
+	}
+
+	/* No preferred family means both IPv4 and IPv6 groups are added. */
+	use_v4 = (!preferred_family || preferred_family == AF_INET);
+	use_v6 = (!preferred_family || preferred_family == AF_INET6);
+
+	if (want_link)
+		groups |= RTMGRP_LINK;
+	if (want_addr && use_v4)
+		groups |= RTMGRP_IPV4_IFADDR;
+	if (want_addr && use_v6)
+		groups |= RTMGRP_IPV6_IFADDR;
+	if (want_route && use_v4)
+		groups |= RTMGRP_IPV4_ROUTE;
+	if (want_route && use_v6)
+		groups |= RTMGRP_IPV6_ROUTE;
+
+	if (file) {
+		FILE *fp = fopen(file, "r");
+
+		if (fp == NULL) {
+			perror("Cannot fopen");
+			exit(-1);
+		}
+		return rtnl_from_file(fp, accept_msg, (void*)stdout);
+	}
+
+	if (rtnl_open(&rth, groups) < 0)
+		exit(1);
+
+	ll_init_map(&rth);
+
+	if (rtnl_listen(&rth, accept_msg, (void*)stdout) < 0)
+		exit(2);
+
+	exit(0);
+}
diff --git a/ip/ipmroute.c b/ip/ipmroute.c
index e69de29b..01e876bc 100644
--- a/ip/ipmroute.c
+++ b/ip/ipmroute.c
@@ -0,0 +1,204 @@
+/*
+ * ipmroute.c "ip mroute".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+
+/* Incoming-interface name to match; set from the "iif" argument, empty means no filter. */
+char filter_dev[16];
+/* NOTE(review): not referenced anywhere else in ipmroute.c as added here. */
+int filter_family;
+
+static void usage(void) __attribute__((noreturn));
+
+/*
+ * Print the "ip mroute" usage text to stderr and exit with status -1.
+ * The add/del usage line is compiled out.
+ */
+static void usage(void)
+{
+ fprintf(stderr, "Usage: ip mroute show [ PREFIX ] [ from PREFIX ] [ iif DEVICE ]\n");
+#if 0
+ fprintf(stderr, "Usage: ip mroute [ add | del ] DESTINATION from SOURCE [ iif DEVICE ] [ oif DEVICE ]\n");
+#endif
+ exit(-1);
+}
+
+/* Device name per virtual interface index (0..31), filled by read_viftable(); NULL when unknown. */
+char *viftable[32];
+
+/* Listing filter: multicast destination prefix and source prefix set by mroute_list(). */
+struct rtfilter
+{
+ inet_prefix mdst;
+ inet_prefix msrc;
+} filter;
+
+/*
+ * Fill viftable[] from /proc/net/ip_mr_vif.  The first line is
+ * discarded; each following line supplies a vif index and device name.
+ * Lines that do not parse or whose index is outside 0..31 are ignored,
+ * as is a missing /proc file.
+ */
+void read_viftable(void)
+{
+	char line[256];
+	FILE *fp;
+
+	fp = fopen("/proc/net/ip_mr_vif", "r");
+	if (fp == NULL)
+		return;
+
+	fgets(line, sizeof(line), fp);
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		char dev[256];
+		int vifi;
+
+		if (sscanf(line, "%d%s", &vifi, dev) >= 2 &&
+		    vifi >= 0 && vifi <= 31)
+			viftable[vifi] = strdup(dev);
+	}
+	fclose(fp);
+}
+
+/*
+ * Print the multicast routing cache from /proc/net/ip_mr_cache to
+ * 'ofp', applying the iif, destination and source filters.
+ *
+ * The first line is discarded.  Each entry is printed as
+ * "(source, group) Iif: DEV Oifs: DEV[(ttl N)] ...", followed by
+ * packet/byte counters when show_stats is set.  Lines that do not
+ * parse or carry an out-of-range vif index are skipped; a missing
+ * /proc file is silently ignored.
+ */
+void read_mroute_list(FILE *ofp)
+{
+	char buf[256];
+	FILE *fp = fopen("/proc/net/ip_mr_cache", "r");
+
+	if (!fp)
+		return;
+
+	fgets(buf, sizeof(buf), fp);
+
+	while (fgets(buf, sizeof(buf), fp)) {
+		inet_prefix maddr, msrc;
+		unsigned pkts, b, w;
+		int vifi;
+		char oiflist[256];
+		char sbuf[256];
+		char mbuf[256];
+		char obuf[256];
+
+		/* Only data[0] is filled by sscanf(); keep the rest defined. */
+		memset(&maddr, 0, sizeof(maddr));
+		memset(&msrc, 0, sizeof(msrc));
+
+		oiflist[0] = 0;
+		/*
+		 * The trailing "%[^\n]" captures the whole remainder of the line,
+		 * so every "vif:ttl" pair ends up in oiflist, not just the first.
+		 */
+		if (sscanf(buf, "%x%x%d%u%u%u %[^\n]", maddr.data, msrc.data, &vifi,
+			   &pkts, &b, &w, oiflist) < 6)
+			continue;
+
+		if (vifi!=-1 && (vifi < 0 || vifi>31))
+			continue;
+
+		/* A vif with no known name can never match the iif filter. */
+		if (filter_dev[0] &&
+		    (vifi<0 || !viftable[vifi] || strcmp(filter_dev, viftable[vifi])))
+			continue;
+		if (filter.mdst.family && inet_addr_match(&maddr, &filter.mdst, filter.mdst.bitlen))
+			continue;
+		if (filter.msrc.family && inet_addr_match(&msrc, &filter.msrc, filter.msrc.bitlen))
+			continue;
+
+		snprintf(obuf, sizeof(obuf), "(%s, %s)",
+			 format_host(AF_INET, 4, &msrc.data[0], sbuf, sizeof(sbuf)),
+			 format_host(AF_INET, 4, &maddr.data[0], mbuf, sizeof(mbuf)));
+
+		fprintf(ofp, "%-32s Iif: ", obuf);
+
+		if (vifi == -1)
+			fprintf(ofp, "unresolved ");
+		else
+			fprintf(ofp, "%-10s ",
+				viftable[vifi] ? viftable[vifi] : "(unknown)");
+
+		if (oiflist[0]) {
+			char *next = NULL;
+			char *p = oiflist;
+			int ovifi, ottl;
+
+			fprintf(ofp, "Oifs: ");
+
+			while (p) {
+				next = strchr(p, ' ');
+				if (next) {
+					*next = 0;
+					next++;
+				}
+				/* Skip empty tokens, malformed pairs and out-of-range vifs. */
+				if (sscanf(p, "%d:%d", &ovifi, &ottl)<2 ||
+				    ovifi < 0 || ovifi > 31) {
+					p = next;
+					continue;
+				}
+				p = next;
+
+				fprintf(ofp, "%s",
+					viftable[ovifi] ? viftable[ovifi] : "(unknown)");
+				/* Print the TTL threshold, not the vif index. */
+				if (ottl>1)
+					fprintf(ofp, "(ttl %d) ", ottl);
+				else
+					fprintf(ofp, " ");
+			}
+		}
+
+		if (show_stats && b) {
+			fprintf(ofp, "%s %u packets, %u bytes", _SL_, pkts, b);
+			if (w)
+				fprintf(ofp, ", %u arrived on wrong iif.", w);
+		}
+		fprintf(ofp, "\n");
+	}
+	fclose(fp);
+}
+
+
+/*
+ * "ip mroute show": parse "iif DEV", "from PREFIX" and an optional
+ * "to"-introduced destination prefix, then load the vif table and
+ * print the route cache.  Always returns 0.
+ */
+static int mroute_list(int argc, char **argv)
+{
+	for (; argc > 0; argv++, argc--) {
+		if (strcmp(*argv, "iif") == 0) {
+			NEXT_ARG();
+			strncpy(filter_dev, *argv, sizeof(filter_dev)-1);
+			continue;
+		}
+		if (matches(*argv, "from") == 0) {
+			NEXT_ARG();
+			get_prefix(&filter.msrc, *argv, AF_INET);
+			continue;
+		}
+
+		/* Anything else is the destination prefix. */
+		if (strcmp(*argv, "to") == 0) {
+			NEXT_ARG();
+		}
+		if (matches(*argv, "help") == 0)
+			usage();
+		get_prefix(&filter.mdst, *argv, AF_INET);
+	}
+
+	read_viftable();
+	read_mroute_list(stdout);
+	return 0;
+}
+
+/*
+ * "ip mroute" dispatcher: no arguments lists routes; otherwise the
+ * first word selects list/show/lst or help.  The add/delete/get
+ * commands are compiled out.  An unknown word prints an error and
+ * exits with -1.
+ */
+int do_multiroute(int argc, char **argv)
+{
+	char *cmd;
+
+	if (argc < 1)
+		return mroute_list(0, NULL);
+
+	cmd = *argv;
+#if 0
+	if (matches(*argv, "add") == 0)
+		return mroute_modify(RTM_NEWADDR, argc-1, argv+1);
+	if (matches(*argv, "delete") == 0)
+		return mroute_modify(RTM_DELADDR, argc-1, argv+1);
+	if (matches(*argv, "get") == 0)
+		return mroute_get(argc-1, argv+1);
+#endif
+	if (matches(cmd, "list") == 0 ||
+	    matches(cmd, "show") == 0 ||
+	    matches(cmd, "lst") == 0)
+		return mroute_list(argc-1, argv+1);
+	if (matches(cmd, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip mroute help\".\n", cmd);
+	exit(-1);
+}
diff --git a/ip/ipneigh.c b/ip/ipneigh.c
index e69de29b..f8c27900 100644
--- a/ip/ipneigh.c
+++ b/ip/ipneigh.c
@@ -0,0 +1,484 @@
+/*
+ * ipneigh.c "ip neigh".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *
+ * Changes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include <net/if.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+
+#define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY)
+
+/*
+ * Selection criteria and flush bookkeeping shared by do_show_or_flush()
+ * and the print_neigh() dump callback.
+ */
+static struct
+{
+ /* address family to keep; 0 accepts every family */
+ int family;
+ /* interface index to keep; 0 accepts every interface */
+ int index;
+ /* bitmask of accepted NUD_* states; bit 0x100 stands for "state 0" */
+ int state;
+ /* non-zero: skip entries whose cacheinfo reports a non-zero refcnt */
+ int unused_only;
+ /* destination prefix to compare against; unused while pfx.family is 0 */
+ inet_prefix pfx;
+ /* number of delete requests queued during the current flush round */
+ int flushed;
+ /* buffer collecting delete requests; non-NULL switches on flush mode */
+ char *flushb;
+ /* number of bytes currently used in flushb */
+ int flushp;
+ /* capacity of flushb in bytes */
+ int flushe;
+ /* rtnetlink handle used to send the queued requests */
+ struct rtnl_handle *rth;
+} filter;
+
+static void usage(void) __attribute__((noreturn));
+
+/*
+ * Print the "ip neigh" synopsis on stderr and terminate with exit(-1).
+ */
+static void usage(void)
+{
+	static const char synopsis[] =
+		"Usage: ip neigh { add | del | change | replace } { ADDR [ lladdr LLADDR ]\n"
+		" [ nud { permanent | noarp | stale | reachable } ]\n"
+		" | proxy ADDR } [ dev DEV ]\n"
+		" ip neigh {show|flush} [ to PREFIX ] [ dev DEV ] [ nud STATE ]\n";
+
+	fputs(synopsis, stderr);
+	exit(-1);
+}
+
+/*
+ * Translate a neighbour state given on the command line into its NUD_*
+ * value.
+ *
+ * state - receives the numeric state
+ * arg   - state name, or a number understood by get_unsigned()
+ *
+ * A numeric argument is accepted only if it is below 0x100 and has at most
+ * one bit set.  Returns 0 on success, -1 if arg is not a valid state.
+ */
+int nud_state_a2n(unsigned *state, char *arg)
+{
+	/* Names are tried strictly in this order.  Entries flagged "abbrev"
+	 * are compared with matches(), the others with strcmp().
+	 */
+	static const struct {
+		char *name;
+		unsigned value;
+		int abbrev;
+	} names[] = {
+		{ "permanent",	NUD_PERMANENT,	1 },
+		{ "reachable",	NUD_REACHABLE,	1 },
+		{ "noarp",	NUD_NOARP,	0 },
+		{ "none",	NUD_NONE,	0 },
+		{ "stale",	NUD_STALE,	0 },
+		{ "incomplete",	NUD_INCOMPLETE,	0 },
+		{ "delay",	NUD_DELAY,	0 },
+		{ "probe",	NUD_PROBE,	0 },
+		{ "failed",	NUD_FAILED,	1 },
+	};
+	unsigned i;
+
+	for (i = 0; i < sizeof(names)/sizeof(names[0]); i++) {
+		int differs;
+
+		if (names[i].abbrev)
+			differs = matches(arg, names[i].name);
+		else
+			differs = strcmp(arg, names[i].name);
+		if (differs == 0) {
+			*state = names[i].value;
+			return 0;
+		}
+	}
+
+	/* Not a known name: fall back to a raw number. */
+	if (get_unsigned(state, arg, 0))
+		return -1;
+	if (*state>=0x100 || (*state&((*state)-1)))
+		return -1;
+	return 0;
+}
+
+/*
+ * Return the printable name of a neighbour state.
+ *
+ * state - NUD_* value to name
+ * buf   - scratch buffer, used only for values without a name
+ * len   - size of buf
+ *
+ * Known states yield a pointer to a constant string; any other value is
+ * formatted as hexadecimal into buf and buf is returned.
+ */
+char * nud_state_n2a(__u8 state, char *buf, int len)
+{
+	static const struct {
+		int value;
+		char *name;
+	} names[] = {
+		{ NUD_NONE,		"none" },
+		{ NUD_INCOMPLETE,	"incomplete" },
+		{ NUD_REACHABLE,	"reachable" },
+		{ NUD_STALE,		"stale" },
+		{ NUD_DELAY,		"delay" },
+		{ NUD_PROBE,		"probe" },
+		{ NUD_FAILED,		"failed" },
+		{ NUD_NOARP,		"noarp" },
+		{ NUD_PERMANENT,	"permanent" },
+	};
+	unsigned i;
+
+	for (i = 0; i < sizeof(names)/sizeof(names[0]); i++) {
+		if (state == names[i].value)
+			return names[i].name;
+	}
+
+	snprintf(buf, len, "%x", state);
+	return buf;
+}
+
+/*
+ * Send the delete requests accumulated in filter.flushb (filter.flushp
+ * bytes) through filter.rth and mark the buffer empty again.
+ *
+ * Returns 0 on success, -1 if rtnl_send() fails; on failure the buffer
+ * content is left untouched.
+ */
+static int flush_update(void)
+{
+	if (rtnl_send(filter.rth, filter.flushb, filter.flushp) < 0) {
+		/* No trailing newline here: perror() appends ": <error>\n"
+		 * itself, so a "\n" in the message would split the line.
+		 */
+		perror("Failed to send flush request");
+		return -1;
+	}
+	filter.flushp = 0;
+	return 0;
+}
+
+
+/*
+ * Build and send one neighbour-table request
+ * ("ip neigh add | change | replace | delete").
+ *
+ * cmd   - value stored in nlmsg_type (callers in this file pass
+ *         RTM_NEWNEIGH or RTM_DELNEIGH)
+ * flags - NLM_F_* flags OR-ed with NLM_F_REQUEST
+ * argc, argv - remaining command-line words
+ *
+ * Both a device ("dev DEV") and a destination address are mandatory.
+ * The entry state defaults to NUD_PERMANENT unless "nud STATE" is given.
+ *
+ * The function does not return on success: it calls exit(0).  It exits
+ * with 1 if the rtnetlink socket cannot be opened, with 2 if the request
+ * fails, and with -1 on missing arguments.  It returns -1 when the device
+ * is unknown or the link-layer address cannot be converted.
+ */
+static int ipneigh_modify(int cmd, int flags, int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct {
+		struct nlmsghdr 	n;
+		struct ndmsg 		ndm;
+		char   			buf[256];
+	} req;
+	char  *d = NULL;
+	int dst_ok = 0;
+	int lladdr_ok = 0;
+	char * lla = NULL;
+	inet_prefix dst;
+
+	memset(&req, 0, sizeof(req));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+	req.n.nlmsg_type = cmd;
+	req.ndm.ndm_family = preferred_family;
+	req.ndm.ndm_state = NUD_PERMANENT;
+
+	while (argc > 0) {
+		if (matches(*argv, "lladdr") == 0) {
+			NEXT_ARG();
+			if (lladdr_ok)
+				duparg("lladdr", *argv);
+			lla = *argv;
+			lladdr_ok = 1;
+		} else if (strcmp(*argv, "nud") == 0) {
+			unsigned state;
+			NEXT_ARG();
+			if (nud_state_a2n(&state, *argv))
+				invarg("nud state is bad", *argv);
+			req.ndm.ndm_state = state;
+		} else if (matches(*argv, "proxy") == 0) {
+			NEXT_ARG();
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (dst_ok)
+				duparg("address", *argv);
+			get_addr(&dst, *argv, preferred_family);
+			dst_ok = 1;
+			req.ndm.ndm_flags |= NTF_PROXY;
+		} else if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			d = *argv;
+		} else {
+			if (strcmp(*argv, "to") == 0) {
+				NEXT_ARG();
+			}
+			/* "help" prints the synopsis, as in the "proxy"
+			 * branch above and in the other parsers of this
+			 * file; it used to swallow the next word instead.
+			 */
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (dst_ok)
+				duparg2("to", *argv);
+			get_addr(&dst, *argv, preferred_family);
+			dst_ok = 1;
+		}
+		argc--; argv++;
+	}
+	if (d == NULL || !dst_ok || dst.family == AF_UNSPEC) {
+		fprintf(stderr, "Device and destination are required arguments.\n");
+		exit(-1);
+	}
+	req.ndm.ndm_family = dst.family;
+	addattr_l(&req.n, sizeof(req), NDA_DST, &dst.data, dst.bytelen);
+
+	/* "lladdr null" means: send no link-layer address at all. */
+	if (lla && strcmp(lla, "null")) {
+		__u8 llabuf[16];
+		int l;
+
+		l = ll_addr_a2n(llabuf, sizeof(llabuf), lla);
+		/* A negative result must never be used as an attribute
+		 * length (presumably it signals a parse error - the helper
+		 * is defined elsewhere).
+		 */
+		if (l < 0)
+			return -1;
+		addattr_l(&req.n, sizeof(req), NDA_LLADDR, llabuf, l);
+	}
+
+	if (rtnl_open(&rth, 0) < 0)
+		exit(1);
+
+	ll_init_map(&rth);
+
+	if ((req.ndm.ndm_ifindex = ll_name_to_index(d)) == 0) {
+		fprintf(stderr, "Cannot find device \"%s\"\n", d);
+		return -1;
+	}
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+		exit(2);
+
+	exit(0);
+}
+
+
+/*
+ * Dump callback for neighbour messages.
+ *
+ * who - sender address (not used here)
+ * n   - received netlink message
+ * arg - FILE * to print to
+ *
+ * The message is checked against the file-level filter.  In flush mode
+ * (filter.flushb set) a matching entry is copied into the flush buffer as
+ * an RTM_DELNEIGH request and is printed only when show_stats >= 2;
+ * otherwise a matching entry is printed as one line on the given stream.
+ *
+ * Returns 0 when the message was handled or skipped, -1 when the message
+ * length is inconsistent or a full flush buffer could not be sent.
+ */
+int print_neigh(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct ndmsg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[NDA_MAX+1];
+	char abuf[256];
+
+	if (n->nlmsg_type != RTM_NEWNEIGH && n->nlmsg_type != RTM_DELNEIGH) {
+		fprintf(stderr, "Not RTM_NEWNEIGH: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+
+		return 0;
+	}
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	/* Only existing entries can be turned into delete requests. */
+	if (filter.flushb && n->nlmsg_type != RTM_NEWNEIGH)
+		return 0;
+
+	if (filter.family && filter.family != r->ndm_family)
+		return 0;
+	if (filter.index && filter.index != r->ndm_ifindex)
+		return 0;
+	/* State filter: bit 0x100 in filter.state selects entries whose
+	 * state is 0; DECnet entries are never filtered by state.
+	 */
+	if (!(filter.state&r->ndm_state) &&
+	    (r->ndm_state || !(filter.state&0x100)) &&
+	    (r->ndm_family != AF_DECnet))
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, NDA_MAX, NDA_RTA(r), len);
+
+	if (tb[NDA_DST]) {
+		if (filter.pfx.family) {
+			inet_prefix dst;
+			memset(&dst, 0, sizeof(dst));
+			dst.family = r->ndm_family;
+			memcpy(&dst.data, RTA_DATA(tb[NDA_DST]), RTA_PAYLOAD(tb[NDA_DST]));
+			if (inet_addr_match(&dst, &filter.pfx, filter.pfx.bitlen))
+				return 0;
+		}
+	}
+	if (filter.unused_only && tb[NDA_CACHEINFO]) {
+		struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]);
+		if (ci->ndm_refcnt)
+			return 0;
+	}
+
+	if (filter.flushb) {
+		struct nlmsghdr *fn;
+		/* Send what is queued when this message would not fit. */
+		if (NLMSG_ALIGN(filter.flushp) + n->nlmsg_len > filter.flushe) {
+			if (flush_update())
+				return -1;
+		}
+		fn = (struct nlmsghdr*)(filter.flushb + NLMSG_ALIGN(filter.flushp));
+		memcpy(fn, n, n->nlmsg_len);
+		fn->nlmsg_type = RTM_DELNEIGH;
+		fn->nlmsg_flags = NLM_F_REQUEST;
+		fn->nlmsg_seq = ++filter.rth->seq;
+		filter.flushp = (((char*)fn) + n->nlmsg_len) - filter.flushb;
+		filter.flushed++;
+		if (show_stats < 2)
+			return 0;
+	}
+
+	if (tb[NDA_DST]) {
+		fprintf(fp, "%s ",
+			format_host(r->ndm_family,
+				    RTA_PAYLOAD(tb[NDA_DST]),
+				    RTA_DATA(tb[NDA_DST]),
+				    abuf, sizeof(abuf)));
+	}
+	if (!filter.index && r->ndm_ifindex)
+		fprintf(fp, "dev %s ", ll_index_to_name(r->ndm_ifindex));
+	if (tb[NDA_LLADDR]) {
+		SPRINT_BUF(b1);
+		fprintf(fp, "lladdr %s", ll_addr_n2a(RTA_DATA(tb[NDA_LLADDR]),
+						     RTA_PAYLOAD(tb[NDA_LLADDR]),
+						     ll_index_to_type(r->ndm_ifindex),
+						     b1, sizeof(b1)));
+	}
+	if (r->ndm_flags & NTF_ROUTER) {
+		fprintf(fp, " router");
+	}
+	if (tb[NDA_CACHEINFO] && show_stats) {
+		static int hz;
+		struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]);
+		if (!hz)
+			hz = get_hz();
+		/* Written to fp like the rest of the line; this used to go
+		 * to stdout through printf().
+		 */
+		if (ci->ndm_refcnt)
+			fprintf(fp, " ref %d", ci->ndm_refcnt);
+		fprintf(fp, " used %d/%d/%d", ci->ndm_used/hz,
+			ci->ndm_confirmed/hz, ci->ndm_updated/hz);
+	}
+
+	if (r->ndm_state) {
+		SPRINT_BUF(b1);
+		fprintf(fp, " nud %s", nud_state_n2a(r->ndm_state, b1, sizeof(b1)));
+	}
+	fprintf(fp, "\n");
+
+	fflush(fp);
+	return 0;
+}
+
+/*
+ * Put the file-level neighbour filter back into its initial condition:
+ * every field zeroed, every bit of the state mask set.
+ */
+void ipneigh_reset_filter()
+{
+	memset(&filter, 0, sizeof filter);
+	filter.state = ~0;
+}
+
+/*
+ * Implements "ip neigh show" (flush == 0) and "ip neigh flush" (flush != 0).
+ *
+ * Resets the file-level filter, fills it from the command line, opens an
+ * rtnetlink handle and dumps the neighbour table through print_neigh().
+ * In flush mode the dump is repeated, each round sending the delete
+ * requests that print_neigh() queued, until a round queues nothing.
+ *
+ * Returns 0 on success and -1 when flush is called without arguments or
+ * the named device is unknown; netlink failures terminate with exit(1).
+ */
+int do_show_or_flush(int argc, char **argv, int flush)
+{
+ char *filter_dev = NULL;
+ struct rtnl_handle rth;
+ int state_given = 0;
+
+ ipneigh_reset_filter();
+
+ /* filter.family was just zeroed by the reset, so this always applies */
+ if (!filter.family)
+ filter.family = preferred_family;
+
+ if (flush) {
+ if (argc <= 0) {
+ fprintf(stderr, "Flush requires arguments.\n");
+ return -1;
+ }
+ /* default for flush: every state except permanent and noarp */
+ filter.state = ~(NUD_PERMANENT|NUD_NOARP);
+ } else
+ /* default for show: the low eight state bits except noarp */
+ filter.state = 0xFF & ~NUD_NOARP;
+
+ while (argc > 0) {
+ if (strcmp(*argv, "dev") == 0) {
+ NEXT_ARG();
+ if (filter_dev)
+ duparg("dev", *argv);
+ filter_dev = *argv;
+ } else if (strcmp(*argv, "unused") == 0) {
+ filter.unused_only = 1;
+ } else if (strcmp(*argv, "nud") == 0) {
+ unsigned state;
+ NEXT_ARG();
+ /* the first "nud" replaces the default mask, later ones add to it */
+ if (!state_given) {
+ state_given = 1;
+ filter.state = 0;
+ }
+ if (nud_state_a2n(&state, *argv)) {
+ /* "all" is the only word accepted besides real state names */
+ if (strcmp(*argv, "all") != 0)
+ invarg("nud state is bad", *argv);
+ state = ~0;
+ if (flush)
+ state &= ~NUD_NOARP;
+ }
+ /* state 0 is tracked as pseudo-bit 0x100, which print_neigh() tests */
+ if (state == 0)
+ state = 0x100;
+ filter.state |= state;
+ } else {
+ if (strcmp(*argv, "to") == 0) {
+ NEXT_ARG();
+ }
+ if (matches(*argv, "help") == 0)
+ usage();
+ get_prefix(&filter.pfx, *argv, filter.family);
+ if (filter.family == AF_UNSPEC)
+ filter.family = filter.pfx.family;
+ }
+ argc--; argv++;
+ }
+
+ if (rtnl_open(&rth, 0) < 0)
+ exit(1);
+
+ ll_init_map(&rth);
+
+ if (filter_dev) {
+ if ((filter.index = ll_name_to_index(filter_dev)) == 0) {
+ fprintf(stderr, "Cannot find device \"%s\"\n", filter_dev);
+ return -1;
+ }
+ }
+
+ if (flush) {
+ int round = 0;
+ char flushb[4096-512];
+
+ /* a non-NULL filter.flushb puts print_neigh() into flush mode */
+ filter.flushb = flushb;
+ filter.flushp = 0;
+ filter.flushe = sizeof(flushb);
+ filter.rth = &rth;
+ /* entries in state "failed" are never selected for flushing */
+ filter.state &= ~NUD_FAILED;
+
+ for (;;) {
+ if (rtnl_wilddump_request(&rth, filter.family, RTM_GETNEIGH) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+ filter.flushed = 0;
+ if (rtnl_dump_filter(&rth, print_neigh, stdout, NULL, NULL) < 0) {
+ fprintf(stderr, "Flush terminated\n");
+ exit(1);
+ }
+ /* a round that queued nothing ends the flush */
+ if (filter.flushed == 0) {
+ if (round == 0) {
+ fprintf(stderr, "Nothing to flush.\n");
+ } else if (show_stats)
+ printf("*** Flush is complete after %d round%s ***\n", round, round>1?"s":"");
+ fflush(stdout);
+ return 0;
+ }
+ round++;
+ /* send whatever is still queued from this round */
+ if (flush_update() < 0)
+ exit(1);
+ if (show_stats) {
+ printf("\n*** Round %d, deleting %d entries ***\n", round, filter.flushed);
+ fflush(stdout);
+ }
+ }
+ }
+
+ /* show mode: one dump, printed by print_neigh() */
+ if (rtnl_wilddump_request(&rth, filter.family, RTM_GETNEIGH) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+
+ if (rtnl_dump_filter(&rth, print_neigh, stdout, NULL, NULL) < 0) {
+ fprintf(stderr, "Dump terminated\n");
+ exit(1);
+ }
+
+ return 0;
+}
+
+/*
+ * Entry point for "ip neigh": dispatch on the first word.  Without any
+ * argument the neighbour table is shown.  An unknown sub-command prints
+ * an error and terminates the process with exit(-1).
+ */
+int do_ipneigh(int argc, char **argv)
+{
+	char *cmd;
+
+	if (argc <= 0)
+		return do_show_or_flush(0, NULL, 0);
+
+	cmd = *argv;
+	argc--;
+	argv++;
+
+	if (!matches(cmd, "add"))
+		return ipneigh_modify(RTM_NEWNEIGH, NLM_F_CREATE|NLM_F_EXCL, argc, argv);
+	if (!matches(cmd, "change") || !strcmp(cmd, "chg"))
+		return ipneigh_modify(RTM_NEWNEIGH, NLM_F_REPLACE, argc, argv);
+	if (!matches(cmd, "replace"))
+		return ipneigh_modify(RTM_NEWNEIGH, NLM_F_CREATE|NLM_F_REPLACE, argc, argv);
+	if (!matches(cmd, "delete"))
+		return ipneigh_modify(RTM_DELNEIGH, 0, argc, argv);
+	if (!matches(cmd, "get")) {
+		fprintf(stderr, "Sorry, \"neigh get\" is not implemented :-(\n");
+		return -1;
+	}
+	if (!matches(cmd, "show") || !matches(cmd, "lst") || !matches(cmd, "list"))
+		return do_show_or_flush(argc, argv, 0);
+	if (!matches(cmd, "flush"))
+		return do_show_or_flush(argc, argv, 1);
+	if (!matches(cmd, "help"))
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip neigh help\".\n", cmd);
+	exit(-1);
+}
diff --git a/ip/iproute.c b/ip/iproute.c
index e69de29b..404f8e0e 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -0,0 +1,1410 @@
+/*
+ * iproute.c "ip route".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *
+ * Changes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses
+ * Kunihiro Ishiguro <kunihiro@zebra.org> 001102: rtnh_ifindex was not initialized
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <arpa/inet.h>
+#include <linux/in_route.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+
+#ifndef RTAX_RTTVAR
+#define RTAX_RTTVAR RTAX_HOPS
+#endif
+
+
+static void usage(void) __attribute__((noreturn));
+
+/*
+ * Print the "ip route" synopsis on stderr and terminate with exit(-1).
+ */
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip route { list | flush } SELECTOR\n");
+	fprintf(stderr, " ip route get ADDRESS [ from ADDRESS iif STRING ]\n");
+	fprintf(stderr, " [ oif STRING ] [ tos TOS ]\n");
+	fprintf(stderr, " ip route { add | del | change | append | replace | monitor } ROUTE\n");
+	fprintf(stderr, "SELECTOR := [ root PREFIX ] [ match PREFIX ] [ exact PREFIX ]\n");
+	fprintf(stderr, " [ table TABLE_ID ] [ proto RTPROTO ]\n");
+	fprintf(stderr, " [ type TYPE ] [ scope SCOPE ]\n");
+	fprintf(stderr, "ROUTE := NODE_SPEC [ INFO_SPEC ]\n");
+	fprintf(stderr, "NODE_SPEC := [ TYPE ] PREFIX [ tos TOS ]\n");
+	fprintf(stderr, " [ table TABLE_ID ] [ proto RTPROTO ]\n");
+	fprintf(stderr, " [ scope SCOPE ] [ metric METRIC ]\n");
+	fprintf(stderr, "INFO_SPEC := NH OPTIONS FLAGS [ nexthop NH ]...\n");
+	fprintf(stderr, "NH := [ via ADDRESS ] [ dev STRING ] [ weight NUMBER ] NHFLAGS\n");
+	fprintf(stderr, "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ]\n");
+	fprintf(stderr, " [ rtt NUMBER ] [ rttvar NUMBER ]\n");
+	/* ssthresh is a numeric metric like window and cwnd, not a realm. */
+	fprintf(stderr, " [ window NUMBER ] [ cwnd NUMBER ] [ ssthresh NUMBER ]\n");
+	fprintf(stderr, " [ realms REALM ]\n");
+	fprintf(stderr, "TYPE := [ unicast | local | broadcast | multicast | throw |\n");
+	fprintf(stderr, " unreachable | prohibit | blackhole | nat ]\n");
+	fprintf(stderr, "TABLE_ID := [ local | main | default | all | NUMBER ]\n");
+	fprintf(stderr, "SCOPE := [ host | link | global | NUMBER ]\n");
+	fprintf(stderr, "FLAGS := [ equalize ]\n");
+	fprintf(stderr, "NHFLAGS := [ onlink | pervasive ]\n");
+	fprintf(stderr, "RTPROTO := [ kernel | boot | static | NUMBER ]\n");
+	exit(-1);
+}
+
+
+/*
+ * Selection criteria and flush bookkeeping consulted by print_route().
+ * For the value/mask pairs a route is skipped when
+ * (value ^ route field) & mask is non-zero.
+ */
+static struct
+{
+ /* table selector: for non-IPv6 routes a value > 0 must equal rtm_table;
+    for IPv6 a value < 0 keeps only RTM_F_CLONED routes and a value > 0
+    drops them; 0 disables the check */
+ int tb;
+ /* number of delete requests queued so far */
+ int flushed;
+ /* buffer collecting delete requests; non-NULL switches on flush mode */
+ char *flushb;
+ /* number of bytes currently used in flushb */
+ int flushp;
+ /* capacity of flushb in bytes */
+ int flushe;
+ /* rtnetlink handle used to send the queued requests */
+ struct rtnl_handle *rth;
+ /* compared with rtm_protocol */
+ int protocol, protocolmask;
+ /* compared with rtm_scope */
+ int scope, scopemask;
+ /* compared with rtm_type */
+ int type, typemask;
+ /* compared with rtm_tos */
+ int tos, tosmask;
+ /* compared with the RTA_IIF attribute (0 when absent) */
+ int iif, iifmask;
+ /* compared with the RTA_OIF attribute (0 when absent) */
+ int oif, oifmask;
+ /* compared with the RTA_FLOW attribute (0 when absent) */
+ int realm, realmmask;
+ /* compared with the RTA_PREFSRC address */
+ inet_prefix rprefsrc;
+ /* compared with the RTA_GATEWAY address */
+ inet_prefix rvia;
+ /* destination: route prefix must be at least rdst.bitlen bits long and
+    is compared over rdst.bitlen bits */
+ inet_prefix rdst;
+ /* destination: route prefix must not be longer than mdst.bitlen (when
+    that is >= 0) and is compared over rtm_dst_len bits */
+ inet_prefix mdst;
+ /* source counterpart of rdst, checked against rtm_src_len / RTA_SRC */
+ inet_prefix rsrc;
+ /* source counterpart of mdst, checked against rtm_src_len / RTA_SRC */
+ inet_prefix msrc;
+} filter;
+
+/*
+ * Send the delete requests accumulated in filter.flushb (filter.flushp
+ * bytes) through filter.rth and mark the buffer empty again.
+ *
+ * Returns 0 on success, -1 if rtnl_send() fails; on failure the buffer
+ * content is left untouched.
+ */
+static int flush_update(void)
+{
+	if (rtnl_send(filter.rth, filter.flushb, filter.flushp) < 0) {
+		/* No trailing newline here: perror() appends ": <error>\n"
+		 * itself, so a "\n" in the message would split the line.
+		 */
+		perror("Failed to send flush request");
+		return -1;
+	}
+	filter.flushp = 0;
+	return 0;
+}
+
+/*
+ * Dump callback for route messages.
+ *
+ * who - sender address (not used here)
+ * n   - received netlink message
+ * arg - FILE * to print to
+ *
+ * The route is checked against the file-level filter.  In flush mode
+ * (filter.flushb set) a matching route is copied into the flush buffer as
+ * an RTM_DELROUTE request and is printed only when show_stats >= 2;
+ * otherwise a matching route is printed as one record on the stream.
+ *
+ * Returns 0 when the message was handled or skipped, -1 when the message
+ * length is inconsistent or a full flush buffer could not be sent.
+ */
+int print_route(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct rtmsg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[RTA_MAX+1];
+	char abuf[256];
+	inet_prefix dst;
+	inet_prefix src;
+	inet_prefix prefsrc;
+	inet_prefix via;
+	int host_len = -1;	/* address length in BITS; -1 if family unknown */
+	SPRINT_BUF(b1);
+
+
+	if (n->nlmsg_type != RTM_NEWROUTE && n->nlmsg_type != RTM_DELROUTE) {
+		fprintf(stderr, "Not a route: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+		return 0;
+	}
+	/* Only existing routes can be turned into delete requests. */
+	if (filter.flushb && n->nlmsg_type != RTM_NEWROUTE)
+		return 0;
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	if (r->rtm_family == AF_INET6)
+		host_len = 128;
+	else if (r->rtm_family == AF_INET)
+		host_len = 32;
+	else if (r->rtm_family == AF_DECnet)
+		host_len = 16;
+	else if (r->rtm_family == AF_IPX)
+		host_len = 80;
+
+	if (r->rtm_family == AF_INET6) {
+		/* For IPv6 the table filter is emulated through the cloned
+		 * flag and the route type instead of rtm_table.
+		 */
+		if (filter.tb) {
+			if (filter.tb < 0) {
+				if (!(r->rtm_flags&RTM_F_CLONED))
+					return 0;
+			} else {
+				if (r->rtm_flags&RTM_F_CLONED)
+					return 0;
+				if (filter.tb == RT_TABLE_LOCAL) {
+					if (r->rtm_type != RTN_LOCAL)
+						return 0;
+				} else if (filter.tb == RT_TABLE_MAIN) {
+					if (r->rtm_type == RTN_LOCAL)
+						return 0;
+				} else {
+					return 0;
+				}
+			}
+		}
+	} else {
+		if (filter.tb > 0 && filter.tb != r->rtm_table)
+			return 0;
+	}
+	if ((filter.protocol^r->rtm_protocol)&filter.protocolmask)
+		return 0;
+	if ((filter.scope^r->rtm_scope)&filter.scopemask)
+		return 0;
+	if ((filter.type^r->rtm_type)&filter.typemask)
+		return 0;
+	if ((filter.tos^r->rtm_tos)&filter.tosmask)
+		return 0;
+	if (filter.rdst.family &&
+	    (r->rtm_family != filter.rdst.family || filter.rdst.bitlen > r->rtm_dst_len))
+		return 0;
+	if (filter.mdst.family &&
+	    (r->rtm_family != filter.mdst.family ||
+	     (filter.mdst.bitlen >= 0 && filter.mdst.bitlen < r->rtm_dst_len)))
+		return 0;
+	if (filter.rsrc.family &&
+	    (r->rtm_family != filter.rsrc.family || filter.rsrc.bitlen > r->rtm_src_len))
+		return 0;
+	if (filter.msrc.family &&
+	    (r->rtm_family != filter.msrc.family ||
+	     (filter.msrc.bitlen >= 0 && filter.msrc.bitlen < r->rtm_src_len)))
+		return 0;
+	if (filter.rvia.family && r->rtm_family != filter.rvia.family)
+		return 0;
+	if (filter.rprefsrc.family && r->rtm_family != filter.rprefsrc.family)
+		return 0;
+
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len);
+
+	memset(&dst, 0, sizeof(dst));
+	dst.family = r->rtm_family;
+	if (tb[RTA_DST])
+		memcpy(&dst.data, RTA_DATA(tb[RTA_DST]), (r->rtm_dst_len+7)/8);
+	if (filter.rsrc.family || filter.msrc.family) {
+		memset(&src, 0, sizeof(src));
+		src.family = r->rtm_family;
+		if (tb[RTA_SRC])
+			memcpy(&src.data, RTA_DATA(tb[RTA_SRC]), (r->rtm_src_len+7)/8);
+	}
+	/* host_len counts bits, memcpy() wants bytes.  For an unknown
+	 * family host_len is -1 and -1/8 yields 0, so nothing is copied.
+	 */
+	if (filter.rvia.bitlen>0) {
+		memset(&via, 0, sizeof(via));
+		via.family = r->rtm_family;
+		if (tb[RTA_GATEWAY])
+			memcpy(&via.data, RTA_DATA(tb[RTA_GATEWAY]), host_len/8);
+	}
+	if (filter.rprefsrc.bitlen>0) {
+		memset(&prefsrc, 0, sizeof(prefsrc));
+		prefsrc.family = r->rtm_family;
+		if (tb[RTA_PREFSRC])
+			memcpy(&prefsrc.data, RTA_DATA(tb[RTA_PREFSRC]), host_len/8);
+	}
+
+	if (filter.rdst.family && inet_addr_match(&dst, &filter.rdst, filter.rdst.bitlen))
+		return 0;
+	if (filter.mdst.family && filter.mdst.bitlen >= 0 &&
+	    inet_addr_match(&dst, &filter.mdst, r->rtm_dst_len))
+		return 0;
+
+	if (filter.rsrc.family && inet_addr_match(&src, &filter.rsrc, filter.rsrc.bitlen))
+		return 0;
+	if (filter.msrc.family && filter.msrc.bitlen >= 0 &&
+	    inet_addr_match(&src, &filter.msrc, r->rtm_src_len))
+		return 0;
+
+	if (filter.rvia.family && inet_addr_match(&via, &filter.rvia, filter.rvia.bitlen))
+		return 0;
+	if (filter.rprefsrc.family && inet_addr_match(&prefsrc, &filter.rprefsrc, filter.rprefsrc.bitlen))
+		return 0;
+	if (filter.realmmask) {
+		__u32 realms = 0;
+		if (tb[RTA_FLOW])
+			realms = *(__u32*)RTA_DATA(tb[RTA_FLOW]);
+		if ((realms^filter.realm)&filter.realmmask)
+			return 0;
+	}
+	if (filter.iifmask) {
+		int iif = 0;
+		if (tb[RTA_IIF])
+			iif = *(int*)RTA_DATA(tb[RTA_IIF]);
+		if ((iif^filter.iif)&filter.iifmask)
+			return 0;
+	}
+	if (filter.oifmask) {
+		int oif = 0;
+		if (tb[RTA_OIF])
+			oif = *(int*)RTA_DATA(tb[RTA_OIF]);
+		if ((oif^filter.oif)&filter.oifmask)
+			return 0;
+	}
+	/* Never queue the IPv6 default unreachable route with priority -1
+	 * for deletion.
+	 */
+	if (filter.flushb &&
+	    r->rtm_family == AF_INET6 &&
+	    r->rtm_dst_len == 0 &&
+	    r->rtm_type == RTN_UNREACHABLE &&
+	    tb[RTA_PRIORITY] &&
+	    *(int*)RTA_DATA(tb[RTA_PRIORITY]) == -1)
+		return 0;
+
+	if (filter.flushb) {
+		struct nlmsghdr *fn;
+		/* Send what is queued when this message would not fit. */
+		if (NLMSG_ALIGN(filter.flushp) + n->nlmsg_len > filter.flushe) {
+			if (flush_update())
+				return -1;
+		}
+		fn = (struct nlmsghdr*)(filter.flushb + NLMSG_ALIGN(filter.flushp));
+		memcpy(fn, n, n->nlmsg_len);
+		fn->nlmsg_type = RTM_DELROUTE;
+		fn->nlmsg_flags = NLM_F_REQUEST;
+		fn->nlmsg_seq = ++filter.rth->seq;
+		filter.flushp = (((char*)fn) + n->nlmsg_len) - filter.flushb;
+		filter.flushed++;
+		if (show_stats < 2)
+			return 0;
+	}
+
+	if (n->nlmsg_type == RTM_DELROUTE)
+		fprintf(fp, "Deleted ");
+	if (r->rtm_type != RTN_UNICAST && !filter.type)
+		fprintf(fp, "%s ", rtnl_rtntype_n2a(r->rtm_type, b1, sizeof(b1)));
+
+	if (tb[RTA_DST]) {
+		if (r->rtm_dst_len != host_len) {
+			fprintf(fp, "%s/%u ", rt_addr_n2a(r->rtm_family,
+							 RTA_PAYLOAD(tb[RTA_DST]),
+							 RTA_DATA(tb[RTA_DST]),
+							 abuf, sizeof(abuf)),
+				r->rtm_dst_len
+				);
+		} else {
+			fprintf(fp, "%s ", format_host(r->rtm_family,
+						       RTA_PAYLOAD(tb[RTA_DST]),
+						       RTA_DATA(tb[RTA_DST]),
+						       abuf, sizeof(abuf))
+				);
+		}
+	} else if (r->rtm_dst_len) {
+		fprintf(fp, "0/%d ", r->rtm_dst_len);
+	} else {
+		fprintf(fp, "default ");
+	}
+	if (tb[RTA_SRC]) {
+		if (r->rtm_src_len != host_len) {
+			fprintf(fp, "from %s/%u ", rt_addr_n2a(r->rtm_family,
+							 RTA_PAYLOAD(tb[RTA_SRC]),
+							 RTA_DATA(tb[RTA_SRC]),
+							 abuf, sizeof(abuf)),
+				r->rtm_src_len
+				);
+		} else {
+			fprintf(fp, "from %s ", format_host(r->rtm_family,
+						       RTA_PAYLOAD(tb[RTA_SRC]),
+						       RTA_DATA(tb[RTA_SRC]),
+						       abuf, sizeof(abuf))
+				);
+		}
+	} else if (r->rtm_src_len) {
+		fprintf(fp, "from 0/%u ", r->rtm_src_len);
+	}
+	if (r->rtm_tos && filter.tosmask != -1) {
+		SPRINT_BUF(b1);
+		fprintf(fp, "tos %s ", rtnl_dsfield_n2a(r->rtm_tos, b1, sizeof(b1)));
+	}
+	if (tb[RTA_GATEWAY] && filter.rvia.bitlen != host_len) {
+		fprintf(fp, "via %s ",
+			format_host(r->rtm_family,
+				    RTA_PAYLOAD(tb[RTA_GATEWAY]),
+				    RTA_DATA(tb[RTA_GATEWAY]),
+				    abuf, sizeof(abuf)));
+	}
+	if (tb[RTA_OIF] && filter.oifmask != -1)
+		fprintf(fp, "dev %s ", ll_index_to_name(*(int*)RTA_DATA(tb[RTA_OIF])));
+
+	if (!(r->rtm_flags&RTM_F_CLONED)) {
+		if (r->rtm_table != RT_TABLE_MAIN && !filter.tb)
+			fprintf(fp, " table %s ", rtnl_rttable_n2a(r->rtm_table, b1, sizeof(b1)));
+		if (r->rtm_protocol != RTPROT_BOOT && filter.protocolmask != -1)
+			fprintf(fp, " proto %s ", rtnl_rtprot_n2a(r->rtm_protocol, b1, sizeof(b1)));
+		if (r->rtm_scope != RT_SCOPE_UNIVERSE && filter.scopemask != -1)
+			fprintf(fp, " scope %s ", rtnl_rtscope_n2a(r->rtm_scope, b1, sizeof(b1)));
+	}
+	if (tb[RTA_PREFSRC] && filter.rprefsrc.bitlen != host_len) {
+		/* Do not use format_host(). It is our local addr
+		   and symbolic name will not be useful.
+		 */
+		fprintf(fp, " src %s ",
+			rt_addr_n2a(r->rtm_family,
+				    RTA_PAYLOAD(tb[RTA_PREFSRC]),
+				    RTA_DATA(tb[RTA_PREFSRC]),
+				    abuf, sizeof(abuf)));
+	}
+	if (tb[RTA_PRIORITY])
+		fprintf(fp, " metric %d ", *(__u32*)RTA_DATA(tb[RTA_PRIORITY]));
+	if (r->rtm_flags & RTNH_F_DEAD)
+		fprintf(fp, "dead ");
+	if (r->rtm_flags & RTNH_F_ONLINK)
+		fprintf(fp, "onlink ");
+	if (r->rtm_flags & RTNH_F_PERVASIVE)
+		fprintf(fp, "pervasive ");
+	if (r->rtm_flags & RTM_F_EQUALIZE)
+		fprintf(fp, "equalize ");
+	if (r->rtm_flags & RTM_F_NOTIFY)
+		fprintf(fp, "notify ");
+
+	if (tb[RTA_FLOW] && filter.realmmask != ~0U) {
+		/* RTA_FLOW packs two realms: upper 16 bits "from", lower "to". */
+		__u32 to = *(__u32*)RTA_DATA(tb[RTA_FLOW]);
+		__u32 from = to>>16;
+		to &= 0xFFFF;
+		fprintf(fp, "realm%s ", from ? "s" : "");
+		if (from) {
+			fprintf(fp, "%s/",
+				rtnl_rtrealm_n2a(from, b1, sizeof(b1)));
+		}
+		fprintf(fp, "%s ",
+			rtnl_rtrealm_n2a(to, b1, sizeof(b1)));
+	}
+	if ((r->rtm_flags&RTM_F_CLONED) && r->rtm_family == AF_INET) {
+		__u32 flags = r->rtm_flags&~0xFFFF;
+		int first = 1;
+
+		fprintf(fp, "%s cache ", _SL_);
+
+#define PRTFL(fl,flname) if (flags&RTCF_##fl) { \
+		flags &= ~RTCF_##fl; \
+		fprintf(fp, "%s" flname "%s", first ? "<" : "", flags ? "," : "> "); \
+		first = 0; }
+		PRTFL(LOCAL, "local");
+		PRTFL(REJECT, "reject");
+		PRTFL(MULTICAST, "mc");
+		PRTFL(BROADCAST, "brd");
+		PRTFL(DNAT, "dst-nat");
+		PRTFL(SNAT, "src-nat");
+		PRTFL(MASQ, "masq");
+		PRTFL(DIRECTDST, "dst-direct");
+		PRTFL(DIRECTSRC, "src-direct");
+		PRTFL(REDIRECTED, "redirected");
+		PRTFL(DOREDIRECT, "redirect");
+		PRTFL(FAST, "fastroute");
+		PRTFL(NOTIFY, "notify");
+		PRTFL(TPROXY, "proxy");
+#ifdef RTCF_EQUALIZE
+		PRTFL(EQUALIZE, "equalize");
+#endif
+		if (flags)
+			fprintf(fp, "%s%x> ", first ? "<" : "", flags);
+		if (tb[RTA_CACHEINFO]) {
+			struct rta_cacheinfo *ci = RTA_DATA(tb[RTA_CACHEINFO]);
+			static int hz;
+			if (!hz)
+				hz = get_hz();
+			if (ci->rta_expires != 0)
+				fprintf(fp, " expires %dsec", ci->rta_expires/hz);
+			if (ci->rta_error != 0)
+				fprintf(fp, " error %d", ci->rta_error);
+			if (show_stats) {
+				if (ci->rta_clntref)
+					fprintf(fp, " users %d", ci->rta_clntref);
+				if (ci->rta_used != 0)
+					fprintf(fp, " used %d", ci->rta_used);
+				if (ci->rta_lastuse != 0)
+					fprintf(fp, " age %dsec", ci->rta_lastuse/hz);
+			}
+#ifdef RTNETLINK_HAVE_PEERINFO
+			if (ci->rta_id)
+				fprintf(fp, " ipid 0x%04x", ci->rta_id);
+			if (ci->rta_ts || ci->rta_tsage)
+				fprintf(fp, " ts 0x%x tsage %dsec", ci->rta_ts, ci->rta_tsage);
+#endif
+		}
+	} else if (r->rtm_family == AF_INET6) {
+		struct rta_cacheinfo *ci = NULL;
+		if (tb[RTA_CACHEINFO])
+			ci = RTA_DATA(tb[RTA_CACHEINFO]);
+		if ((r->rtm_flags & RTM_F_CLONED) || (ci && ci->rta_expires)) {
+			static int hz;
+			if (!hz)
+				hz = get_hz();
+			if (r->rtm_flags & RTM_F_CLONED)
+				fprintf(fp, "%s cache ", _SL_);
+			/* A cloned route may arrive without RTA_CACHEINFO;
+			 * ci is NULL then and must not be dereferenced.
+			 */
+			if (ci) {
+				if (ci->rta_expires)
+					fprintf(fp, " expires %dsec", ci->rta_expires/hz);
+				if (ci->rta_error != 0)
+					fprintf(fp, " error %d", ci->rta_error);
+				if (show_stats) {
+					if (ci->rta_clntref)
+						fprintf(fp, " users %d", ci->rta_clntref);
+					if (ci->rta_used != 0)
+						fprintf(fp, " used %d", ci->rta_used);
+					if (ci->rta_lastuse != 0)
+						fprintf(fp, " age %dsec", ci->rta_lastuse/hz);
+				}
+			}
+		} else if (ci) {
+			if (ci->rta_error != 0)
+				fprintf(fp, " error %d", ci->rta_error);
+		}
+	}
+	if (tb[RTA_METRICS]) {
+		int i;
+		unsigned mxlock = 0;
+		struct rtattr *mxrta[RTAX_MAX+1];
+
+		memset(mxrta, 0, sizeof(mxrta));
+
+		parse_rtattr(mxrta, RTAX_MAX, RTA_DATA(tb[RTA_METRICS]),
+			    RTA_PAYLOAD(tb[RTA_METRICS]));
+		if (mxrta[RTAX_LOCK])
+			mxlock = *(unsigned*)RTA_DATA(mxrta[RTAX_LOCK]);
+
+		/* Metric names are indexed from attribute number 2 upwards. */
+		for (i=2; i<=RTAX_MAX; i++) {
+			static char *mx_names[] =
+			{
+				"mtu",
+				"window",
+				"rtt",
+				"rttvar",
+				"ssthresh",
+				"cwnd",
+				"advmss",
+				"reordering",
+			};
+			static int hz;
+			if (mxrta[i] == NULL)
+				continue;
+			if (!hz)
+				hz = get_hz();
+			if (i-2 < sizeof(mx_names)/sizeof(char*))
+				fprintf(fp, " %s", mx_names[i-2]);
+			else
+				fprintf(fp, " metric%d", i);
+			if (mxlock & (1<<i))
+				fprintf(fp, " lock");
+
+			if (i != RTAX_RTT && i != RTAX_RTTVAR)
+				fprintf(fp, " %u", *(unsigned*)RTA_DATA(mxrta[i]));
+			else {
+				unsigned val = *(unsigned*)RTA_DATA(mxrta[i]);
+
+				/* The raw value is divided by 8 (rtt) or
+				 * 4 (rttvar) and by hz to print milliseconds.
+				 */
+				val *= 1000;
+				if (i == RTAX_RTT)
+					val /= 8;
+				else
+					val /= 4;
+				if (val >= hz)
+					fprintf(fp, " %ums", val/hz);
+				else
+					fprintf(fp, " %.2fms", (float)val/hz);
+			}
+		}
+	}
+	if (tb[RTA_IIF] && filter.iifmask != -1) {
+		fprintf(fp, " iif %s", ll_index_to_name(*(int*)RTA_DATA(tb[RTA_IIF])));
+	}
+	if (tb[RTA_MULTIPATH]) {
+		struct rtnexthop *nh = RTA_DATA(tb[RTA_MULTIPATH]);
+		/* Set until the first next hop has been printed, so that
+		 * the " Oifs:" label appears exactly once.
+		 */
+		int first = 1;
+
+		len = RTA_PAYLOAD(tb[RTA_MULTIPATH]);
+
+		for (;;) {
+			if (len < sizeof(*nh))
+				break;
+			if (nh->rtnh_len > len)
+				break;
+			if (r->rtm_flags&RTM_F_CLONED && r->rtm_type == RTN_MULTICAST) {
+				if (first) {
+					fprintf(fp, " Oifs:");
+					first = 0;
+				} else
+					fprintf(fp, " ");
+			} else
+				fprintf(fp, "%s\tnexthop", _SL_);
+			if (nh->rtnh_len > sizeof(*nh)) {
+				/* tb is reused for the per-hop attributes. */
+				memset(tb, 0, sizeof(tb));
+				parse_rtattr(tb, RTA_MAX, RTNH_DATA(nh), nh->rtnh_len - sizeof(*nh));
+				if (tb[RTA_GATEWAY]) {
+					fprintf(fp, " via %s ",
+						format_host(r->rtm_family,
+							    RTA_PAYLOAD(tb[RTA_GATEWAY]),
+							    RTA_DATA(tb[RTA_GATEWAY]),
+							    abuf, sizeof(abuf)));
+				}
+			}
+			if (r->rtm_flags&RTM_F_CLONED && r->rtm_type == RTN_MULTICAST) {
+				fprintf(fp, " %s", ll_index_to_name(nh->rtnh_ifindex));
+				if (nh->rtnh_hops != 1)
+					fprintf(fp, "(ttl>%d)", nh->rtnh_hops);
+			} else {
+				fprintf(fp, " dev %s", ll_index_to_name(nh->rtnh_ifindex));
+				fprintf(fp, " weight %d", nh->rtnh_hops+1);
+			}
+			if (nh->rtnh_flags & RTNH_F_DEAD)
+				fprintf(fp, " dead");
+			if (nh->rtnh_flags & RTNH_F_ONLINK)
+				fprintf(fp, " onlink");
+			if (nh->rtnh_flags & RTNH_F_PERVASIVE)
+				fprintf(fp, " pervasive");
+			len -= NLMSG_ALIGN(nh->rtnh_len);
+			nh = RTNH_NEXT(nh);
+		}
+	}
+	fprintf(fp, "\n");
+	fflush(fp);
+	return 0;
+}
+
+
+/*
+ * Parse the options of a single "nexthop" clause.
+ *
+ * rta   - multipath attribute being assembled; gateway attributes are
+ *         appended to it
+ * rtnh  - next-hop record to fill in
+ * argcp, argvp - in: positioned at the word "nexthop"; out: positioned at
+ *         the first word that is not a next-hop option
+ *
+ * Recognised options: via ADDRESS, dev NAME, weight N (1..256), onlink.
+ * Always returns 0; invalid values terminate the program in the helpers.
+ *
+ * NOTE(review): rta_addattr32() is given a limit of 4096 while the only
+ * caller visible in this file supplies a 1024-byte buffer - confirm.
+ */
+int parse_one_nh(struct rtattr *rta, struct rtnexthop *rtnh, int *argcp, char ***argvp)
+{
+	int argc = *argcp;
+	char **argv = *argvp;
+
+	/* The first step skips the "nexthop" keyword itself. */
+	for (++argv, --argc; argc > 0; ++argv, --argc) {
+		if (!strcmp(*argv, "via")) {
+			NEXT_ARG();
+			rta_addattr32(rta, 4096, RTA_GATEWAY, get_addr32(*argv));
+			rtnh->rtnh_len += sizeof(struct rtattr) + 4;
+			continue;
+		}
+		if (!strcmp(*argv, "dev")) {
+			NEXT_ARG();
+			if ((rtnh->rtnh_ifindex = ll_name_to_index(*argv)) == 0) {
+				fprintf(stderr, "Cannot find device \"%s\"\n", *argv);
+				exit(1);
+			}
+			continue;
+		}
+		if (!strcmp(*argv, "weight")) {
+			unsigned w;
+			NEXT_ARG();
+			if (get_unsigned(&w, *argv, 0) || w == 0 || w > 256)
+				invarg("\"weight\" is invalid\n", *argv);
+			/* stored as weight minus one */
+			rtnh->rtnh_hops = w - 1;
+			continue;
+		}
+		if (!strcmp(*argv, "onlink")) {
+			rtnh->rtnh_flags |= RTNH_F_ONLINK;
+			continue;
+		}
+		break;
+	}
+
+	*argcp = argc;
+	*argvp = argv;
+	return 0;
+}
+
+/*
+ * Parse a sequence of "nexthop ..." clauses and, if at least one was
+ * given, append the resulting RTA_MULTIPATH attribute to the request.
+ *
+ * n    - netlink request that receives the attribute
+ * r    - route header of the request (not used here)
+ * argc, argv - remaining words; each clause must start with "nexthop"
+ *
+ * Always returns 0; malformed input terminates the program with exit(-1).
+ */
+int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r, int argc, char **argv)
+{
+	char mpbuf[1024];
+	struct rtattr *mp = (void*)mpbuf;
+	struct rtnexthop *hop;
+
+	mp->rta_type = RTA_MULTIPATH;
+	mp->rta_len = RTA_LENGTH(0);
+
+	for (hop = RTA_DATA(mp); argc > 0; hop = RTNH_NEXT(hop)) {
+		if (strcmp(*argv, "nexthop") != 0) {
+			fprintf(stderr, "Error: \"nexthop\" or end of line is expected instead of \"%s\"\n", *argv);
+			exit(-1);
+		}
+		if (argc <= 1) {
+			fprintf(stderr, "Error: unexpected end of line after \"nexthop\"\n");
+			exit(-1);
+		}
+
+		/* Start an empty next-hop record and let parse_one_nh()
+		 * consume its options, advancing argc/argv.
+		 */
+		memset(hop, 0, sizeof(*hop));
+		hop->rtnh_len = sizeof(*hop);
+		mp->rta_len += hop->rtnh_len;
+		parse_one_nh(mp, hop, &argc, &argv);
+	}
+
+	if (mp->rta_len <= RTA_LENGTH(0))
+		return 0;
+
+	addattr_l(n, 1024, RTA_MULTIPATH, RTA_DATA(mp), RTA_PAYLOAD(mp));
+	return 0;
+}
+
+
+int iproute_modify(int cmd, unsigned flags, int argc, char **argv)
+{
+ struct rtnl_handle rth;
+ struct {
+ struct nlmsghdr n;
+ struct rtmsg r;
+ char buf[1024];
+ } req;
+ char mxbuf[256];
+ struct rtattr * mxrta = (void*)mxbuf;
+ unsigned mxlock = 0;
+ char *d = NULL;
+ int gw_ok = 0;
+ int dst_ok = 0;
+ int nhs_ok = 0;
+ int scope_ok = 0;
+ int table_ok = 0;
+ int proto_ok = 0;
+ int type_ok = 0;
+
+ memset(&req, 0, sizeof(req));
+
+ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+ req.n.nlmsg_type = cmd;
+ req.r.rtm_family = preferred_family;
+ req.r.rtm_table = RT_TABLE_MAIN;
+ req.r.rtm_scope = RT_SCOPE_NOWHERE;
+
+ if (cmd != RTM_DELROUTE) {
+ req.r.rtm_protocol = RTPROT_BOOT;
+ req.r.rtm_scope = RT_SCOPE_UNIVERSE;
+ req.r.rtm_type = RTN_UNICAST;
+ }
+
+ mxrta->rta_type = RTA_METRICS;
+ mxrta->rta_len = RTA_LENGTH(0);
+
+ while (argc > 0) {
+ if (strcmp(*argv, "src") == 0) {
+ inet_prefix addr;
+ NEXT_ARG();
+ get_addr(&addr, *argv, req.r.rtm_family);
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = addr.family;
+ addattr_l(&req.n, sizeof(req), RTA_PREFSRC, &addr.data, addr.bytelen);
+ } else if (strcmp(*argv, "via") == 0) {
+ inet_prefix addr;
+ gw_ok = 1;
+ NEXT_ARG();
+ get_addr(&addr, *argv, req.r.rtm_family);
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = addr.family;
+ addattr_l(&req.n, sizeof(req), RTA_GATEWAY, &addr.data, addr.bytelen);
+ } else if (strcmp(*argv, "from") == 0) {
+ inet_prefix addr;
+ NEXT_ARG();
+ get_prefix(&addr, *argv, req.r.rtm_family);
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = addr.family;
+ if (addr.bytelen)
+ addattr_l(&req.n, sizeof(req), RTA_SRC, &addr.data, addr.bytelen);
+ req.r.rtm_src_len = addr.bitlen;
+ } else if (strcmp(*argv, "tos") == 0 ||
+ matches(*argv, "dsfield") == 0) {
+ __u32 tos;
+ NEXT_ARG();
+ if (rtnl_dsfield_a2n(&tos, *argv))
+ invarg("\"tos\" value is invalid\n", *argv);
+ req.r.rtm_tos = tos;
+ } else if (matches(*argv, "metric") == 0 ||
+ matches(*argv, "priority") == 0 ||
+ matches(*argv, "preference") == 0) {
+ __u32 metric;
+ NEXT_ARG();
+ if (get_u32(&metric, *argv, 0))
+ invarg("\"metric\" value is invalid\n", *argv);
+ addattr32(&req.n, sizeof(req), RTA_PRIORITY, metric);
+ } else if (strcmp(*argv, "scope") == 0) {
+ int scope = 0;
+ NEXT_ARG();
+ if (rtnl_rtscope_a2n(&scope, *argv))
+ invarg("invalid \"scope\" value\n", *argv);
+ req.r.rtm_scope = scope;
+ scope_ok = 1;
+ } else if (strcmp(*argv, "mtu") == 0) {
+ unsigned mtu;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_MTU);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&mtu, *argv, 0))
+ invarg("\"mtu\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_MTU, mtu);
+#ifdef RTAX_ADVMSS
+ } else if (strcmp(*argv, "advmss") == 0) {
+ unsigned mss;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_ADVMSS);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&mss, *argv, 0))
+ invarg("\"mss\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_ADVMSS, mss);
+#endif
+#ifdef RTAX_REORDERING
+ } else if (matches(*argv, "reordering") == 0) {
+ unsigned reord;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_REORDERING);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&reord, *argv, 0))
+ invarg("\"reordering\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_REORDERING, reord);
+#endif
+ } else if (strcmp(*argv, "rtt") == 0) {
+ unsigned rtt;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_RTT);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&rtt, *argv, 0))
+ invarg("\"rtt\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_RTT, rtt);
+ } else if (matches(*argv, "window") == 0) {
+ unsigned win;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_WINDOW);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&win, *argv, 0))
+ invarg("\"window\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_WINDOW, win);
+ } else if (matches(*argv, "cwnd") == 0) {
+ unsigned win;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_CWND);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&win, *argv, 0))
+ invarg("\"cwnd\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_CWND, win);
+ } else if (matches(*argv, "rttvar") == 0) {
+ unsigned win;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_RTTVAR);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&win, *argv, 0))
+ invarg("\"rttvar\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_RTTVAR, win);
+ } else if (matches(*argv, "ssthresh") == 0) {
+ unsigned win;
+ NEXT_ARG();
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_SSTHRESH);
+ NEXT_ARG();
+ }
+ if (get_unsigned(&win, *argv, 0))
+ invarg("\"ssthresh\" value is invalid\n", *argv);
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_SSTHRESH, win);
+ } else if (matches(*argv, "realms") == 0) {
+ __u32 realm;
+ NEXT_ARG();
+ if (get_rt_realms(&realm, *argv))
+ invarg("\"realm\" value is invalid\n", *argv);
+ addattr32(&req.n, sizeof(req), RTA_FLOW, realm);
+ } else if (strcmp(*argv, "onlink") == 0) {
+ req.r.rtm_flags |= RTNH_F_ONLINK;
+ } else if (matches(*argv, "equalize") == 0 ||
+ strcmp(*argv, "eql") == 0) {
+ req.r.rtm_flags |= RTM_F_EQUALIZE;
+ } else if (strcmp(*argv, "nexthop") == 0) {
+ nhs_ok = 1;
+ break;
+ } else if (matches(*argv, "protocol") == 0) {
+ int prot;
+ NEXT_ARG();
+ if (rtnl_rtprot_a2n(&prot, *argv))
+ invarg("\"protocol\" value is invalid\n", *argv);
+ req.r.rtm_protocol = prot;
+ proto_ok =1;
+ } else if (matches(*argv, "table") == 0) {
+ int tid;
+ NEXT_ARG();
+ if (rtnl_rttable_a2n(&tid, *argv))
+ invarg("\"table\" value is invalid\n", *argv);
+ req.r.rtm_table = tid;
+ table_ok = 1;
+ } else if (strcmp(*argv, "dev") == 0 ||
+ strcmp(*argv, "oif") == 0) {
+ NEXT_ARG();
+ d = *argv;
+ } else {
+ int type;
+ inet_prefix dst;
+
+ if (strcmp(*argv, "to") == 0) {
+ NEXT_ARG();
+ }
+ if ((**argv < '0' || **argv > '9') &&
+ rtnl_rtntype_a2n(&type, *argv) == 0) {
+ NEXT_ARG();
+ req.r.rtm_type = type;
+ type_ok = 1;
+ }
+
+ if (matches(*argv, "help") == 0)
+ usage();
+ if (dst_ok)
+ duparg2("to", *argv);
+ get_prefix(&dst, *argv, req.r.rtm_family);
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = dst.family;
+ req.r.rtm_dst_len = dst.bitlen;
+ dst_ok = 1;
+ if (dst.bytelen)
+ addattr_l(&req.n, sizeof(req), RTA_DST, &dst.data, dst.bytelen);
+ }
+ argc--; argv++;
+ }
+
+ if (rtnl_open(&rth, 0) < 0)
+ exit(1);
+
+ if (d || nhs_ok) {
+ int idx;
+
+ ll_init_map(&rth);
+
+ if (d) {
+ if ((idx = ll_name_to_index(d)) == 0) {
+ fprintf(stderr, "Cannot find device \"%s\"\n", d);
+ return -1;
+ }
+ addattr32(&req.n, sizeof(req), RTA_OIF, idx);
+ }
+ }
+
+ if (mxrta->rta_len > RTA_LENGTH(0)) {
+ if (mxlock)
+ rta_addattr32(mxrta, sizeof(mxbuf), RTAX_LOCK, mxlock);
+ addattr_l(&req.n, sizeof(req), RTA_METRICS, RTA_DATA(mxrta), RTA_PAYLOAD(mxrta));
+ }
+
+ if (nhs_ok)
+ parse_nexthops(&req.n, &req.r, argc, argv);
+
+ if (!table_ok) {
+ if (req.r.rtm_type == RTN_LOCAL ||
+ req.r.rtm_type == RTN_BROADCAST ||
+ req.r.rtm_type == RTN_NAT ||
+ req.r.rtm_type == RTN_ANYCAST)
+ req.r.rtm_table = RT_TABLE_LOCAL;
+ }
+ if (!scope_ok) {
+ if (req.r.rtm_type == RTN_LOCAL ||
+ req.r.rtm_type == RTN_NAT)
+ req.r.rtm_scope = RT_SCOPE_HOST;
+ else if (req.r.rtm_type == RTN_BROADCAST ||
+ req.r.rtm_type == RTN_MULTICAST ||
+ req.r.rtm_type == RTN_ANYCAST)
+ req.r.rtm_scope = RT_SCOPE_LINK;
+ else if (req.r.rtm_type == RTN_UNICAST ||
+ req.r.rtm_type == RTN_UNSPEC) {
+ if (cmd == RTM_DELROUTE)
+ req.r.rtm_scope = RT_SCOPE_NOWHERE;
+ else if (!gw_ok && !nhs_ok)
+ req.r.rtm_scope = RT_SCOPE_LINK;
+ }
+ }
+
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = AF_INET;
+
+ if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+ exit(2);
+
+ return 0;
+}
+
+static int rtnl_rtcache_request(struct rtnl_handle *rth, int family)
+{	/* Send an RTM_GETROUTE dump request with RTM_F_CLONED set; returns sendto()'s result. */
+	struct sockaddr_nl kernel;
+	struct {
+		struct nlmsghdr nlh;
+		struct rtmsg rtm;
+	} msg;
+
+	memset(&kernel, 0, sizeof(kernel));
+	kernel.nl_family = AF_NETLINK;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.rtm.rtm_family = family;
+	msg.rtm.rtm_flags |= RTM_F_CLONED;
+	msg.nlh.nlmsg_len = sizeof(msg);
+	msg.nlh.nlmsg_type = RTM_GETROUTE;
+	msg.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_REQUEST;
+	msg.nlh.nlmsg_pid = 0;
+	rth->dump = ++rth->seq;	/* the new sequence number is also stored in rth->dump */
+	msg.nlh.nlmsg_seq = rth->dump;
+	return sendto(rth->fd, (void*)&msg, sizeof(msg), 0, (struct sockaddr*)&kernel, sizeof(kernel));
+}
+
+static int iproute_flush_cache(void)
+{	/* Write "-1" to the IPv4 route flush sysctl file; returns 0 on success, -1 on failure. */
+#define ROUTE_FLUSH_PATH "/proc/sys/net/ipv4/route/flush"
+
+	int len;
+	int flush_fd = open (ROUTE_FLUSH_PATH, O_WRONLY);
+	char *buffer = "-1";
+
+	if (flush_fd < 0) {
+		fprintf (stderr, "Cannot open \"%s\"\n", ROUTE_FLUSH_PATH);
+		return -1;
+	}
+
+	len = strlen (buffer);
+	if ((write (flush_fd, (void *)buffer, len)) < len) {
+		fprintf (stderr, "Cannot flush routing cache\n");
+		close(flush_fd);	/* do not leak the descriptor on the error path */
+		return -1;
+	}
+	close(flush_fd);
+	return 0;
+}
+
+/* "ip route list" / "ip route flush": parse the filter arguments, then dump matching routes or (flush != 0) delete them in rounds. */
+static int iproute_list_or_flush(int argc, char **argv, int flush)
+{
+ int do_ipv6 = preferred_family;
+ struct rtnl_handle rth;
+ char *id = NULL; /* input device name ("iif") */
+ char *od = NULL; /* output device name ("dev" / "oif") */
+
+ iproute_reset_filter();
+ filter.tb = RT_TABLE_MAIN; /* default table unless "table"/"cached" overrides it */
+
+ if (flush && argc <= 0) {
+ fprintf(stderr, "\"ip route flush\" requires arguments.\n");
+ return -1;
+ }
+
+ while (argc > 0) {
+ if (matches(*argv, "table") == 0) {
+ int tid;
+ NEXT_ARG();
+ if (rtnl_rttable_a2n(&tid, *argv)) {
+ if (strcmp(*argv, "all") == 0) {
+ tid = 0; /* "all" is stored as table 0 */
+ } else if (strcmp(*argv, "cache") == 0) {
+ tid = -1; /* -1 selects the routing cache (see the tb == -1 checks below) */
+ } else if (strcmp(*argv, "help") == 0) {
+ usage();
+ } else {
+ invarg("table id value is invalid\n", *argv);
+ }
+ }
+ filter.tb = tid;
+ } else if (matches(*argv, "cached") == 0 ||
+ matches(*argv, "cloned") == 0) {
+ filter.tb = -1;
+ } else if (strcmp(*argv, "tos") == 0 ||
+ matches(*argv, "dsfield") == 0) {
+ __u32 tos;
+ NEXT_ARG();
+ if (rtnl_dsfield_a2n(&tos, *argv))
+ invarg("TOS value is invalid\n", *argv);
+ filter.tos = tos;
+ filter.tosmask = -1;
+ } else if (matches(*argv, "protocol") == 0) {
+ int prot = 0;
+ NEXT_ARG();
+ filter.protocolmask = -1;
+ if (rtnl_rtprot_a2n(&prot, *argv)) {
+ if (strcmp(*argv, "all") != 0)
+ invarg("invalid \"protocol\"\n", *argv);
+ prot = 0;
+ filter.protocolmask = 0; /* "all": clear the mask again */
+ }
+ filter.protocol = prot;
+ } else if (matches(*argv, "scope") == 0) {
+ int scope = 0;
+ NEXT_ARG();
+ filter.scopemask = -1;
+ if (rtnl_rtscope_a2n(&scope, *argv)) {
+ if (strcmp(*argv, "all") != 0)
+ invarg("invalid \"scope\"\n", *argv);
+ scope = RT_SCOPE_NOWHERE;
+ filter.scopemask = 0; /* "all": clear the mask again */
+ }
+ filter.scope = scope;
+ } else if (matches(*argv, "type") == 0) {
+ int type;
+ NEXT_ARG();
+ filter.typemask = -1;
+ if (rtnl_rtntype_a2n(&type, *argv))
+ invarg("node type value is invalid\n", *argv);
+ filter.type = type;
+ } else if (strcmp(*argv, "dev") == 0 ||
+ strcmp(*argv, "oif") == 0) {
+ NEXT_ARG();
+ od = *argv; /* resolved to an index after the netlink socket is open */
+ } else if (strcmp(*argv, "iif") == 0) {
+ NEXT_ARG();
+ id = *argv;
+ } else if (strcmp(*argv, "via") == 0) {
+ NEXT_ARG();
+ get_prefix(&filter.rvia, *argv, do_ipv6);
+ } else if (strcmp(*argv, "src") == 0) {
+ NEXT_ARG();
+ get_prefix(&filter.rprefsrc, *argv, do_ipv6);
+ } else if (matches(*argv, "realms") == 0) {
+ __u32 realm;
+ NEXT_ARG();
+ if (get_rt_realms(&realm, *argv))
+ invarg("invalid realms\n", *argv);
+ filter.realm = realm;
+ filter.realmmask = ~0U;
+ if ((filter.realm&0xFFFF) == 0 && /* low half zero and argument ends in '/': drop the low 16 mask bits */
+ (*argv)[strlen(*argv) - 1] == '/')
+ filter.realmmask &= ~0xFFFF;
+ if ((filter.realm&0xFFFF0000U) == 0 && /* high half zero and no '/' (or a leading '/'): drop the high 16 mask bits */
+ (strchr(*argv, '/') == NULL ||
+ (*argv)[0] == '/'))
+ filter.realmmask &= ~0xFFFF0000U;
+ } else if (matches(*argv, "from") == 0) {
+ NEXT_ARG();
+ if (matches(*argv, "root") == 0) {
+ NEXT_ARG();
+ get_prefix(&filter.rsrc, *argv, do_ipv6);
+ } else if (matches(*argv, "match") == 0) {
+ NEXT_ARG();
+ get_prefix(&filter.msrc, *argv, do_ipv6);
+ } else {
+ if (matches(*argv, "exact") == 0) {
+ NEXT_ARG();
+ }
+ get_prefix(&filter.msrc, *argv, do_ipv6);
+ filter.rsrc = filter.msrc; /* "exact" (the default): same prefix in both msrc and rsrc */
+ }
+ } else {
+ if (matches(*argv, "to") == 0) { /* anything unrecognised is taken as the destination selector */
+ NEXT_ARG();
+ }
+ if (matches(*argv, "root") == 0) {
+ NEXT_ARG();
+ get_prefix(&filter.rdst, *argv, do_ipv6);
+ } else if (matches(*argv, "match") == 0) {
+ NEXT_ARG();
+ get_prefix(&filter.mdst, *argv, do_ipv6);
+ } else {
+ if (matches(*argv, "exact") == 0) {
+ NEXT_ARG();
+ }
+ get_prefix(&filter.mdst, *argv, do_ipv6);
+ filter.rdst = filter.mdst; /* "exact" (the default): same prefix in both mdst and rdst */
+ }
+ }
+ argc--; argv++;
+ }
+
+ if (do_ipv6 == AF_UNSPEC && filter.tb) /* no family chosen and a table filter is set: use IPv4 */
+ do_ipv6 = AF_INET;
+
+ if (rtnl_open(&rth, 0) < 0)
+ exit(1);
+
+ ll_init_map(&rth);
+
+ if (id || od) {
+ int idx;
+
+ if (id) {
+ if ((idx = ll_name_to_index(id)) == 0) {
+ fprintf(stderr, "Cannot find device \"%s\"\n", id);
+ return -1;
+ }
+ filter.iif = idx;
+ filter.iifmask = -1;
+ }
+ if (od) {
+ if ((idx = ll_name_to_index(od)) == 0) {
+ fprintf(stderr, "Cannot find device \"%s\"\n", od);
+ return -1;
+ }
+ filter.oif = idx;
+ filter.oifmask = -1;
+ }
+ }
+
+ if (flush) {
+ int round = 0;
+ char flushb[4096-512];
+
+ if (filter.tb == -1) { /* cache flush: IPv4 goes through the /proc file */
+ if (do_ipv6 != AF_INET6) {
+ iproute_flush_cache();
+ if (show_stats)
+ printf("*** IPv4 routing cache is flushed.\n");
+ }
+ if (do_ipv6 == AF_INET)
+ return 0;
+ }
+
+ filter.flushb = flushb;
+ filter.flushp = 0;
+ filter.flushe = sizeof(flushb);
+ filter.rth = &rth;
+
+ for (;;) { /* repeat dump rounds until one finds nothing left to delete */
+ if (rtnl_wilddump_request(&rth, do_ipv6, RTM_GETROUTE) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+ filter.flushed = 0;
+ if (rtnl_dump_filter(&rth, print_route, stdout, NULL, NULL) < 0) {
+ fprintf(stderr, "Flush terminated\n");
+ exit(1);
+ }
+ if (filter.flushed == 0) {
+ if (round == 0) {
+ if (filter.tb != -1 || do_ipv6 == AF_INET6)
+ fprintf(stderr, "Nothing to flush.\n");
+ } else if (show_stats)
+ printf("*** Flush is complete after %d round%s ***\n", round, round>1?"s":"");
+ fflush(stdout);
+ return 0;
+ }
+ round++;
+ if (flush_update() < 0)
+ exit(1);
+ if (show_stats) {
+ printf("\n*** Round %d, deleting %d entries ***\n", round, filter.flushed);
+ fflush(stdout);
+ }
+ }
+ }
+
+ if (filter.tb != -1) {
+ if (rtnl_wilddump_request(&rth, do_ipv6, RTM_GETROUTE) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+ } else {
+ if (rtnl_rtcache_request(&rth, do_ipv6) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+ }
+
+ if (rtnl_dump_filter(&rth, print_route, stdout, NULL, NULL) < 0) {
+ fprintf(stderr, "Dump terminated\n");
+ exit(1);
+ }
+
+ exit(0); /* the list path terminates the process instead of returning */
+}
+
+/* "ip route get": build an RTM_GETROUTE request from the arguments, send it and print the reply; with "connected" and no "from", the reply is turned into a second request. */
+int iproute_get(int argc, char **argv)
+{
+ struct rtnl_handle rth;
+ struct {
+ struct nlmsghdr n;
+ struct rtmsg r;
+ char buf[1024];
+ } req;
+ char *idev = NULL;
+ char *odev = NULL;
+ int connected = 0;
+ int from_ok = 0;
+
+ memset(&req, 0, sizeof(req));
+
+ iproute_reset_filter();
+
+ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.n.nlmsg_flags = NLM_F_REQUEST;
+ req.n.nlmsg_type = RTM_GETROUTE;
+ req.r.rtm_family = preferred_family;
+ req.r.rtm_table = 0;
+ req.r.rtm_protocol = 0;
+ req.r.rtm_scope = 0;
+ req.r.rtm_type = 0;
+ req.r.rtm_src_len = 0;
+ req.r.rtm_dst_len = 0;
+ req.r.rtm_tos = 0;
+
+ while (argc > 0) {
+ if (strcmp(*argv, "tos") == 0 ||
+ matches(*argv, "dsfield") == 0) {
+ __u32 tos;
+ NEXT_ARG();
+ if (rtnl_dsfield_a2n(&tos, *argv))
+ invarg("TOS value is invalid\n", *argv);
+ req.r.rtm_tos = tos;
+ } else if (matches(*argv, "from") == 0) {
+ inet_prefix addr;
+ NEXT_ARG();
+ if (matches(*argv, "help") == 0)
+ usage();
+ from_ok = 1;
+ get_prefix(&addr, *argv, req.r.rtm_family);
+ if (req.r.rtm_family == AF_UNSPEC) /* first address seen fixes the family */
+ req.r.rtm_family = addr.family;
+ if (addr.bytelen)
+ addattr_l(&req.n, sizeof(req), RTA_SRC, &addr.data, addr.bytelen);
+ req.r.rtm_src_len = addr.bitlen;
+ } else if (matches(*argv, "iif") == 0) {
+ NEXT_ARG();
+ idev = *argv;
+ } else if (matches(*argv, "oif") == 0 ||
+ strcmp(*argv, "dev") == 0) {
+ NEXT_ARG();
+ odev = *argv;
+ } else if (matches(*argv, "notify") == 0) {
+ req.r.rtm_flags |= RTM_F_NOTIFY;
+ } else if (matches(*argv, "connected") == 0) {
+ connected = 1;
+ } else {
+ inet_prefix addr; /* anything unrecognised is taken as the destination */
+ if (strcmp(*argv, "to") == 0) {
+ NEXT_ARG();
+ }
+ if (matches(*argv, "help") == 0)
+ usage();
+ get_prefix(&addr, *argv, req.r.rtm_family);
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = addr.family;
+ if (addr.bytelen)
+ addattr_l(&req.n, sizeof(req), RTA_DST, &addr.data, addr.bytelen);
+ req.r.rtm_dst_len = addr.bitlen;
+ }
+ argc--; argv++;
+ }
+
+ if (req.r.rtm_dst_len == 0) { /* a destination with a non-zero prefix length is mandatory */
+ fprintf(stderr, "need at least destination address\n");
+ exit(1);
+ }
+
+ if (rtnl_open(&rth, 0) < 0)
+ exit(1);
+
+ ll_init_map(&rth);
+
+ if (idev || odev) {
+ int idx;
+
+ if (idev) {
+ if ((idx = ll_name_to_index(idev)) == 0) {
+ fprintf(stderr, "Cannot find device \"%s\"\n", idev);
+ return -1;
+ }
+ addattr32(&req.n, sizeof(req), RTA_IIF, idx);
+ }
+ if (odev) {
+ if ((idx = ll_name_to_index(odev)) == 0) {
+ fprintf(stderr, "Cannot find device \"%s\"\n", odev);
+ return -1;
+ }
+ addattr32(&req.n, sizeof(req), RTA_OIF, idx);
+ }
+ }
+
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = AF_INET;
+
+ if (rtnl_talk(&rth, &req.n, 0, 0, &req.n, NULL, NULL) < 0) /* &req.n is passed twice; the code below treats req as holding the reply */
+ exit(2);
+
+ if (connected && !from_ok) {
+ struct rtmsg *r = NLMSG_DATA(&req.n);
+ int len = req.n.nlmsg_len;
+ struct rtattr * tb[RTA_MAX+1];
+
+ if (print_route(NULL, &req.n, (void*)stdout) < 0) {
+ fprintf(stderr, "An error :-)\n");
+ exit(1);
+ }
+
+ if (req.n.nlmsg_type != RTM_NEWROUTE) {
+ fprintf(stderr, "Not a route?\n");
+ return -1;
+ }
+ len -= NLMSG_LENGTH(sizeof(*r));
+ if (len < 0) {
+ fprintf(stderr, "Wrong len %d\n", len);
+ return -1;
+ }
+
+ memset(tb, 0, sizeof(tb));
+ parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len);
+
+ if (tb[RTA_PREFSRC]) { /* relabel the preferred source in place as the RTA_SRC of the second query */
+ tb[RTA_PREFSRC]->rta_type = RTA_SRC;
+ r->rtm_src_len = 8*RTA_PAYLOAD(tb[RTA_PREFSRC]);
+ } else if (!tb[RTA_SRC]) {
+ fprintf(stderr, "Failed to connect the route\n");
+ return -1;
+ }
+ if (!odev && tb[RTA_OIF]) /* attributes the user did not ask for get their type set to 0 */
+ tb[RTA_OIF]->rta_type = 0;
+ if (tb[RTA_GATEWAY])
+ tb[RTA_GATEWAY]->rta_type = 0;
+ if (!idev && tb[RTA_IIF])
+ tb[RTA_IIF]->rta_type = 0;
+ req.n.nlmsg_flags = NLM_F_REQUEST;
+ req.n.nlmsg_type = RTM_GETROUTE;
+
+ if (rtnl_talk(&rth, &req.n, 0, 0, &req.n, NULL, NULL) < 0)
+ exit(2);
+ }
+
+ if (print_route(NULL, &req.n, (void*)stdout) < 0) {
+ fprintf(stderr, "An error :-)\n");
+ exit(1);
+ }
+
+ exit(0); /* the success path terminates the process instead of returning */
+}
+
+void iproute_reset_filter()
+{	/* Zero the file-level route filter, then set both match-prefix bit lengths to -1. */
+	memset(&filter, 0, sizeof(filter));
+	filter.msrc.bitlen = -1;
+	filter.mdst.bitlen = -1;
+}
+
+int do_iproute(int argc, char **argv)
+{	/* Dispatch an "ip route" sub-command; with no arguments, list routes. */
+	char *cmd;
+	int nargs;
+	char **args;
+
+	if (argc < 1)
+		return iproute_list_or_flush(0, NULL, 0);
+
+	cmd = *argv;
+	nargs = argc - 1;
+	args = argv + 1;
+
+	if (matches(cmd, "add") == 0)
+		return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE|NLM_F_EXCL, nargs, args);
+	if (matches(cmd, "change") == 0 || strcmp(cmd, "chg") == 0)
+		return iproute_modify(RTM_NEWROUTE, NLM_F_REPLACE, nargs, args);
+	if (matches(cmd, "replace") == 0)
+		return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE|NLM_F_REPLACE, nargs, args);
+	if (matches(cmd, "prepend") == 0)
+		return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE, nargs, args);
+	if (matches(cmd, "append") == 0)
+		return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE|NLM_F_APPEND, nargs, args);
+	if (matches(cmd, "test") == 0)
+		return iproute_modify(RTM_NEWROUTE, NLM_F_EXCL, nargs, args);
+	if (matches(cmd, "delete") == 0)
+		return iproute_modify(RTM_DELROUTE, 0, nargs, args);
+	if (matches(cmd, "list") == 0 || matches(cmd, "show") == 0 || matches(cmd, "lst") == 0)
+		return iproute_list_or_flush(nargs, args, 0);
+	if (matches(cmd, "get") == 0)
+		return iproute_get(nargs, args);
+	if (matches(cmd, "flush") == 0)
+		return iproute_list_or_flush(nargs, args, 1);
+	if (matches(cmd, "help") == 0)
+		usage();
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip route help\".\n", cmd);
+	exit(-1);
+}
+
diff --git a/ip/iprule.c b/ip/iprule.c
index e69de29b..457864f8 100644
--- a/ip/iprule.c
+++ b/ip/iprule.c
@@ -0,0 +1,323 @@
+/*
+ * iprule.c "ip rule".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *
+ * Changes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "rt_names.h"
+#include "utils.h"
+
+static void usage(void) __attribute__((noreturn));
+
+static void usage(void)
+{	/* Print the "ip rule" synopsis to stderr and exit with status -1. */
+	fprintf(stderr, "Usage: ip rule [ list | add | del ] SELECTOR ACTION\n"
+			"SELECTOR := [ from PREFIX ] [ to PREFIX ] [ tos TOS ] [ fwmark FWMARK ]\n"
+			" [ dev STRING ] [ pref NUMBER ]\n"
+			"ACTION := [ table TABLE_ID ] [ nat ADDRESS ]\n"
+			" [ prohibit | reject | unreachable ]\n"
+			" [ realms [SRCREALM/]DSTREALM ]\n"
+			"TABLE_ID := [ local | main | default | NUMBER ]\n");
+	exit(-1);
+}
+
+int print_rule(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{	/* Print one RTM_NEWRULE message to the FILE* in arg; returns 0 (printed or not a rule), -1 if the message is too short. */
+	FILE *fp = (FILE*)arg;
+	struct rtmsg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	int host_len = -1;
+	struct rtattr * tb[RTA_MAX+1];
+	char abuf[256];
+	SPRINT_BUF(b1);
+
+	if (n->nlmsg_type != RTM_NEWRULE)
+		return 0;
+
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0)
+		return -1;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len);
+
+	if (r->rtm_family == AF_INET)	/* full address width in bits for the family */
+		host_len = 32;
+	else if (r->rtm_family == AF_INET6)
+		host_len = 128;
+	else if (r->rtm_family == AF_DECnet)
+		host_len = 16;
+	else if (r->rtm_family == AF_IPX)
+		host_len = 80;
+
+	if (tb[RTA_PRIORITY])
+		fprintf(fp, "%u:\t", *(unsigned*)RTA_DATA(tb[RTA_PRIORITY]));
+	else
+		fprintf(fp, "0:\t");
+
+	if (tb[RTA_SRC]) {
+		if (r->rtm_src_len != host_len) {	/* not a host prefix: print addr/len */
+			fprintf(fp, "from %s/%u ", rt_addr_n2a(r->rtm_family,
+							 RTA_PAYLOAD(tb[RTA_SRC]),
+							 RTA_DATA(tb[RTA_SRC]),
+							 abuf, sizeof(abuf)),
+				r->rtm_src_len
+				);
+		} else {
+			fprintf(fp, "from %s ", format_host(r->rtm_family,
+						       RTA_PAYLOAD(tb[RTA_SRC]),
+						       RTA_DATA(tb[RTA_SRC]),
+						       abuf, sizeof(abuf))
+				);
+		}
+	} else if (r->rtm_src_len) {
+		fprintf(fp, "from 0/%d ", r->rtm_src_len);
+	} else {
+		fprintf(fp, "from all ");
+	}
+
+	if (tb[RTA_DST]) {
+		if (r->rtm_dst_len != host_len) {
+			fprintf(fp, "to %s/%u ", rt_addr_n2a(r->rtm_family,
+							 RTA_PAYLOAD(tb[RTA_DST]),
+							 RTA_DATA(tb[RTA_DST]),
+							 abuf, sizeof(abuf)),
+				r->rtm_dst_len
+				);
+		} else {
+			fprintf(fp, "to %s ", format_host(r->rtm_family,
+						       RTA_PAYLOAD(tb[RTA_DST]),
+						       RTA_DATA(tb[RTA_DST]),
+						       abuf, sizeof(abuf)));
+		}
+	} else if (r->rtm_dst_len) {
+		fprintf(fp, "to 0/%d ", r->rtm_dst_len);
+	}
+
+	if (r->rtm_tos) {
+		SPRINT_BUF(b1);
+		fprintf(fp, "tos %s ", rtnl_dsfield_n2a(r->rtm_tos, b1, sizeof(b1)));
+	}
+	if (tb[RTA_PROTOINFO]) {	/* this attribute is printed as the rule's fwmark */
+		fprintf(fp, "fwmark %8x ", *(__u32*)RTA_DATA(tb[RTA_PROTOINFO]));
+	}
+
+	if (tb[RTA_IIF]) {
+		fprintf(fp, "iif %s ", (char*)RTA_DATA(tb[RTA_IIF]));
+	}
+
+	if (r->rtm_table)
+		fprintf(fp, "lookup %s ", rtnl_rttable_n2a(r->rtm_table, b1, sizeof(b1)));
+
+	if (tb[RTA_FLOW]) {
+		__u32 to = *(__u32*)RTA_DATA(tb[RTA_FLOW]);
+		__u32 from = to>>16;	/* high 16 bits: source realm; low 16 bits: destination realm */
+		to &= 0xFFFF;
+		fprintf(fp, "realms ");	/* always emit the keyword, even when only a destination realm is set */
+		if (from)
+			fprintf(fp, "%s/",
+				rtnl_rtrealm_n2a(from, b1, sizeof(b1)));
+		fprintf(fp, "%s ",
+			rtnl_rtrealm_n2a(to, b1, sizeof(b1)));
+	}
+
+	if (r->rtm_type == RTN_NAT) {
+		if (tb[RTA_GATEWAY]) {
+			fprintf(fp, "map-to %s ",
+				format_host(r->rtm_family,
+					    RTA_PAYLOAD(tb[RTA_GATEWAY]),
+					    RTA_DATA(tb[RTA_GATEWAY]),
+					    abuf, sizeof(abuf)));
+		} else
+			fprintf(fp, "masquerade");
+	} else if (r->rtm_type != RTN_UNICAST)
+		fprintf(fp, "%s", rtnl_rtntype_n2a(r->rtm_type, b1, sizeof(b1)));
+
+	fprintf(fp, "\n");
+	fflush(fp);
+	return 0;
+}
+
+int iprule_list(int argc, char **argv)
+{	/* "ip rule list": dump all rules of the selected family (IPv4 if unspecified); returns 0 on success, -1 on bad usage, 1 on netlink failure. */
+	struct rtnl_handle rth;
+	int af = preferred_family;
+
+	if (af == AF_UNSPEC)
+		af = AF_INET;
+
+	if (argc > 0) {
+		fprintf(stderr, "\"ip rule show\" does not take any arguments.\n");
+		return -1;
+	}
+
+	if (rtnl_open(&rth, 0) < 0)
+		return 1;
+
+	if (rtnl_wilddump_request(&rth, af, RTM_GETRULE) < 0) {
+		perror("Cannot send dump request");
+		return 1;
+	}
+
+	if (rtnl_dump_filter(&rth, print_rule, stdout, NULL, NULL) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+/* "ip rule add" / "ip rule delete": build an rtmsg request of type cmd from the arguments and send it; returns 0 on success, 1 if the netlink socket cannot be opened, 2 if the request fails. */
+int iprule_modify(int cmd, int argc, char **argv)
+{
+ int table_ok = 0;
+ struct rtnl_handle rth;
+ struct {
+ struct nlmsghdr n;
+ struct rtmsg r;
+ char buf[1024];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+
+ req.n.nlmsg_type = cmd;
+ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.n.nlmsg_flags = NLM_F_REQUEST;
+ req.r.rtm_family = preferred_family;
+ req.r.rtm_protocol = RTPROT_BOOT;
+ req.r.rtm_scope = RT_SCOPE_UNIVERSE;
+ req.r.rtm_table = 0;
+ req.r.rtm_type = RTN_UNSPEC;
+
+ if (cmd == RTM_NEWRULE) { /* only "add" sets the create/exclusive flags and a unicast default type */
+ req.n.nlmsg_flags |= NLM_F_CREATE|NLM_F_EXCL;
+ req.r.rtm_type = RTN_UNICAST;
+ }
+
+ while (argc > 0) {
+ if (strcmp(*argv, "from") == 0) {
+ inet_prefix dst;
+ NEXT_ARG();
+ get_prefix(&dst, *argv, req.r.rtm_family);
+ req.r.rtm_src_len = dst.bitlen;
+ addattr_l(&req.n, sizeof(req), RTA_SRC, &dst.data, dst.bytelen);
+ } else if (strcmp(*argv, "to") == 0) {
+ inet_prefix dst;
+ NEXT_ARG();
+ get_prefix(&dst, *argv, req.r.rtm_family);
+ req.r.rtm_dst_len = dst.bitlen;
+ addattr_l(&req.n, sizeof(req), RTA_DST, &dst.data, dst.bytelen);
+ } else if (matches(*argv, "preference") == 0 ||
+ matches(*argv, "order") == 0 ||
+ matches(*argv, "priority") == 0) {
+ __u32 pref;
+ NEXT_ARG();
+ if (get_u32(&pref, *argv, 0))
+ invarg("preference value is invalid\n", *argv);
+ addattr32(&req.n, sizeof(req), RTA_PRIORITY, pref);
+ } else if (strcmp(*argv, "tos") == 0) {
+ __u32 tos;
+ NEXT_ARG();
+ if (rtnl_dsfield_a2n(&tos, *argv))
+ invarg("TOS value is invalid\n", *argv);
+ req.r.rtm_tos = tos;
+ } else if (strcmp(*argv, "fwmark") == 0) {
+ __u32 fwmark;
+ NEXT_ARG();
+ if (get_u32(&fwmark, *argv, 16)) /* NOTE(review): the 16 is presumably the numeric base (hex) - confirm against get_u32() */
+ invarg("fwmark value is invalid\n", *argv);
+ addattr32(&req.n, sizeof(req), RTA_PROTOINFO, fwmark); /* the mark is sent as RTA_PROTOINFO */
+ } else if (matches(*argv, "realms") == 0) {
+ __u32 realm;
+ NEXT_ARG();
+ if (get_rt_realms(&realm, *argv))
+ invarg("invalid realms\n", *argv);
+ addattr32(&req.n, sizeof(req), RTA_FLOW, realm);
+ } else if (matches(*argv, "table") == 0 ||
+ strcmp(*argv, "lookup") == 0) {
+ int tid;
+ NEXT_ARG();
+ if (rtnl_rttable_a2n(&tid, *argv))
+ invarg("invalid table ID\n", *argv);
+ req.r.rtm_table = tid;
+ table_ok = 1;
+ } else if (strcmp(*argv, "dev") == 0 ||
+ strcmp(*argv, "iif") == 0) {
+ NEXT_ARG();
+ addattr_l(&req.n, sizeof(req), RTA_IIF, *argv, strlen(*argv)+1); /* name is sent as a string, terminating NUL included */
+ } else if (strcmp(*argv, "nat") == 0 ||
+ matches(*argv, "map-to") == 0) {
+ NEXT_ARG();
+ addattr32(&req.n, sizeof(req), RTA_GATEWAY, get_addr32(*argv));
+ req.r.rtm_type = RTN_NAT;
+ } else {
+ int type; /* anything unrecognised is parsed as a rule type, optionally preceded by "type" */
+
+ if (strcmp(*argv, "type") == 0) {
+ NEXT_ARG();
+ }
+ if (matches(*argv, "help") == 0)
+ usage();
+ if (rtnl_rtntype_a2n(&type, *argv))
+ invarg("Failed to parse rule type", *argv);
+ req.r.rtm_type = type;
+ }
+ argc--;
+ argv++;
+ }
+
+ if (req.r.rtm_family == AF_UNSPEC)
+ req.r.rtm_family = AF_INET;
+
+ if (!table_ok && cmd == RTM_NEWRULE) /* "add" without an explicit table uses the main table */
+ req.r.rtm_table = RT_TABLE_MAIN;
+
+ if (rtnl_open(&rth, 0) < 0)
+ return 1;
+
+ if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+ return 2;
+
+ return 0;
+}
+
+int do_iprule(int argc, char **argv)
+{	/* Dispatch an "ip rule" sub-command; with no arguments, list the rules. */
+	if (argc < 1)
+		return iprule_list(0, NULL);
+
+	if (matches(argv[0], "list") == 0 || matches(argv[0], "lst") == 0 ||
+	    matches(argv[0], "show") == 0)
+		return iprule_list(argc-1, argv+1);
+	if (matches(argv[0], "add") == 0)
+		return iprule_modify(RTM_NEWRULE, argc-1, argv+1);
+	if (matches(argv[0], "delete") == 0)
+		return iprule_modify(RTM_DELRULE, argc-1, argv+1);
+	if (matches(argv[0], "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip rule help\".\n", argv[0]);
+	exit(-1);
+}
+
diff --git a/ip/iptunnel.c b/ip/iptunnel.c
index e69de29b..41c262b5 100644
--- a/ip/iptunnel.c
+++ b/ip/iptunnel.c
@@ -0,0 +1,581 @@
+/*
+ * iptunnel.c "ip tunnel"
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *
+ * Changes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses
+ * Rani Assaf <rani@magic.metawire.com> 980930: do not allow key for ipip/sit
+ * Phil Karn <karn@ka9q.ampr.org> 990408: "pmtudisc" flag
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <arpa/inet.h>
+#include <linux/if_tunnel.h>
+
+#include "rt_names.h"
+#include "utils.h"
+
+static void usage(void) __attribute__((noreturn));
+
+/* Write the "ip tunnel" synopsis to stderr, then exit with status -1. */
+static void usage(void)
+{
+	static const char *const text[] = {
+		"Usage: ip tunnel { add | change | del | show } [ NAME ]\n",
+		" [ mode { ipip | gre | sit } ] [ remote ADDR ] [ local ADDR ]\n",
+		" [ [i|o]seq ] [ [i|o]key KEY ] [ [i|o]csum ]\n",
+		" [ ttl TTL ] [ tos TOS ] [ [no]pmtudisc ] [ dev PHYS_DEV ]\n",
+		"\n",
+		"Where: NAME := STRING\n",
+		" ADDR := { IP_ADDRESS | any }\n",
+		" TOS := { NUMBER | inherit }\n",
+		" TTL := { 1..255 | inherit }\n",
+		" KEY := { DOTTED_QUAD | NUMBER }\n",
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(text) / sizeof(text[0]); i++)
+		fputs(text[i], stderr);
+	exit(-1);
+}
+
+/*
+ * Ask the kernel (SIOCGIFINDEX) for the interface index of device DEV.
+ *
+ * Returns the index stored by the ioctl, or 0 after printing a
+ * diagnostic when the socket cannot be created or the ioctl fails.
+ */
+static int do_ioctl_get_ifindex(char *dev)
+{
+	struct ifreq ifr;
+	int fd;
+	int err;
+
+	/* Zero-fill first so the bounded copy below is always NUL-terminated. */
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, dev, sizeof(ifr.ifr_name) - 1);
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		return 0;
+	}
+	err = ioctl(fd, SIOCGIFINDEX, &ifr);
+	if (err)
+		perror("ioctl");
+	/* Close on every path; the descriptor used to leak when ioctl failed. */
+	close(fd);
+	return err ? 0 : ifr.ifr_ifindex;
+}
+
+/*
+ * Ask the kernel (SIOCGIFHWADDR) for the hardware type of device DEV.
+ *
+ * Returns the sa_family field of the hardware address filled in by the
+ * ioctl, or -1 after printing a diagnostic when the socket cannot be
+ * created or the ioctl fails.
+ */
+static int do_ioctl_get_iftype(char *dev)
+{
+	struct ifreq ifr;
+	int fd;
+	int err;
+
+	/* Zero-fill first so the bounded copy below is always NUL-terminated. */
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, dev, sizeof(ifr.ifr_name) - 1);
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		return -1;
+	}
+	err = ioctl(fd, SIOCGIFHWADDR, &ifr);
+	if (err)
+		perror("ioctl");
+	/* Close on every path; the descriptor used to leak when ioctl failed. */
+	close(fd);
+	return err ? -1 : ifr.ifr_addr.sa_family;
+}
+
+
+/*
+ * Ask the kernel (SIOCGIFNAME) for the name of the interface with
+ * index IDX.
+ *
+ * Returns a pointer to the name held in a function-static ifreq, so the
+ * text is overwritten by the next call.  Returns NULL after printing a
+ * diagnostic when the socket cannot be created or the ioctl fails.
+ */
+static char * do_ioctl_get_ifname(int idx)
+{
+	static struct ifreq ifr;
+	int fd;
+	int err;
+
+	ifr.ifr_ifindex = idx;
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		return NULL;
+	}
+	err = ioctl(fd, SIOCGIFNAME, &ifr);
+	if (err)
+		perror("ioctl");
+	/* Close on every path; the descriptor used to leak when ioctl failed. */
+	close(fd);
+	return err ? NULL : ifr.ifr_name;
+}
+
+
+
+/*
+ * Fetch the parameters of a tunnel with SIOCGETTUNNEL.
+ *
+ * BASEDEV names the device the ioctl is issued on; the kernel's answer
+ * is written through P.  Returns the ioctl result (0 on success), or
+ * -1 when the control socket cannot be created.  A diagnostic is
+ * printed on failure.
+ */
+static int do_get_ioctl(char *basedev, struct ip_tunnel_parm *p)
+{
+	struct ifreq ifr;
+	int fd;
+	int err;
+
+	/*
+	 * Bounded copy into a zero-filled ifreq: BASEDEV may be longer than
+	 * an interface name, or a name field that lacks a terminating NUL.
+	 */
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, basedev, sizeof(ifr.ifr_name) - 1);
+	ifr.ifr_ifru.ifru_data = (void*)p;
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		return -1;
+	}
+	err = ioctl(fd, SIOCGETTUNNEL, &ifr);
+	if (err)
+		perror("ioctl");
+	close(fd);
+	return err;
+}
+
+/*
+ * Issue the tunnel ioctl CMD with parameters P.
+ *
+ * For SIOCCHGTUNNEL with a non-empty p->name the request is sent to
+ * that tunnel device; otherwise it is sent to BASEDEV.  Returns the
+ * ioctl result (0 on success), or -1 when the control socket cannot be
+ * created.  A diagnostic is printed on failure.
+ */
+static int do_add_ioctl(int cmd, char *basedev, struct ip_tunnel_parm *p)
+{
+	struct ifreq ifr;
+	const char *target;
+	int fd;
+	int err;
+
+	target = (cmd == SIOCCHGTUNNEL && p->name[0]) ? p->name : basedev;
+
+	/* Zero-fill first so the bounded copy below is always NUL-terminated. */
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, target, sizeof(ifr.ifr_name) - 1);
+	ifr.ifr_ifru.ifru_data = (void*)p;
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		return -1;
+	}
+	err = ioctl(fd, cmd, &ifr);
+	if (err)
+		perror("ioctl");
+	close(fd);
+	return err;
+}
+
+/*
+ * Delete a tunnel with SIOCDELTUNNEL.
+ *
+ * The request is sent to the device named in p->name when that is
+ * non-empty, otherwise to BASEDEV.  Returns the ioctl result (0 on
+ * success), or -1 when the control socket cannot be created.  A
+ * diagnostic is printed on failure.
+ */
+static int do_del_ioctl(char *basedev, struct ip_tunnel_parm *p)
+{
+	struct ifreq ifr;
+	const char *target = p->name[0] ? p->name : basedev;
+	int fd;
+	int err;
+
+	/* Zero-fill first so the bounded copy below is always NUL-terminated. */
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, target, sizeof(ifr.ifr_name) - 1);
+	ifr.ifr_ifru.ifru_data = (void*)p;
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		perror("socket");
+		return -1;
+	}
+	err = ioctl(fd, SIOCDELTUNNEL, &ifr);
+	if (err)
+		perror("ioctl");
+	close(fd);
+	return err;
+}
+
+/* Record PROTO as the requested tunnel mode; exit if another mode was already chosen. */
+static void set_tunnel_mode(struct ip_tunnel_parm *p, int proto)
+{
+	if (p->iph.protocol && p->iph.protocol != proto) {
+		fprintf(stderr,"You managed to ask for more than one tunnel mode.\n");
+		exit(-1);
+	}
+	p->iph.protocol = proto;
+}
+
+/* Parse a GRE key written as a dotted quad or as a number; exit on a bad number. */
+static __u32 parse_tunnel_key(char *arg, const char *keyword)
+{
+	unsigned uval;
+
+	if (strchr(arg, '.'))
+		return get_addr32(arg);
+	if (get_unsigned(&uval, arg, 0) < 0) {
+		fprintf(stderr, "invalid value of \"%s\"\n", keyword);
+		exit(-1);
+	}
+	return htonl(uval);
+}
+
+/*
+ * Translate the "ip tunnel" command-line words in ARGV into *P.
+ *
+ * P is cleared first and preset to IPv4, header length 5, DF set.
+ * A word that is not a recognised keyword is taken as the tunnel name
+ * (optionally introduced by "name").  For SIOCCHGTUNNEL, when the name
+ * is the first word, the tunnel's current parameters are fetched and
+ * used as the starting point for the remaining options.
+ *
+ * After the scan the mode is guessed from a "gre"/"ipip"/"sit" name
+ * prefix if none was given, keys are rejected for ipip and sit, "dev"
+ * is resolved to an interface index, and a multicast remote address
+ * supplies missing keys and requires a local address.
+ *
+ * Returns 0 on success and -1 on failure; several malformed-input
+ * cases print a message and exit instead of returning.
+ */
+static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p)
+{
+	int count = 0;
+	char medium[IFNAMSIZ];
+
+	memset(p, 0, sizeof(*p));
+	memset(&medium, 0, sizeof(medium));
+
+	p->iph.version = 4;
+	p->iph.ihl = 5;
+#ifndef IP_DF
+#define IP_DF 0x4000 /* Flag: "Don't Fragment" */
+#endif
+	p->iph.frag_off = htons(IP_DF);
+
+	while (argc > 0) {
+		if (strcmp(*argv, "mode") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "ipip") == 0 ||
+			    strcmp(*argv, "ip/ip") == 0) {
+				set_tunnel_mode(p, IPPROTO_IPIP);
+			} else if (strcmp(*argv, "gre") == 0 ||
+				   strcmp(*argv, "gre/ip") == 0) {
+				set_tunnel_mode(p, IPPROTO_GRE);
+			} else if (strcmp(*argv, "sit") == 0 ||
+				   strcmp(*argv, "ipv6/ip") == 0) {
+				set_tunnel_mode(p, IPPROTO_IPV6);
+			} else {
+				fprintf(stderr,"Cannot guess tunnel mode.\n");
+				exit(-1);
+			}
+		} else if (strcmp(*argv, "key") == 0) {
+			NEXT_ARG();
+			p->i_flags |= GRE_KEY;
+			p->o_flags |= GRE_KEY;
+			p->i_key = p->o_key = parse_tunnel_key(*argv, "key");
+		} else if (strcmp(*argv, "ikey") == 0) {
+			NEXT_ARG();
+			p->i_flags |= GRE_KEY;
+			/* The dotted-quad form used to be stored in o_key by mistake. */
+			p->i_key = parse_tunnel_key(*argv, "ikey");
+		} else if (strcmp(*argv, "okey") == 0) {
+			NEXT_ARG();
+			p->o_flags |= GRE_KEY;
+			p->o_key = parse_tunnel_key(*argv, "okey");
+		} else if (strcmp(*argv, "seq") == 0) {
+			p->i_flags |= GRE_SEQ;
+			p->o_flags |= GRE_SEQ;
+		} else if (strcmp(*argv, "iseq") == 0) {
+			p->i_flags |= GRE_SEQ;
+		} else if (strcmp(*argv, "oseq") == 0) {
+			p->o_flags |= GRE_SEQ;
+		} else if (strcmp(*argv, "csum") == 0) {
+			p->i_flags |= GRE_CSUM;
+			p->o_flags |= GRE_CSUM;
+		} else if (strcmp(*argv, "icsum") == 0) {
+			p->i_flags |= GRE_CSUM;
+		} else if (strcmp(*argv, "ocsum") == 0) {
+			p->o_flags |= GRE_CSUM;
+		} else if (strcmp(*argv, "nopmtudisc") == 0) {
+			p->iph.frag_off = 0;
+		} else if (strcmp(*argv, "pmtudisc") == 0) {
+			p->iph.frag_off = htons(IP_DF);
+		} else if (strcmp(*argv, "remote") == 0) {
+			NEXT_ARG();
+			/* "any" leaves the address at zero. */
+			if (strcmp(*argv, "any"))
+				p->iph.daddr = get_addr32(*argv);
+		} else if (strcmp(*argv, "local") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "any"))
+				p->iph.saddr = get_addr32(*argv);
+		} else if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			/* medium was zero-filled above, so it stays terminated. */
+			strncpy(medium, *argv, IFNAMSIZ-1);
+		} else if (strcmp(*argv, "ttl") == 0) {
+			unsigned uval;
+			NEXT_ARG();
+			/* "inherit" leaves ttl at zero. */
+			if (strcmp(*argv, "inherit") != 0) {
+				if (get_unsigned(&uval, *argv, 0))
+					invarg("invalid TTL\n", *argv);
+				if (uval > 255)
+					invarg("TTL must be <=255\n", *argv);
+				p->iph.ttl = uval;
+			}
+		} else if (strcmp(*argv, "tos") == 0 ||
+			   matches(*argv, "dsfield") == 0) {
+			__u32 uval;
+			NEXT_ARG();
+			if (strcmp(*argv, "inherit") != 0) {
+				if (rtnl_dsfield_a2n(&uval, *argv))
+					invarg("bad TOS value", *argv);
+				p->iph.tos = uval;
+			} else
+				p->iph.tos = 1;	/* bit 0 is shown as "inherit" by print_tunnel() */
+		} else {
+			if (strcmp(*argv, "name") == 0) {
+				NEXT_ARG();
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (p->name[0])
+				duparg2("name", *argv);
+			/* Leave room for, and force, the terminating NUL. */
+			strncpy(p->name, *argv, IFNAMSIZ - 1);
+			p->name[IFNAMSIZ - 1] = '\0';
+			if (cmd == SIOCCHGTUNNEL && count == 0) {
+				struct ip_tunnel_parm old_p;
+				memset(&old_p, 0, sizeof(old_p));
+				if (do_get_ioctl(*argv, &old_p))
+					return -1;
+				*p = old_p;
+			}
+		}
+		count++;
+		argc--; argv++;
+	}
+
+
+	if (p->iph.protocol == 0) {
+		if (memcmp(p->name, "gre", 3) == 0)
+			p->iph.protocol = IPPROTO_GRE;
+		else if (memcmp(p->name, "ipip", 4) == 0)
+			p->iph.protocol = IPPROTO_IPIP;
+		else if (memcmp(p->name, "sit", 3) == 0)
+			p->iph.protocol = IPPROTO_IPV6;
+	}
+
+	if (p->iph.protocol == IPPROTO_IPIP || p->iph.protocol == IPPROTO_IPV6) {
+		if ((p->i_flags & GRE_KEY) || (p->o_flags & GRE_KEY)) {
+			fprintf(stderr, "Keys are not allowed with ipip and sit.\n");
+			return -1;
+		}
+	}
+
+	if (medium[0]) {
+		p->link = do_ioctl_get_ifindex(medium);
+		if (p->link == 0)
+			return -1;
+	}
+
+	if (p->i_key == 0 && IN_MULTICAST(ntohl(p->iph.daddr))) {
+		p->i_key = p->iph.daddr;
+		p->i_flags |= GRE_KEY;
+	}
+	if (p->o_key == 0 && IN_MULTICAST(ntohl(p->iph.daddr))) {
+		p->o_key = p->iph.daddr;
+		p->o_flags |= GRE_KEY;
+	}
+	if (IN_MULTICAST(ntohl(p->iph.daddr)) && !p->iph.saddr) {
+		fprintf(stderr, "Broadcast tunnel requires a source address.\n");
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * Handle "ip tunnel add" and "ip tunnel change".
+ *
+ * CMD is the tunnel ioctl to issue (the callers pass SIOCADDTUNNEL or
+ * SIOCCHGTUNNEL).  The arguments are parsed, a fixed TTL combined with
+ * path-MTU discovery switched off is rejected, and the request is sent
+ * to the base device of the selected mode.
+ *
+ * Returns the ioctl result, or -1 on a parse error, on the ttl
+ * conflict, or when no tunnel mode could be determined.
+ */
+static int do_add(int cmd, int argc, char **argv)
+{
+	struct ip_tunnel_parm p;
+
+	if (parse_args(argc, argv, cmd, &p) < 0)
+		return -1;
+
+	if (p.iph.ttl && p.iph.frag_off == 0) {
+		/* The option is spelled "nopmtudisc"; the message used to misspell it. */
+		fprintf(stderr, "ttl != 0 and nopmtudisc are incompatible\n");
+		return -1;
+	}
+
+	switch (p.iph.protocol) {
+	case IPPROTO_IPIP:
+		return do_add_ioctl(cmd, "tunl0", &p);
+	case IPPROTO_GRE:
+		return do_add_ioctl(cmd, "gre0", &p);
+	case IPPROTO_IPV6:
+		return do_add_ioctl(cmd, "sit0", &p);
+	default:
+		fprintf(stderr, "cannot determine tunnel mode (ipip, gre or sit)\n");
+		return -1;
+	}
+}
+
+/*
+ * Handle "ip tunnel del": parse the arguments, then ask the kernel to
+ * remove the tunnel.  The fallback device for the request is the base
+ * device of the selected mode, or the tunnel's own name when no mode
+ * is known.  Returns the ioctl result, or -1 on a parse error.
+ */
+int do_del(int argc, char **argv)
+{
+	struct ip_tunnel_parm p;
+	char *basedev;
+
+	if (parse_args(argc, argv, SIOCDELTUNNEL, &p) < 0)
+		return -1;
+
+	if (p.iph.protocol == IPPROTO_IPIP)
+		basedev = "tunl0";
+	else if (p.iph.protocol == IPPROTO_GRE)
+		basedev = "gre0";
+	else if (p.iph.protocol == IPPROTO_IPV6)
+		basedev = "sit0";
+	else
+		basedev = p.name;
+
+	return do_del_ioctl(basedev, &p);
+}
+
+/*
+ * Print a description of tunnel P on stdout: name, mode, endpoints,
+ * underlying device, ttl, tos, pmtudisc setting, GRE keys and the GRE
+ * sequence/checksum flags.
+ *
+ * No final newline is written; the caller ends the record.  The flag
+ * descriptions are each introduced by _SL_ so that the record can stay
+ * on one line when _SL_ is not a newline.
+ */
+void print_tunnel(struct ip_tunnel_parm *p)
+{
+	char s1[1024];
+	char s2[1024];
+	char s3[64];
+	char s4[64];
+
+	/* Keys are displayed in dotted-quad form. */
+	inet_ntop(AF_INET, &p->i_key, s3, sizeof(s3));
+	inet_ntop(AF_INET, &p->o_key, s4, sizeof(s4));
+
+	/* Do not use format_host() for local addr,
+	 * symbolic name will not be useful.
+	 */
+	printf("%s: %s/ip remote %s local %s ",
+	       p->name,
+	       p->iph.protocol == IPPROTO_IPIP ? "ip" :
+	       (p->iph.protocol == IPPROTO_GRE ? "gre" :
+		(p->iph.protocol == IPPROTO_IPV6 ? "ipv6" : "unknown")),
+	       p->iph.daddr ? format_host(AF_INET, 4, &p->iph.daddr, s1, sizeof(s1)) : "any",
+	       p->iph.saddr ? rt_addr_n2a(AF_INET, 4, &p->iph.saddr, s2, sizeof(s2)) : "any");
+
+	if (p->link) {
+		char *n = do_ioctl_get_ifname(p->link);
+		if (n)
+			printf(" dev %s ", n);
+	}
+
+	if (p->iph.ttl)
+		printf(" ttl %d ", p->iph.ttl);
+	else
+		printf(" ttl inherit ");
+
+	if (p->iph.tos) {
+		SPRINT_BUF(b1);
+		printf(" tos");
+		/* Bit 0 means "inherit"; the remaining bits are the dsfield value. */
+		if (p->iph.tos&1)
+			printf(" inherit");
+		if (p->iph.tos&~1)
+			printf("%c%s ", p->iph.tos&1 ? '/' : ' ',
+			       rtnl_dsfield_n2a(p->iph.tos&~1, b1, sizeof(b1)));
+	}
+
+	if (!(p->iph.frag_off&htons(IP_DF)))
+		printf(" nopmtudisc");
+
+	if ((p->i_flags&GRE_KEY) && (p->o_flags&GRE_KEY) && p->o_key == p->i_key)
+		printf(" key %s", s3);
+	else if ((p->i_flags|p->o_flags)&GRE_KEY) {
+		if (p->i_flags&GRE_KEY)
+			printf(" ikey %s ", s3);
+		if (p->o_flags&GRE_KEY)
+			printf(" okey %s ", s4);
+	}
+
+	/*
+	 * This line used to end in a hard "\n", unlike its three siblings,
+	 * which split the record even when _SL_ keeps it on one line.
+	 */
+	if (p->i_flags&GRE_SEQ)
+		printf("%s Drop packets out of sequence.", _SL_);
+	if (p->i_flags&GRE_CSUM)
+		printf("%s Checksum in received packet is required.", _SL_);
+	if (p->o_flags&GRE_SEQ)
+		printf("%s Sequence packets on output.", _SL_);
+	if (p->o_flags&GRE_CSUM)
+		printf("%s Checksum output packets.", _SL_);
+}
+
+/*
+ * List the tunnel devices found in /proc/net/dev that match filter P.
+ *
+ * Each line after the two header lines is split at ':' into a device
+ * name and its counters.  Devices whose hardware type is not
+ * ARPHRD_TUNNEL, ARPHRD_IPGRE or ARPHRD_SIT are skipped, as are tunnels
+ * that differ from P in any field P sets (link, name, remote, local,
+ * input key).  Matching tunnels are printed, followed by their
+ * counters when show_stats is set.
+ *
+ * Returns 0 on success, -1 when the file cannot be opened or a line
+ * has no ':' separated device name.
+ */
+static int do_tunnels_list(struct ip_tunnel_parm *p)
+{
+	char name[IFNAMSIZ];
+	unsigned long rx_bytes, rx_packets, rx_errs, rx_drops,
+	rx_fifo, rx_frame,
+	tx_bytes, tx_packets, tx_errs, tx_drops,
+	tx_fifo, tx_colls, tx_carrier, rx_multi;
+	int type;
+	int rc = 0;
+	struct ip_tunnel_parm p1;
+
+	char buf[512];
+	char word[sizeof(buf)];
+	FILE *fp = fopen("/proc/net/dev", "r");
+	if (fp == NULL) {
+		perror("fopen");
+		return -1;
+	}
+
+	/* Skip the two header lines; nothing to list if they are missing. */
+	if (fgets(buf, sizeof(buf), fp) == NULL ||
+	    fgets(buf, sizeof(buf), fp) == NULL)
+		goto out;
+
+	while (fgets(buf, sizeof(buf), fp) != NULL) {
+		char *ptr;
+		buf[sizeof(buf) - 1] = 0;
+		/* word is as large as buf, so this unbounded %s cannot overflow it. */
+		if ((ptr = strchr(buf, ':')) == NULL ||
+		    (*ptr++ = 0, sscanf(buf, "%s", word) != 1)) {
+			fprintf(stderr, "Wrong format of /proc/net/dev. Sorry.\n");
+			rc = -1;
+			break;
+		}
+		/* Too long to be an interface name: cannot be one of ours. */
+		if (strlen(word) >= sizeof(name))
+			continue;
+		strcpy(name, word);
+		/* The counters are unsigned long, so scan and print them as %lu. */
+		if (sscanf(ptr, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu",
+			   &rx_bytes, &rx_packets, &rx_errs, &rx_drops,
+			   &rx_fifo, &rx_frame, &rx_multi,
+			   &tx_bytes, &tx_packets, &tx_errs, &tx_drops,
+			   &tx_fifo, &tx_colls, &tx_carrier) != 14)
+			continue;
+		if (p->name[0] && strcmp(p->name, name))
+			continue;
+		type = do_ioctl_get_iftype(name);
+		if (type == -1) {
+			fprintf(stderr, "Failed to get type of [%s]\n", name);
+			continue;
+		}
+		if (type != ARPHRD_TUNNEL && type != ARPHRD_IPGRE && type != ARPHRD_SIT)
+			continue;
+		memset(&p1, 0, sizeof(p1));
+		if (do_get_ioctl(name, &p1))
+			continue;
+		if ((p->link && p1.link != p->link) ||
+		    (p->name[0] && strcmp(p1.name, p->name)) ||
+		    (p->iph.daddr && p1.iph.daddr != p->iph.daddr) ||
+		    (p->iph.saddr && p1.iph.saddr != p->iph.saddr) ||
+		    (p->i_key && p1.i_key != p->i_key))
+			continue;
+		print_tunnel(&p1);
+		if (show_stats) {
+			printf("%s", _SL_);
+			printf("RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts%s", _SL_);
+			printf(" %-10lu %-12lu %-6lu %-8lu %-8lu %-8lu%s",
+			       rx_packets, rx_bytes, rx_errs, rx_frame, rx_fifo, rx_multi, _SL_);
+			printf("TX: Packets Bytes Errors DeadLoop NoRoute NoBufs%s", _SL_);
+			printf(" %-10lu %-12lu %-6lu %-8lu %-8lu %-6lu",
+			       tx_packets, tx_bytes, tx_errs, tx_colls, tx_carrier, tx_drops);
+		}
+		printf("\n");
+	}
+
+out:
+	/* The stream used to be left open on every return path. */
+	fclose(fp);
+	return rc;
+}
+
+/*
+ * Handle "ip tunnel show".
+ *
+ * When the arguments select a tunnel mode, one tunnel is queried (the
+ * named one, or the base device of that mode) and printed.  Otherwise
+ * every matching tunnel is listed.  Returns 0 on success and -1 on a
+ * parse error or a failed query; the result of the listing itself is
+ * not propagated.
+ */
+static int do_show(int argc, char **argv)
+{
+	struct ip_tunnel_parm p;
+	char *basedev;
+
+	if (parse_args(argc, argv, SIOCGETTUNNEL, &p) < 0)
+		return -1;
+
+	if (p.iph.protocol == IPPROTO_IPIP) {
+		basedev = "tunl0";
+	} else if (p.iph.protocol == IPPROTO_GRE) {
+		basedev = "gre0";
+	} else if (p.iph.protocol == IPPROTO_IPV6) {
+		basedev = "sit0";
+	} else {
+		do_tunnels_list(&p);
+		return 0;
+	}
+
+	if (do_get_ioctl(p.name[0] ? p.name : basedev, &p))
+		return -1;
+
+	print_tunnel(&p);
+	printf("\n");
+	return 0;
+}
+
+/*
+ * Entry point for "ip tunnel": dispatch on the sub-command in argv[0].
+ *
+ * No arguments means "show".  "add" and "change" share do_add() with
+ * the matching ioctl number, "del" goes to do_del(), "show"/"lst"/
+ * "list" go to do_show(), "help" prints the usage text.  Anything else
+ * is reported on stderr and the process exits with status -1.
+ */
+int do_iptunnel(int argc, char **argv)
+{
+	char *verb;
+
+	if (argc <= 0)
+		return do_show(0, NULL);
+
+	verb = *argv;
+
+	if (!matches(verb, "add"))
+		return do_add(SIOCADDTUNNEL, argc - 1, argv + 1);
+	if (!matches(verb, "change"))
+		return do_add(SIOCCHGTUNNEL, argc - 1, argv + 1);
+	if (!matches(verb, "del"))
+		return do_del(argc - 1, argv + 1);
+	if (!matches(verb, "show") || !matches(verb, "lst") || !matches(verb, "list"))
+		return do_show(argc - 1, argv + 1);
+	if (!matches(verb, "help"))
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip tunnel help\".\n", verb);
+	exit(-1);
+}
diff --git a/ip/routef b/ip/routef
index e69de29b..db43b5df 100644
--- a/ip/routef
+++ b/ip/routef
@@ -0,0 +1,3 @@
+#! /bin/sh
+
+# Replace this shell with "ip", asking it to flush the IPv4 routes
+# selected by "scope global type unicast".
+exec ip -4 ro flush scope global type unicast
diff --git a/ip/routel b/ip/routel
index e69de29b..8d1d352a 100644
--- a/ip/routel
+++ b/ip/routel
@@ -0,0 +1,60 @@
+#!/bin/sh
+#$Id$
+
+#
+# Script created by: Stephen R. van den Berg <srb@cuci.nl>, 1999/04/18
+# Donated to the public domain.
+#
+# This script transforms the output of "ip" into more readable text.
+# "ip" is the Linux-advanced-routing configuration tool part of the
+# iproute package.
+#
+
+# "-h" as the first argument prints the usage line and exits with status 64.
+test "X-h" = "X$1" && echo "Usage: $0 [tablenr [raw ip args...]]" && exit 64
+
+# With no arguments at all, pass 0 as the table argument to ip.
+test -z "$*" && set 0
+
+# Stage 1 (shell loop): split every line of "ip route list" into its
+# first word and a list of "key value" pairs, and print the seven
+# columns in a fixed order.  Stage 2 (awk): lay the columns out under a
+# header line.
+ip route list table "$@" |
+ while read network rest
+ do set xx $rest
+ shift
+ proto=""
+ via=""
+ dev=""
+ scope=""
+ src=""
+ table=""
+ # When the line starts with one of these route types, show the
+ # type in the gateway column and take the next word as the target.
+ case $network in
+ broadcast|local|unreachable) via=$network
+ network=$1
+ shift
+ ;;
+ esac
+ # Each remaining "key value" pair becomes a shell variable.
+ # NOTE(review): eval runs text taken from ip output, and "shift 2"
+ # presumes an even number of remaining words; confirm both hold
+ # for every kind of route line.
+ while test $# != 0
+ do
+ key=$1
+ val=$2
+ eval "$key=$val"
+ shift 2
+ done
+ echo "$network $via $src $proto $scope $dev $table"
+ done | awk -F ' ' '
+BEGIN {
+ format="%15s%-3s %15s %15s %8s %8s%7s %s\n";
+ printf(format,"target","","gateway","source","proto","scope","dev","tbl");
+ }
+ { network=$1;
+ mask="";
+ # Move an optional /prefix out of the target into the mask column.
+ if(match(network,"/"))
+ { mask=" "substr(network,RSTART+1);
+ network=substr(network,0,RSTART);
+ }
+ via=$2;
+ src=$3;
+ proto=$4;
+ scope=$5;
+ dev=$6;
+ table=$7;
+ printf(format,network,mask,via,src,proto,scope,dev,table);
+ }
+'
diff --git a/ip/rtm_map.c b/ip/rtm_map.c
index e69de29b..21e818b4 100644
--- a/ip/rtm_map.c
+++ b/ip/rtm_map.c
@@ -0,0 +1,116 @@
+/*
+ * rtm_map.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include "rt_names.h"
+#include "utils.h"
+
+/*
+ * Return the textual name of route type ID.
+ *
+ * Known types yield a pointer to a constant string and leave BUF
+ * untouched; any other value is formatted as a decimal number into
+ * BUF (at most LEN bytes) and BUF is returned.
+ */
+char *rtnl_rtntype_n2a(int id, char *buf, int len)
+{
+	static const struct {
+		int type;
+		char *name;
+	} known[] = {
+		{ RTN_UNSPEC,      "none" },
+		{ RTN_UNICAST,     "unicast" },
+		{ RTN_LOCAL,       "local" },
+		{ RTN_BROADCAST,   "broadcast" },
+		{ RTN_ANYCAST,     "anycast" },
+		{ RTN_MULTICAST,   "multicast" },
+		{ RTN_BLACKHOLE,   "blackhole" },
+		{ RTN_UNREACHABLE, "unreachable" },
+		{ RTN_PROHIBIT,    "prohibit" },
+		{ RTN_THROW,       "throw" },
+		{ RTN_NAT,         "nat" },
+		{ RTN_XRESOLVE,    "xresolve" },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (known[i].type == id)
+			return known[i].name;
+	}
+
+	snprintf(buf, len, "%d", id);
+	return buf;
+}
+
+
+/*
+ * Convert the route type named by ARG into its numeric value.
+ *
+ * Keywords are tried in a fixed order; some must be spelled exactly
+ * (strcmp), the others are checked with matches().  Anything that is
+ * not a keyword must be a complete number, in any base strtoul()
+ * accepts with base 0, no larger than 255.
+ *
+ * Returns 0 and stores the value in *ID, or -1 (leaving *ID alone)
+ * when ARG is neither a keyword nor a valid number.
+ */
+int rtnl_rtntype_a2n(int *id, char *arg)
+{
+	static const struct {
+		char *word;
+		int exact;	/* non-zero: compare with strcmp() instead of matches() */
+		int type;
+	} keywords[] = {
+		{ "local",       1, RTN_LOCAL },
+		{ "nat",         1, RTN_NAT },
+		{ "broadcast",   0, RTN_BROADCAST },
+		{ "brd",         1, RTN_BROADCAST },
+		{ "anycast",     0, RTN_ANYCAST },
+		{ "multicast",   0, RTN_MULTICAST },
+		{ "prohibit",    0, RTN_PROHIBIT },
+		{ "unreachable", 0, RTN_UNREACHABLE },
+		{ "blackhole",   0, RTN_BLACKHOLE },
+		{ "xresolve",    0, RTN_XRESOLVE },
+		{ "unicast",     0, RTN_UNICAST },
+		{ "throw",       1, RTN_THROW },
+	};
+	unsigned long value;
+	char *stop;
+	size_t i;
+
+	/* The table order is the lookup order. */
+	for (i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++) {
+		int differs = keywords[i].exact ? strcmp(arg, keywords[i].word)
+						: matches(arg, keywords[i].word);
+		if (differs == 0) {
+			*id = keywords[i].type;
+			return 0;
+		}
+	}
+
+	value = strtoul(arg, &stop, 0);
+	if (!stop || stop == arg || *stop || value > 255)
+		return -1;
+
+	*id = value;
+	return 0;
+}
+
+/*
+ * Parse a realm specification of the form "A/B" or "B".
+ *
+ * With a slash, the part before it is converted by rtnl_rtrealm_a2n()
+ * and placed in the upper 16 bits of *REALMS.  The part after the
+ * slash (or the whole string when there is none) is converted the same
+ * way and OR-ed into the result; an empty part contributes 0.  ARG is
+ * modified temporarily while parsing and restored before returning.
+ *
+ * Returns 0 on success, -1 if either part fails to convert.
+ */
+int get_rt_realms(__u32 *realms, char *arg)
+{
+	char *slash = strchr(arg, '/');
+	__u32 low = 0;
+
+	*realms = 0;
+
+	if (slash != NULL) {
+		int failed;
+
+		/* Cut the string at the slash just for this one conversion. */
+		*slash = '\0';
+		failed = rtnl_rtrealm_a2n(realms, arg);
+		*slash = '/';
+		if (failed)
+			return -1;
+
+		*realms <<= 16;
+		arg = slash + 1;
+	}
+
+	if (*arg != '\0' && rtnl_rtrealm_a2n(&low, arg) != 0)
+		return -1;
+
+	*realms |= low;
+	return 0;
+}
diff --git a/ip/rtmon.c b/ip/rtmon.c
index e69de29b..d01bc635 100644
--- a/ip/rtmon.c
+++ b/ip/rtmon.c
@@ -0,0 +1,177 @@
+/*
+ * rtmon.c RTnetlink listener.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <string.h>
+
+#include "SNAPSHOT.h"
+
+#include "utils.h"
+#include "libnetlink.h"
+
+int resolve_hosts = 0;
+static int init_phase = 1;
+
+/*
+ * Append a timestamp record to FP.
+ *
+ * The record is laid out like a netlink message: a header with type
+ * 15, zero flags, sequence and pid, followed by two 32-bit words
+ * holding the seconds and microseconds returned by gettimeofday().
+ */
+static void write_stamp(FILE *fp)
+{
+	char buf[128];
+	struct nlmsghdr *hdr = (void*)buf;
+	struct timeval now;
+	__u32 *stamp;
+
+	gettimeofday(&now, NULL);
+
+	hdr->nlmsg_len = NLMSG_LENGTH(4*2);
+	hdr->nlmsg_type = 15;
+	hdr->nlmsg_flags = 0;
+	hdr->nlmsg_seq = 0;
+	hdr->nlmsg_pid = 0;
+
+	stamp = NLMSG_DATA(hdr);
+	stamp[0] = now.tv_sec;
+	stamp[1] = now.tv_usec;
+
+	fwrite((void*)hdr, 1, NLMSG_ALIGN(hdr->nlmsg_len), fp);
+}
+
+/*
+ * Message callback: append netlink message N to the FILE passed in ARG
+ * and flush it.  Once the initial dump is over (init_phase cleared)
+ * each message is preceded by a timestamp record.  WHO is not used.
+ * Always returns 0.
+ */
+static int dump_msg(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *out = arg;
+
+	if (init_phase == 0)
+		write_stamp(out);
+
+	fwrite(n, 1, NLMSG_ALIGN(n->nlmsg_len), out);
+	fflush(out);
+	return 0;
+}
+
+/* Write the rtmon synopsis to stderr, then exit with status -1. */
+void usage(void)
+{
+	fputs("Usage: rtmon file FILE [ all | LISTofOBJECTS]\n", stderr);
+	fputs("LISTofOBJECTS := [ link ] [ address ] [ route ]\n", stderr);
+	exit(-1);
+}
+
+/*
+ * rtmon: record rtnetlink traffic into a file.
+ *
+ * The command line selects an address family ("-family X", "-4", "-6",
+ * "-0"), the output file ("file FILE", required) and the objects to
+ * monitor ("link", "address", "route", or "all").  Naming any object
+ * replaces the default of listening to every multicast group by the
+ * groups of the named objects; address and route groups are further
+ * limited by the chosen family.
+ *
+ * After opening the file and the netlink socket, a dump of all links
+ * is requested and written out behind one leading timestamp record,
+ * then every message received afterwards is appended, each with its
+ * own timestamp.
+ */
+int
+main(int argc, char **argv)
+{
+	FILE *fp;
+	struct rtnl_handle rth;
+	int family = AF_UNSPEC;
+	unsigned groups = ~0U;
+	int llink = 0;
+	int laddr = 0;
+	int lroute = 0;
+	char *file = NULL;
+
+	while (argc > 1) {
+		if (matches(argv[1], "-family") == 0) {
+			argc--;
+			argv++;
+			if (argc <= 1)
+				usage();
+			if (strcmp(argv[1], "inet") == 0)
+				family = AF_INET;
+			else if (strcmp(argv[1], "inet6") == 0)
+				family = AF_INET6;
+			else if (strcmp(argv[1], "link") == 0)
+				/*
+				 * Same family as the "-0" shortcut below; this
+				 * used to be a copy of the inet6 branch.
+				 */
+				family = AF_PACKET;
+			else if (strcmp(argv[1], "help") == 0)
+				usage();
+			else {
+				fprintf(stderr, "Protocol ID \"%s\" is unknown, try \"rtmon help\".\n", argv[1]);
+				exit(-1);
+			}
+		} else if (strcmp(argv[1], "-4") == 0) {
+			family = AF_INET;
+		} else if (strcmp(argv[1], "-6") == 0) {
+			family = AF_INET6;
+		} else if (strcmp(argv[1], "-0") == 0) {
+			family = AF_PACKET;
+		} else if (matches(argv[1], "-Version") == 0) {
+			printf("rtmon utility, iproute2-ss%s\n", SNAPSHOT);
+			exit(0);
+		} else if (matches(argv[1], "file") == 0) {
+			argc--;
+			argv++;
+			if (argc <= 1)
+				usage();
+			file = argv[1];
+		} else if (matches(argv[1], "link") == 0) {
+			llink=1;
+			groups = 0;
+		} else if (matches(argv[1], "address") == 0) {
+			laddr=1;
+			groups = 0;
+		} else if (matches(argv[1], "route") == 0) {
+			lroute=1;
+			groups = 0;
+		} else if (strcmp(argv[1], "all") == 0) {
+			groups = ~0U;
+		} else if (matches(argv[1], "help") == 0) {
+			usage();
+		} else {
+			fprintf(stderr, "Argument \"%s\" is unknown, try \"rtmon help\".\n", argv[1]);
+			exit(-1);
+		}
+		argc--; argv++;
+	}
+
+	if (file == NULL) {
+		fprintf(stderr, "Not enough information: argument \"file\" is required\n");
+		exit(-1);
+	}
+
+	/* Add the groups of every object named on the command line. */
+	if (llink)
+		groups |= RTMGRP_LINK;
+	if (laddr) {
+		if (!family || family == AF_INET)
+			groups |= RTMGRP_IPV4_IFADDR;
+		if (!family || family == AF_INET6)
+			groups |= RTMGRP_IPV6_IFADDR;
+	}
+	if (lroute) {
+		if (!family || family == AF_INET)
+			groups |= RTMGRP_IPV4_ROUTE;
+		if (!family || family == AF_INET6)
+			groups |= RTMGRP_IPV6_ROUTE;
+	}
+
+	fp = fopen(file, "w");
+	if (fp == NULL) {
+		perror("Cannot fopen");
+		exit(-1);
+	}
+
+	if (rtnl_open(&rth, groups) < 0)
+		exit(1);
+
+	if (rtnl_wilddump_request(&rth, AF_UNSPEC, RTM_GETLINK) < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	/* One stamp in front of the whole initial dump. */
+	write_stamp(fp);
+
+	if (rtnl_dump_filter(&rth, dump_msg, fp, NULL, NULL) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		return 1;
+	}
+
+	/* From here on dump_msg() stamps every message individually. */
+	init_phase = 0;
+
+	if (rtnl_listen(&rth, dump_msg, (void*)fp) < 0)
+		exit(2);
+
+	exit(0);
+}
diff --git a/ip/rtpr b/ip/rtpr
index e69de29b..c3629fd6 100644
--- a/ip/rtpr
+++ b/ip/rtpr
@@ -0,0 +1,4 @@
+#! /bin/bash
+
+exec tr "[\\\\]" "[
+]"
diff --git a/lib/Makefile b/lib/Makefile
index e69de29b..bc270bff 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -0,0 +1,18 @@
+
+# Objects archived into libutil.a.
+UTILOBJ=utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o inet_proto.o
+
+# Objects archived into libnetlink.a.
+NLOBJ=ll_map.o libnetlink.o
+
+# Default target: build both static libraries.
+all: libnetlink.a libutil.a
+
+libnetlink.a: $(NLOBJ)
+ $(AR) rcs $@ $(NLOBJ)
+
+# ADDLIB is not set in this file; presumably the including build
+# supplies extra objects through it (TODO confirm).
+libutil.a: $(UTILOBJ) $(ADDLIB)
+ $(AR) rcs $@ $(UTILOBJ) $(ADDLIB)
+
+# Nothing from this directory is installed.
+install:
+
+# Remove the objects and both archives.
+clean:
+ rm -f $(NLOBJ) $(UTILOBJ) $(ADDLIB) libnetlink.a libutil.a
+
diff --git a/lib/dnet_ntop.c b/lib/dnet_ntop.c
index e69de29b..9500df86 100644
--- a/lib/dnet_ntop.c
+++ b/lib/dnet_ntop.c
@@ -0,0 +1,98 @@
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include "utils.h"
+
+/*
+ * Reassemble a 16-bit value from the two bytes of ADDR as they sit in
+ * memory: the first byte is the low half, the second byte the high half.
+ */
+static __inline__ u_int16_t dn_ntohs(u_int16_t addr)
+{
+	const u_int8_t *raw = (const u_int8_t *)&addr;
+	u_int16_t low = raw[0];
+	u_int16_t high = raw[1];
+
+	return low | (high << 8);
+}
+
+/*
+ * Emit one decimal digit of *ADDR at *STR.
+ *
+ * The digit is *ADDR / SCALE.  It is written only when it is non-zero,
+ * when an earlier digit has already been written (*STARTED), or when
+ * SCALE is 1, which suppresses leading zeros but still prints a lone
+ * "0".  Writing a digit advances *POS, sets *STARTED and removes the
+ * digit's value from *ADDR.
+ *
+ * Returns 1 without writing anything when *POS has reached LEN,
+ * 0 otherwise.
+ */
+static __inline__ int do_digit(char *str, u_int16_t *addr, u_int16_t scale, size_t *pos, size_t len, int *started)
+{
+	u_int16_t digit = *addr / scale;
+	int wanted;
+
+	if (*pos == len)
+		return 1;
+
+	wanted = digit > 0 || *started || scale == 1;
+	if (!wanted)
+		return 0;
+
+	*str = '0' + digit;
+	*started = 1;
+	*pos += 1;
+	*addr -= digit * scale;
+	return 0;
+}
+
+
+/*
+ * Format a DECnet node address as "area.node" into STR.
+ *
+ * The 16-bit address is split into the area (upper 6 bits, printed
+ * with up to two digits) and the node (lower 10 bits, printed with up
+ * to four digits), both without leading zeros.
+ *
+ * Returns NULL when the address length is not 2, otherwise STR.
+ * NOTE(review): when LEN bytes are used up before the terminating NUL
+ * is stored, STR is returned as is, without a terminator; callers
+ * appear to need a buffer larger than the longest text. Confirm.
+ */
+static const char *dnet_ntop1(const struct dn_naddr *dna, char *str, size_t len)
+{
+	static const u_int16_t area_scales[] = { 10, 1 };
+	static const u_int16_t node_scales[] = { 1000, 100, 10, 1 };
+	u_int16_t raw = dn_ntohs(*(u_int16_t *)dna->a_addr);
+	u_int16_t area = raw >> 10;
+	u_int16_t node = raw & 0x03ff;
+	size_t pos = 0;
+	size_t i;
+	int started = 0;
+
+	if (dna->a_len != 2)
+		return NULL;
+
+	if (len == 0)
+		return str;
+
+	for (i = 0; i < sizeof(area_scales) / sizeof(area_scales[0]); i++) {
+		if (do_digit(str + pos, &area, area_scales[i], &pos, len, &started))
+			return str;
+	}
+
+	if (pos == len)
+		return str;
+	str[pos++] = '.';
+
+	/* Leading-zero suppression starts over for the node part. */
+	started = 0;
+	for (i = 0; i < sizeof(node_scales) / sizeof(node_scales[0]); i++) {
+		if (do_digit(str + pos, &node, node_scales[i], &pos, len, &started))
+			return str;
+	}
+
+	if (pos < len)
+		str[pos] = 0;
+
+	return str;
+}
+
+
+/*
+ * Convert the DECnet address at ADDR to text in STR (LEN bytes).
+ *
+ * Only AF_DECnet is handled: errno is cleared and the result of
+ * dnet_ntop1() is returned.  For any other family errno is set to
+ * EAFNOSUPPORT and NULL is returned.
+ */
+const char *dnet_ntop(int af, const void *addr, char *str, size_t len)
+{
+	if (af != AF_DECnet) {
+		errno = EAFNOSUPPORT;
+		return NULL;
+	}
+
+	errno = 0;
+	return dnet_ntop1((struct dn_naddr *)addr, str, len);
+}
+
+
diff --git a/lib/dnet_pton.c b/lib/dnet_pton.c
index e69de29b..bd7727ae 100644
--- a/lib/dnet_pton.c
+++ b/lib/dnet_pton.c
@@ -0,0 +1,71 @@
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include "utils.h"
+
+/*
+ * Reassemble a 16-bit value from the two bytes of ADDR as they sit in
+ * memory: the first byte is the low half, the second byte the high half.
+ */
+static __inline__ u_int16_t dn_htons(u_int16_t addr)
+{
+	const u_int8_t *raw = (const u_int8_t *)&addr;
+	u_int16_t low = raw[0];
+	u_int16_t high = raw[1];
+
+	return low | (high << 8);
+}
+
+
+/*
+ * Parse the run of decimal digits at the start of SRC into *DST.
+ *
+ * Parsing stops at the first character that is not a digit, or at the
+ * end of the string.  Returns the number of digits consumed (0 if SRC
+ * does not start with a digit, in which case *DST is 0).
+ *
+ * Values that do not fit in 16 bits saturate at 0xffff instead of
+ * wrapping around, so a caller's upper-bound check cannot be passed by
+ * an over-long number such as "65537".
+ */
+static int dnet_num(const char *src, u_int16_t * dst)
+{
+	unsigned int acc = 0;
+	int rv = 0;
+	int tmp;
+
+	*dst = 0;
+
+	while ((tmp = *src++) != 0) {
+		tmp -= '0';
+		if ((tmp < 0) || (tmp > 9))
+			break;
+
+		rv++;
+		/* acc never exceeds 0xffff here, so this cannot overflow. */
+		acc = acc * 10 + tmp;
+		if (acc > 0xffff)
+			acc = 0xffff;
+		*dst = acc;
+	}
+
+	return rv;
+}
+
+/*
+ * Parse a DECnet address written as "area.node" into *DNA.
+ *
+ * The area must be 0..63 and be followed by a dot; the node must be
+ * 0..1023.  On success the address length is set to 2 and the 16-bit
+ * value (area << 10) | node is stored via dn_htons().
+ *
+ * Returns 1 on success, 0 if the text is not acceptable.
+ * NOTE(review): characters after the node digits are not examined, so
+ * text such as "1.2xyz" is accepted. Confirm whether that is intended.
+ */
+static int dnet_pton1(const char *src, struct dn_naddr *dna)
+{
+	u_int16_t area = 0;
+	u_int16_t node = 0;
+	int ndigits;
+
+	ndigits = dnet_num(src, &area);
+	if (ndigits == 0 || area > 63 || src[ndigits] != '.')
+		return 0;
+
+	/* Continue right behind the dot. */
+	src += ndigits + 1;
+
+	ndigits = dnet_num(src, &node);
+	if (ndigits == 0 || node > 1023)
+		return 0;
+
+	dna->a_len = 2;
+	*(u_int16_t *)dna->a_addr = dn_htons((area << 10) | node);
+
+	return 1;
+}
+
+/*
+ * Convert the textual DECnet address SRC into binary form at ADDR.
+ *
+ * Only AF_DECnet is handled: errno is cleared and the result of
+ * dnet_pton1() is returned (1 if parsed, 0 if not).  For any other
+ * family errno is set to EAFNOSUPPORT and -1 is returned.
+ */
+int dnet_pton(int af, const char *src, void *addr)
+{
+	if (af != AF_DECnet) {
+		errno = EAFNOSUPPORT;
+		return -1;
+	}
+
+	errno = 0;
+	return dnet_pton1(src, (struct dn_naddr *)addr);
+}
diff --git a/lib/inet_ntop.c b/lib/inet_ntop.c
index e69de29b..a3722d67 100644
--- a/lib/inet_ntop.c
+++ b/lib/inet_ntop.c
@@ -0,0 +1,199 @@
+/* Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char rcsid[] = "$Id: inet_ntop.c,v 1.4 1996/09/27 03:24:13 drepper Exp $";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <arpa/nameser.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include <linux/in6.h>
+#ifndef IN6ADDRSZ
+#define IN6ADDRSZ sizeof(struct in6_addr)
+#endif
+
+#ifdef SPRINTF_CHAR
+# define SPRINTF(x) strlen(sprintf/**/x)
+#else
+# define SPRINTF(x) ((size_t)sprintf x)
+#endif
+
+/*
+ * WARNING: Don't even consider trying to compile this on a system where
+ * sizeof(int) < 4. sizeof(int) > 4 is fine; all the world's not a VAX.
+ */
+
+static const char *inet_ntop4 __P((const u_char *src, char *dst, size_t size));
+static const char *inet_ntop6 __P((const u_char *src, char *dst, size_t size));
+
+/* const char *
+ * inet_ntop(af, src, dst, size)
+ * convert a network format address to presentation format;
+ * AF_INET and AF_INET6 are handed to inet_ntop4() and inet_ntop6().
+ * return:
+ * pointer to presentation format address (`dst'), or NULL with
+ * errno set; an unsupported `af' yields EAFNOSUPPORT.
+ * author:
+ * Paul Vixie, 1996.
+ */
+const char *
+inet_ntop(int af, const void *src, char *dst, size_t size)
+{
+	if (af == AF_INET)
+		return (inet_ntop4(src, dst, size));
+	if (af == AF_INET6)
+		return (inet_ntop6(src, dst, size));
+
+	errno = (EAFNOSUPPORT);
+	return (NULL);
+}
+
+/* const char *
+ * inet_ntop4(src, dst, size)
+ * format an IPv4 address, more or less like inet_ntoa()
+ * return:
+ * `dst' (as a const), or NULL with errno set to ENOSPC when the
+ * text plus its terminating NUL does not fit in `size' bytes
+ * notes:
+ * (1) uses no statics
+ * (2) takes a u_char* not an in_addr as input
+ * (3) `dst' is left untouched on failure
+ * author:
+ * Paul Vixie, 1996.
+ */
+static const char *
+inet_ntop4(const u_char *src, char *dst, size_t size)
+{
+	static const char fmt[] = "%u.%u.%u.%u";
+	char tmp[sizeof "255.255.255.255"];
+	int n;
+
+	/* tmp holds the longest possible text, so this never truncates. */
+	n = snprintf(tmp, sizeof tmp, fmt, src[0], src[1], src[2], src[3]);
+
+	/*
+	 * n excludes the terminating NUL, so the text needs n + 1 bytes.
+	 * The former "> size" test let strcpy() write one byte past `dst'
+	 * when size was exactly n.
+	 */
+	if (n < 0 || (size_t)n >= size) {
+		errno = (ENOSPC);
+		return (NULL);
+	}
+	strcpy(dst, tmp);
+	return (dst);
+}
+
+/* const char *
+ * inet_ntop6(src, dst, size)
+ * convert IPv6 binary address into presentation (printable) format
+ * return:
+ * `dst', or NULL when the embedded IPv4 part cannot be formatted or
+ * when the text does not fit in `size' bytes (errno ENOSPC)
+ * notes:
+ * the text is built in a local buffer and copied to `dst' only
+ * after the size check has passed
+ * author:
+ * Paul Vixie, 1996.
+ */
+static const char *
+inet_ntop6(src, dst, size)
+ const u_char *src;
+ char *dst;
+ size_t size;
+{
+ /*
+ * Note that int32_t and int16_t need only be "at least" large enough
+ * to contain a value of the specified size. On some systems, like
+ * Crays, there is no such thing as an integer variable with 16 bits.
+ * Keep this in mind if you think this function should have been coded
+ * to use pointer overlays. All the world's not a VAX.
+ */
+ char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
+ struct { int base, len; } best, cur;
+ u_int words[sizeof(struct in6_addr) / INT16SZ];
+ int i;
+
+ /*
+ * Preprocess:
+ * Copy the input (bytewise) array into a wordwise array.
+ * Find the longest run of 0x00's in src[] for :: shorthanding.
+ */
+ memset(words, '\0', sizeof words);
+ /* Even-numbered bytes fill the high 8 bits of a word, odd ones the low 8. */
+ for (i = 0; i < IN6ADDRSZ; i++)
+ words[i / 2] |= (src[i] << ((1 - (i % 2)) << 3));
+ /* base == -1 means "no run"; len is only read once base has been set. */
+ best.base = -1;
+ cur.base = -1;
+ for (i = 0; i < (IN6ADDRSZ / INT16SZ); i++) {
+ if (words[i] == 0) {
+ if (cur.base == -1)
+ cur.base = i, cur.len = 1;
+ else
+ cur.len++;
+ } else {
+ if (cur.base != -1) {
+ if (best.base == -1 || cur.len > best.len)
+ best = cur;
+ cur.base = -1;
+ }
+ }
+ }
+ /* A run that reaches the last word is still open here. */
+ if (cur.base != -1) {
+ if (best.base == -1 || cur.len > best.len)
+ best = cur;
+ }
+ /* A single zero word is not abbreviated. */
+ if (best.base != -1 && best.len < 2)
+ best.base = -1;
+
+ /*
+ * Format the result.
+ */
+ tp = tmp;
+ for (i = 0; i < (IN6ADDRSZ / INT16SZ); i++) {
+ /* Are we inside the best run of 0x00's? */
+ if (best.base != -1 && i >= best.base &&
+ i < (best.base + best.len)) {
+ if (i == best.base)
+ *tp++ = ':';
+ continue;
+ }
+ /* Are we following an initial run of 0x00s or any real hex? */
+ if (i != 0)
+ *tp++ = ':';
+ /* Is this address an encapsulated IPv4? */
+ if (i == 6 && best.base == 0 &&
+ (best.len == 6 || (best.len == 5 && words[5] == 0xffff))) {
+ /* The last four bytes are printed in dotted-quad form. */
+ if (!inet_ntop4(src+12, tp, sizeof tmp - (tp - tmp)))
+ return (NULL);
+ tp += strlen(tp);
+ break;
+ }
+ tp += SPRINTF((tp, "%x", words[i]));
+ }
+ /* Was it a trailing run of 0x00's? */
+ if (best.base != -1 && (best.base + best.len) == (IN6ADDRSZ / INT16SZ))
+ *tp++ = ':';
+ *tp++ = '\0';
+
+ /*
+ * Check for overflow, copy, and we're done.
+ */
+ /* tp - tmp already counts the terminating NUL stored just above. */
+ if ((size_t)(tp - tmp) > size) {
+ errno = (ENOSPC);
+ return (NULL);
+ }
+ strcpy(dst, tmp);
+ return (dst);
+}
diff --git a/lib/inet_proto.c b/lib/inet_proto.c
index e69de29b..a55e0e7b 100644
--- a/lib/inet_proto.c
+++ b/lib/inet_proto.c
@@ -0,0 +1,70 @@
+/*
+ * inet_proto.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <string.h>
+
+#include "utils.h"
+
+/*
+ * inet_proto_n2a - translate an IP protocol number into its name.
+ *
+ * The number is looked up with getprotobynumber(). On success the name is
+ * copied into buf (at most len bytes, always NUL-terminated) and remembered
+ * in a one-entry static cache; an unknown number is formatted into buf as
+ * "ipproto-<n>".
+ *
+ * Returns buf, except on a cache hit, where the static cache buffer is
+ * returned and buf is left untouched. Because of the static cache the
+ * function is not reentrant.
+ */
+char *inet_proto_n2a(int proto, char *buf, int len)
+{
+	static char ncache[16];
+	static int icache = -1;
+	struct protoent *pe;
+
+	if (proto == icache)
+		return ncache;
+
+	pe = getprotobynumber(proto);
+	if (pe) {
+		icache = proto;
+		/* snprintf() always terminates; strncpy() does not when the
+		 * name fills the destination. */
+		snprintf(ncache, sizeof(ncache), "%s", pe->p_name);
+		snprintf(buf, len, "%s", pe->p_name);
+		return buf;
+	}
+	snprintf(buf, len, "ipproto-%d", proto);
+	return buf;
+}
+
+/*
+ * inet_proto_a2n - translate a protocol name or decimal number to its value.
+ *
+ * A string starting with a digit is parsed with get_u8(); anything else is
+ * resolved with getprotobyname(). The last successful name lookup is kept
+ * in a one-entry static cache, so the function is not reentrant.
+ *
+ * Returns the protocol number, or -1 if buf cannot be resolved.
+ */
+int inet_proto_a2n(char *buf)
+{
+	static char ncache[16];
+	static int icache = -1;
+	struct protoent *pe;
+
+	if (icache>=0 && strcmp(ncache, buf) == 0)
+		return icache;
+
+	if (buf[0] >= '0' && buf[0] <= '9') {
+		__u8 ret;
+		if (get_u8(&ret, buf, 10))
+			return -1;
+		return ret;
+	}
+
+	pe = getprotobyname(buf);
+	if (pe) {
+		/*
+		 * Cache only names that fit completely. strncpy() would leave
+		 * a name of 16 or more characters unterminated (the strcmp()
+		 * above would then read past ncache), and a truncated copy
+		 * could match a different, shorter name.
+		 */
+		if (strlen(pe->p_name) < sizeof(ncache)) {
+			strcpy(ncache, pe->p_name);
+			icache = pe->p_proto;
+		} else {
+			icache = -1;
+		}
+		return pe->p_proto;
+	}
+	return -1;
+}
+
+
diff --git a/lib/inet_pton.c b/lib/inet_pton.c
index e69de29b..99508344 100644
--- a/lib/inet_pton.c
+++ b/lib/inet_pton.c
@@ -0,0 +1,217 @@
+/* Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char rcsid[] = "$Id: inet_pton.c,v 1.5 1996/09/27 03:24:16 drepper Exp $";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <arpa/nameser.h>
+#include <string.h>
+#include <errno.h>
+
+#include <linux/in6.h>
+#define IN6ADDRSZ sizeof(struct in6_addr)
+
+/*
+ * WARNING: Don't even consider trying to compile this on a system where
+ * sizeof(int) < 4. sizeof(int) > 4 is fine; all the world's not a VAX.
+ */
+
+static int inet_pton4 __P((const char *src, u_char *dst));
+static int inet_pton6 __P((const char *src, u_char *dst));
+
+/* int
+ * inet_pton(af, src, dst)
+ * convert from presentation format (which usually means ASCII printable)
+ * to network format (which is usually some kind of binary format).
+ * return:
+ * 1 if the address was valid for the specified address family
+ * 0 if the address wasn't valid (`dst' is untouched in this case)
+ * -1 if some other error occurred (`dst' is untouched in this case, too)
+ * author:
+ * Paul Vixie, 1996.
+ */
+int
+inet_pton(af, src, dst)
+	int af;
+	const char *src;
+	void *dst;
+{
+	/* Dispatch on the address family; each parser returns 1 or 0. */
+	if (af == AF_INET)
+		return (inet_pton4(src, dst));
+	if (af == AF_INET6)
+		return (inet_pton6(src, dst));
+
+	/* Any other family is reported through errno. */
+	errno = EAFNOSUPPORT;
+	return (-1);
+}
+
+/* int
+ * inet_pton4(src, dst)
+ * like inet_aton() but without all the hexadecimal and shorthand.
+ * return:
+ * 1 if `src' is a valid dotted quad, else 0.
+ * notice:
+ * does not touch `dst' unless it's returning 1.
+ * author:
+ * Paul Vixie, 1996.
+ */
+static int
+inet_pton4(src, dst)
+	const char *src;
+	u_char *dst;
+{
+	u_char tmp[INADDRSZ];
+	u_char *tp = tmp;
+	int octets = 0;		/* number of octets started so far */
+	int saw_digit = 0;	/* current octet already has a digit */
+	int ch;
+
+	*tp = 0;
+	for (; (ch = *src) != '\0'; src++) {
+		if (ch >= '0' && ch <= '9') {
+			u_int acc = *tp * 10 + (ch - '0');
+
+			if (acc > 255)
+				return (0);
+			*tp = acc;
+			if (saw_digit)
+				continue;
+			/* first digit of a new octet */
+			if (++octets > 4)
+				return (0);
+			saw_digit = 1;
+			continue;
+		}
+
+		/* Only a '.' that follows at least one digit is allowed here. */
+		if (ch != '.' || !saw_digit)
+			return (0);
+		if (octets == 4)
+			return (0);
+		*++tp = 0;
+		saw_digit = 0;
+	}
+
+	if (octets < 4)
+		return (0);
+
+	/* The result is only copied out once the whole string is valid. */
+	memcpy(dst, tmp, INADDRSZ);
+	return (1);
+}
+
+/* int
+ * inet_pton6(src, dst)
+ * convert presentation level address to network order binary form.
+ * return:
+ * 1 if `src' is a valid [RFC1884 2.2] address, else 0.
+ * notice:
+ * (1) does not touch `dst' unless it's returning 1.
+ * (2) :: in a full address is silently ignored.
+ * credit:
+ * inspired by Mark Andrews.
+ * author:
+ * Paul Vixie, 1996.
+ */
+/*
+ * Parse the textual IPv6 address `src' into the 16-byte network-order
+ * buffer `dst'. Returns 1 on success and 0 if `src' is not a valid
+ * address; `dst' is written only on success. A trailing dotted quad is
+ * handed to inet_pton4().
+ */
+static int
+inet_pton6(src, dst)
+	const char *src;
+	u_char *dst;
+{
+	static const char xdigits_l[] = "0123456789abcdef",
+			  xdigits_u[] = "0123456789ABCDEF";
+	u_char tmp[IN6ADDRSZ], *tp, *endp, *colonp;
+	const char *xdigits, *curtok;
+	int ch, saw_xdigit;
+	u_int val;
+
+	memset((tp = tmp), '\0', IN6ADDRSZ);
+	endp = tp + IN6ADDRSZ;
+	colonp = NULL;		/* position of "::" in tmp, if seen */
+	/* Leading :: requires some special handling. */
+	if (*src == ':')
+		if (*++src != ':')
+			return (0);
+	curtok = src;
+	saw_xdigit = 0;
+	val = 0;
+	while ((ch = *src++) != '\0') {
+		const char *pch;
+
+		if ((pch = strchr((xdigits = xdigits_l), ch)) == NULL)
+			pch = strchr((xdigits = xdigits_u), ch);
+		if (pch != NULL) {
+			val <<= 4;
+			val |= (pch - xdigits);
+			if (val > 0xffff)
+				return (0);
+			saw_xdigit = 1;
+			continue;
+		}
+		if (ch == ':') {
+			curtok = src;
+			if (!saw_xdigit) {
+				if (colonp)
+					return (0);
+				colonp = tp;
+				continue;
+			}
+			/* A single ':' may not end the address ("1::2:"). */
+			if (*src == '\0')
+				return (0);
+			if (tp + INT16SZ > endp)
+				return (0);
+			*tp++ = (u_char) (val >> 8) & 0xff;
+			*tp++ = (u_char) val & 0xff;
+			saw_xdigit = 0;
+			val = 0;
+			continue;
+		}
+		if (ch == '.' && ((tp + INADDRSZ) <= endp) &&
+		    inet_pton4(curtok, tp) > 0) {
+			tp += INADDRSZ;
+			saw_xdigit = 0;
+			break;	/* '\0' was seen by inet_pton4(). */
+		}
+		return (0);
+	}
+	if (saw_xdigit) {
+		if (tp + INT16SZ > endp)
+			return (0);
+		*tp++ = (u_char) (val >> 8) & 0xff;
+		*tp++ = (u_char) val & 0xff;
+	}
+	if (colonp != NULL) {
+		/*
+		 * Since some memmove()'s erroneously fail to handle
+		 * overlapping regions, we'll do the shift by hand.
+		 */
+		const int n = tp - colonp;
+		int i;
+
+		/*
+		 * "::" must stand for at least one zero group. With all
+		 * 16 bytes already filled, source and destination of the
+		 * shift below coincide and the loop would zero the bytes
+		 * it has just moved.
+		 */
+		if (tp == endp)
+			return (0);
+		for (i = 1; i <= n; i++) {
+			endp[- i] = colonp[n - i];
+			colonp[n - i] = 0;
+		}
+		tp = endp;
+	}
+	if (tp != endp)
+		return (0);
+	memcpy(dst, tmp, IN6ADDRSZ);
+	return (1);
+}
diff --git a/lib/ipx_ntop.c b/lib/ipx_ntop.c
index e69de29b..b2d67902 100644
--- a/lib/ipx_ntop.c
+++ b/lib/ipx_ntop.c
@@ -0,0 +1,71 @@
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include "utils.h"
+
+/*
+ * Store one upper-case hex digit at *str: the nibble of addr selected by
+ * scale (0 = lowest four bits). Returns 1 without writing when *pos has
+ * already reached len, otherwise advances *pos and returns 0.
+ */
+static __inline__ int do_digit(char *str, u_int32_t addr, u_int32_t scale, size_t *pos, size_t len)
+{
+	static const char hex[] = "0123456789ABCDEF";
+	u_int32_t nibble;
+
+	if (*pos == len)
+		return 1;
+
+	nibble = (addr >> (scale * 4)) & 0x0f;
+	*str = hex[nibble];
+	(*pos)++;
+
+	return 0;
+}
+
+/*
+ * Format addr as "NNNNNNNN.HHHHHHHHHHHH" (8 hex digits of network, a dot,
+ * 12 hex digits of node) into str. Output stops silently as soon as len
+ * characters have been written; in that case no terminating NUL is stored.
+ * Always returns str.
+ */
+static const char *ipx_ntop1(const struct ipx_addr *addr, char *str, size_t len)
+{
+	const u_int32_t net = ntohl(addr->ipx_net);
+	size_t pos = 0;
+	int nib;
+	int i;
+
+	if (len == 0)
+		return str;
+
+	/* network number, most significant nibble first */
+	for (nib = 7; nib >= 0; nib--) {
+		if (do_digit(&str[pos], net, nib, &pos, len))
+			return str;
+	}
+
+	if (pos == len)
+		return str;
+	str[pos++] = '.';
+
+	/* node address: two nibbles per byte, high nibble first */
+	for (i = 0; i < 6; i++) {
+		for (nib = 1; nib >= 0; nib--) {
+			if (do_digit(&str[pos], addr->ipx_node[i], nib, &pos, len))
+				return str;
+		}
+	}
+
+	if (pos != len)
+		str[pos] = 0;
+
+	return str;
+}
+
+
+/*
+ * Convert an IPX address to text. Only AF_IPX is handled: errno is cleared
+ * and the result of ipx_ntop1() is returned. For any other family errno is
+ * set to EAFNOSUPPORT and NULL is returned.
+ */
+const char *ipx_ntop(int af, const void *addr, char *str, size_t len)
+{
+	if (af != AF_IPX) {
+		errno = EAFNOSUPPORT;
+		return NULL;
+	}
+
+	errno = 0;
+	return ipx_ntop1((struct ipx_addr *)addr, str, len);
+}
+
+
diff --git a/lib/ipx_pton.c b/lib/ipx_pton.c
index e69de29b..1a52b7f1 100644
--- a/lib/ipx_pton.c
+++ b/lib/ipx_pton.c
@@ -0,0 +1,107 @@
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+
+#include "utils.h"
+
+/*
+ * Value (0..15) of the hex digit c, or 0xf0 when c is not a hex digit.
+ * Callers detect the failure by testing the result against 0xf0.
+ */
+static u_int32_t hexget(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return 10 + (c - 'a');
+	if (c >= 'A' && c <= 'F')
+		return 10 + (c - 'A');
+
+	return 0xf0;
+}
+
+/*
+ * Parse up to 8 hex digits from str and shift them into *net (which the
+ * caller is expected to have zeroed). Parsing stops at the end of the
+ * string or at the '.' that separates network and node.
+ *
+ * Returns 0 on success, -1 on a non-hex character or on more than
+ * 8 digits.
+ */
+static int ipx_getnet(u_int32_t *net, const char *str)
+{
+	int i;
+	u_int32_t tmp;
+
+	for(i = 0; *str && (i < 8); i++) {
+
+		if ((tmp = hexget(*str)) & 0xf0) {
+			if (*str == '.')
+				return 0;
+			else
+				return -1;
+		}
+
+		str++;
+		(*net) <<= 4;
+		(*net) |= tmp;
+	}
+
+	/*
+	 * After a full 8-digit network the loop ends without having looked
+	 * at the next character, so the '.' separator must be accepted here
+	 * too; otherwise "12345678.<node>" is rejected.
+	 */
+	if (*str == 0 || *str == '.')
+		return 0;
+
+	return -1;
+}
+
+/*
+ * Parse six bytes, two hex digits each, from str into node[0..5]. A ':'
+ * after a byte is skipped. Returns 0 on success, -1 as soon as a non-hex
+ * character (including the end of the string) is met.
+ */
+static int ipx_getnode(u_int8_t *node, const char *str)
+{
+	int i;
+
+	for (i = 0; i < 6; i++) {
+		u_int32_t hi, lo;
+
+		hi = hexget(str[0]);
+		if (hi & 0xf0)
+			return -1;
+		node[i] = (u_int8_t)(hi << 4);
+
+		lo = hexget(str[1]);
+		if (lo & 0xf0)
+			return -1;
+		node[i] |= (u_int8_t)lo;
+
+		str += 2;
+		if (*str == ':')
+			str++;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse "net" or "net.node" from src into *addr, which is zeroed first.
+ * The network is stored in network byte order. Returns 1 on success and
+ * 0 if either part fails to parse.
+ */
+static int ipx_pton1(const char *src, struct ipx_addr *addr)
+{
+	/* first '.', if any, separates network and node */
+	const char *dot = strchr(src, '.');
+
+	memset(addr, 0, sizeof(*addr));
+
+	if (ipx_getnet(&addr->ipx_net, src) != 0)
+		return 0;
+	addr->ipx_net = htonl(addr->ipx_net);
+
+	/* no separator: network only, node stays all zero */
+	if (dot == NULL)
+		return 1;
+
+	return ipx_getnode(addr->ipx_node, dot + 1) ? 0 : 1;
+}
+
+/*
+ * Convert a textual IPX address to binary. Only AF_IPX is handled: errno
+ * is cleared and the result of ipx_pton1() (1 or 0) is returned. For any
+ * other family errno is set to EAFNOSUPPORT and -1 is returned.
+ */
+int ipx_pton(int af, const char *src, void *addr)
+{
+	if (af != AF_IPX) {
+		errno = EAFNOSUPPORT;
+		return -1;
+	}
+
+	errno = 0;
+	return ipx_pton1(src, (struct ipx_addr *)addr);
+}
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index e69de29b..a1f39d40 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -0,0 +1,521 @@
+/*
+ * libnetlink.c RTnetlink service routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <net/if_arp.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/uio.h>
+
+#include "libnetlink.h"
+
+void rtnl_close(struct rtnl_handle *rth) /* Close the netlink socket stored in rth->fd. */
+{
+ close(rth->fd); /* the return value of close() is not checked */
+}
+
+/*
+ * rtnl_open - open and bind a NETLINK_ROUTE socket.
+ *
+ * Clears *rth, creates the socket, binds it with `subscriptions' as the
+ * multicast group mask, reads the bound address back into rth->local and
+ * seeds rth->seq from the current time.
+ *
+ * Returns 0 on success, -1 on failure (after printing a diagnostic).
+ */
+int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions)
+{
+	socklen_t addr_len;
+
+	/* sizeof(*rth), not sizeof(rth): clear the whole handle rather
+	 * than just a pointer's worth of it. */
+	memset(rth, 0, sizeof(*rth));
+
+	rth->fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (rth->fd < 0) {
+		perror("Cannot open netlink socket");
+		return -1;
+	}
+
+	memset(&rth->local, 0, sizeof(rth->local));
+	rth->local.nl_family = AF_NETLINK;
+	rth->local.nl_groups = subscriptions;
+
+	if (bind(rth->fd, (struct sockaddr*)&rth->local, sizeof(rth->local)) < 0) {
+		perror("Cannot bind netlink socket");
+		return -1;
+	}
+	/* Read the address back into rth->local; its nl_pid is later
+	 * compared with the nlmsg_pid of replies. */
+	addr_len = sizeof(rth->local);
+	if (getsockname(rth->fd, (struct sockaddr*)&rth->local, &addr_len) < 0) {
+		perror("Cannot getsockname");
+		return -1;
+	}
+	if (addr_len != sizeof(rth->local)) {
+		fprintf(stderr, "Wrong address length %d\n", (int)addr_len);
+		return -1;
+	}
+	if (rth->local.nl_family != AF_NETLINK) {
+		fprintf(stderr, "Wrong address family %d\n", rth->local.nl_family);
+		return -1;
+	}
+	rth->seq = time(NULL);
+	return 0;
+}
+
+/*
+ * rtnl_wilddump_request - ask the kernel to dump all objects of a kind.
+ *
+ * Sends a request of message type `type' for address family `family' with
+ * NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST set. The sequence number used is
+ * stored in rth->dump so that the replies can be matched later.
+ *
+ * Returns the result of sendto().
+ */
+int rtnl_wilddump_request(struct rtnl_handle *rth, int family, int type)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct rtgenmsg g;
+	} req;
+	struct sockaddr_nl nladdr;
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+
+	/* Zero the request first: sizeof(req) bytes are sent, including any
+	 * padding after rtgenmsg, which must not carry stack garbage. */
+	memset(&req, 0, sizeof(req));
+	req.nlh.nlmsg_len = sizeof(req);
+	req.nlh.nlmsg_type = type;
+	req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
+	req.nlh.nlmsg_pid = 0;
+	req.nlh.nlmsg_seq = rth->dump = ++rth->seq;
+	req.g.rtgen_family = family;
+
+	return sendto(rth->fd, (void*)&req, sizeof(req), 0, (struct sockaddr*)&nladdr, sizeof(nladdr));
+}
+
+/*
+ * Send len bytes from buf on the netlink socket, addressed to an
+ * all-zero AF_NETLINK address (pid 0, no groups). Returns the result
+ * of sendto().
+ */
+int rtnl_send(struct rtnl_handle *rth, char *buf, int len)
+{
+	struct sockaddr_nl dest;
+
+	memset(&dest, 0, sizeof(dest));
+	dest.nl_family = AF_NETLINK;
+
+	return sendto(rth->fd, buf, len, 0,
+		      (struct sockaddr*)&dest, sizeof(dest));
+}
+
+/*
+ * Send a dump request of message type `type' whose payload is the `len'
+ * bytes at `req'. The netlink header is built here and sent together with
+ * the payload as a two-element iovec. The sequence number used is stored
+ * in rth->dump. Returns the result of sendmsg().
+ */
+int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len)
+{
+	struct nlmsghdr nlh;
+	struct sockaddr_nl nladdr;
+	struct iovec iov[2];
+	struct msghdr msg;
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+
+	nlh.nlmsg_len = NLMSG_LENGTH(len);
+	nlh.nlmsg_type = type;
+	nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
+	nlh.nlmsg_pid = 0;
+	nlh.nlmsg_seq = rth->dump = ++rth->seq;
+
+	/* header first, caller's payload second */
+	iov[0].iov_base = &nlh;
+	iov[0].iov_len = sizeof(nlh);
+	iov[1].iov_base = req;
+	iov[1].iov_len = len;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_name = (void*)&nladdr;
+	msg.msg_namelen = sizeof(nladdr);
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 2;
+
+	return sendmsg(rth->fd, &msg, 0);
+}
+
+int rtnl_dump_filter(struct rtnl_handle *rth,
+ int (*filter)(struct sockaddr_nl *, struct nlmsghdr *n, void *),
+ void *arg1,
+ int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+ void *arg2)
+{ /*
+ * Receive the replies to a dump request on rth->fd and feed them to
+ * callbacks until NLMSG_DONE arrives.
+ *
+ * filter(.., arg1) is called for every message whose nlmsg_pid equals
+ * rth->local.nl_pid and whose nlmsg_seq equals rth->dump. Every other
+ * message goes to junk(.., arg2) when junk is not NULL and is skipped
+ * otherwise.
+ *
+ * Returns 0 on NLMSG_DONE, -1 on NLMSG_ERROR or end of file, or the
+ * first negative value returned by a callback. A sender address of
+ * unexpected length or leftover bytes in a datagram terminate the
+ * process with exit(1).
+ */
+ char buf[8192];
+ struct sockaddr_nl nladdr;
+ struct iovec iov = { buf, sizeof(buf) };
+
+ while (1) {
+ int status;
+ struct nlmsghdr *h;
+
+ struct msghdr msg = {
+ (void*)&nladdr, sizeof(nladdr),
+ &iov, 1,
+ NULL, 0,
+ 0
+ };
+
+ status = recvmsg(rth->fd, &msg, 0);
+
+ if (status < 0) {
+ if (errno == EINTR)
+ continue;
+ perror("OVERRUN"); /* any other receive error is reported and the read is retried */
+ continue;
+ }
+ if (status == 0) {
+ fprintf(stderr, "EOF on netlink\n");
+ return -1;
+ }
+ if (msg.msg_namelen != sizeof(nladdr)) {
+ fprintf(stderr, "sender address length == %d\n", msg.msg_namelen);
+ exit(1);
+ }
+
+ h = (struct nlmsghdr*)buf;
+ while (NLMSG_OK(h, status)) { /* walk every message in this datagram */
+ int err;
+
+ if (h->nlmsg_pid != rth->local.nl_pid ||
+ h->nlmsg_seq != rth->dump) { /* not a reply to our dump request */
+ if (junk) {
+ err = junk(&nladdr, h, arg2);
+ if (err < 0)
+ return err;
+ }
+ goto skip_it;
+ }
+
+ if (h->nlmsg_type == NLMSG_DONE)
+ return 0;
+ if (h->nlmsg_type == NLMSG_ERROR) {
+ struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h);
+ if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
+ fprintf(stderr, "ERROR truncated\n");
+ } else {
+ errno = -err->error; /* the message carries a negated errno value */
+ perror("RTNETLINK answers");
+ }
+ return -1;
+ }
+ err = filter(&nladdr, h, arg1);
+ if (err < 0)
+ return err;
+
+skip_it:
+ h = NLMSG_NEXT(h, status); /* also reduces status by the consumed length */
+ }
+ if (msg.msg_flags & MSG_TRUNC) {
+ fprintf(stderr, "Message truncated\n");
+ continue;
+ }
+ if (status) { /* bytes left over that do not form a complete message */
+ fprintf(stderr, "!!!Remnant of size %d\n", status);
+ exit(1);
+ }
+ }
+}
+
+/*
+ * rtnl_talk - send one request and wait for the matching reply.
+ *
+ * The request `n' is sent to `peer'/`groups' with a fresh sequence number.
+ * Replies are then read until one carries that sequence number and our
+ * own nl_pid; every other message is handed to junk(.., jarg) when junk
+ * is not NULL and skipped otherwise.
+ *
+ * If `answer' is NULL, NLM_F_ACK is added to the request and the function
+ * returns 0 once an NLMSG_ERROR reply with error 0 arrives. If `answer'
+ * is not NULL, the first matching reply is copied into it (nlmsg_len
+ * bytes; the size of the destination is not checked here) and 0 is
+ * returned.
+ *
+ * Returns -1 on a send failure, on end of file, on a truncated message
+ * or on an NLMSG_ERROR with a non-zero code, or the first negative value
+ * returned by junk(). Malformed input terminates the process with exit(1).
+ */
+int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer,
+	      unsigned groups, struct nlmsghdr *answer,
+	      int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+	      void *jarg)
+{
+	int status;
+	unsigned seq;
+	struct nlmsghdr *h;
+	struct sockaddr_nl nladdr;
+	struct iovec iov = { (void*)n, n->nlmsg_len };
+	char buf[8192];
+	struct msghdr msg = {
+		(void*)&nladdr, sizeof(nladdr),
+		&iov, 1,
+		NULL, 0,
+		0
+	};
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	nladdr.nl_pid = peer;
+	nladdr.nl_groups = groups;
+
+	n->nlmsg_seq = seq = ++rtnl->seq;
+	if (answer == NULL)
+		n->nlmsg_flags |= NLM_F_ACK;
+
+	status = sendmsg(rtnl->fd, &msg, 0);
+
+	if (status < 0) {
+		perror("Cannot talk to rtnetlink");
+		return -1;
+	}
+
+	/* From here on the same msghdr/iovec pair is used for receiving. */
+	iov.iov_base = buf;
+
+	while (1) {
+		iov.iov_len = sizeof(buf);
+		status = recvmsg(rtnl->fd, &msg, 0);
+
+		if (status < 0) {
+			if (errno == EINTR)
+				continue;
+			perror("OVERRUN");
+			continue;
+		}
+		if (status == 0) {
+			fprintf(stderr, "EOF on netlink\n");
+			return -1;
+		}
+		if (msg.msg_namelen != sizeof(nladdr)) {
+			fprintf(stderr, "sender address length == %d\n", msg.msg_namelen);
+			exit(1);
+		}
+		for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) {
+			int err;
+			int len = h->nlmsg_len;
+			int l = len - sizeof(*h);
+
+			if (l<0 || len>status) {
+				if (msg.msg_flags & MSG_TRUNC) {
+					fprintf(stderr, "Truncated message\n");
+					return -1;
+				}
+				fprintf(stderr, "!!!malformed message: len=%d\n", len);
+				exit(1);
+			}
+
+			if (h->nlmsg_pid != rtnl->local.nl_pid ||
+			    h->nlmsg_seq != seq) {
+				if (junk) {
+					err = junk(&nladdr, h, jarg);
+					if (err < 0)
+						return err;
+				}
+				/*
+				 * Step over the foreign message before
+				 * continuing; without this the loop would
+				 * examine the same message forever.
+				 */
+				status -= NLMSG_ALIGN(len);
+				h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+				continue;
+			}
+
+			if (h->nlmsg_type == NLMSG_ERROR) {
+				struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h);
+				if (l < sizeof(struct nlmsgerr)) {
+					fprintf(stderr, "ERROR truncated\n");
+				} else {
+					errno = -err->error;
+					if (errno == 0) {
+						/* error code 0 is the ACK */
+						if (answer)
+							memcpy(answer, h, h->nlmsg_len);
+						return 0;
+					}
+					perror("RTNETLINK answers");
+				}
+				return -1;
+			}
+			if (answer) {
+				memcpy(answer, h, h->nlmsg_len);
+				return 0;
+			}
+
+			fprintf(stderr, "Unexpected reply!!!\n");
+
+			status -= NLMSG_ALIGN(len);
+			h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+		}
+		if (msg.msg_flags & MSG_TRUNC) {
+			fprintf(stderr, "Message truncated\n");
+			continue;
+		}
+		if (status) {
+			fprintf(stderr, "!!!Remnant of size %d\n", status);
+			exit(1);
+		}
+	}
+}
+
+int rtnl_listen(struct rtnl_handle *rtnl,
+ int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+ void *jarg)
+{ /*
+ * Receive netlink messages on rtnl->fd in an endless loop and pass each
+ * one to handler(.., jarg).
+ *
+ * Returns -1 on end of file or on a truncated message, or the first
+ * negative value returned by handler. A sender address of unexpected
+ * length, a malformed message or leftover bytes in a datagram terminate
+ * the process with exit(1).
+ */
+ int status;
+ struct nlmsghdr *h;
+ struct sockaddr_nl nladdr;
+ struct iovec iov;
+ char buf[8192];
+ struct msghdr msg = {
+ (void*)&nladdr, sizeof(nladdr),
+ &iov, 1,
+ NULL, 0,
+ 0
+ };
+
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+ nladdr.nl_pid = 0;
+ nladdr.nl_groups = 0;
+
+
+ iov.iov_base = buf;
+
+ while (1) {
+ iov.iov_len = sizeof(buf); /* reset the length before every receive */
+ status = recvmsg(rtnl->fd, &msg, 0);
+
+ if (status < 0) {
+ if (errno == EINTR)
+ continue;
+ perror("OVERRUN"); /* any other receive error is reported and the read is retried */
+ continue;
+ }
+ if (status == 0) {
+ fprintf(stderr, "EOF on netlink\n");
+ return -1;
+ }
+ if (msg.msg_namelen != sizeof(nladdr)) {
+ fprintf(stderr, "Sender address length == %d\n", msg.msg_namelen);
+ exit(1);
+ }
+ for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) { /* walk every message in this datagram */
+ int err;
+ int len = h->nlmsg_len;
+ int l = len - sizeof(*h); /* payload length; negative if nlmsg_len is shorter than a header */
+
+ if (l<0 || len>status) {
+ if (msg.msg_flags & MSG_TRUNC) {
+ fprintf(stderr, "Truncated message\n");
+ return -1;
+ }
+ fprintf(stderr, "!!!malformed message: len=%d\n", len);
+ exit(1);
+ }
+
+ err = handler(&nladdr, h, jarg);
+ if (err < 0)
+ return err;
+
+ status -= NLMSG_ALIGN(len); /* advance to the next message */
+ h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+ }
+ if (msg.msg_flags & MSG_TRUNC) {
+ fprintf(stderr, "Message truncated\n");
+ continue;
+ }
+ if (status) { /* bytes left over that are too few for a header */
+ fprintf(stderr, "!!!Remnant of size %d\n", status);
+ exit(1);
+ }
+ }
+}
+
+/*
+ * rtnl_from_file - replay netlink messages stored in a file.
+ *
+ * Reads one message at a time from `rtnl' (header first, then the aligned
+ * payload) and passes it to handler(.., jarg) together with an all-zero
+ * AF_NETLINK sender address.
+ *
+ * Returns 0 at a clean end of file, -1 on a read error or on a malformed
+ * or truncated message, or the first negative value returned by handler.
+ */
+int rtnl_from_file(FILE *rtnl,
+		   int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
+		   void *jarg)
+{
+	size_t nread;
+	struct sockaddr_nl nladdr;
+	char buf[8192];
+	struct nlmsghdr *h = (void*)buf;
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	nladdr.nl_pid = 0;
+	nladdr.nl_groups = 0;
+
+	while (1) {
+		int err, len;
+		int l;
+
+		/*
+		 * fread() returns a size_t item count and never a negative
+		 * value; a failure shows up as a short count plus ferror().
+		 */
+		nread = fread(buf, 1, sizeof(*h), rtnl);
+		if (nread != sizeof(*h)) {
+			if (ferror(rtnl)) {
+				perror("rtnl_from_file: fread");
+				return -1;
+			}
+			if (nread == 0)
+				return 0;	/* clean end of file */
+			/* partial header: do not interpret it */
+			fprintf(stderr, "rtnl-from_file: truncated message\n");
+			return -1;
+		}
+
+		len = h->nlmsg_len;
+		l = len - (int)sizeof(*h);
+
+		if (l<0 || len>(int)sizeof(buf)) {
+			fprintf(stderr, "!!!malformed message: len=%d @%lu\n",
+				len, (unsigned long)ftell(rtnl));
+			return -1;
+		}
+
+		nread = fread(NLMSG_DATA(h), 1, NLMSG_ALIGN(l), rtnl);
+		if (ferror(rtnl)) {
+			perror("rtnl_from_file: fread");
+			return -1;
+		}
+		/* the alignment padding of the last message may be absent */
+		if (nread < (size_t)l) {
+			fprintf(stderr, "rtnl-from_file: truncated message\n");
+			return -1;
+		}
+
+		err = handler(&nladdr, h, jarg);
+		if (err < 0)
+			return err;
+	}
+}
+
+/*
+ * Append a 32-bit attribute of the given type at the aligned end of
+ * message n and grow n->nlmsg_len accordingly. Returns -1 without
+ * changing the message if it would then exceed maxlen bytes, else 0.
+ */
+int addattr32(struct nlmsghdr *n, int maxlen, int type, __u32 data)
+{
+	const int attr_len = RTA_LENGTH(4);
+	struct rtattr *attr;
+
+	if (NLMSG_ALIGN(n->nlmsg_len) + attr_len > maxlen)
+		return -1;
+
+	attr = (struct rtattr*)((char*)n + NLMSG_ALIGN(n->nlmsg_len));
+	attr->rta_len = attr_len;
+	attr->rta_type = type;
+	memcpy(RTA_DATA(attr), &data, 4);
+
+	n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + attr_len;
+	return 0;
+}
+
+/*
+ * Append an attribute of the given type carrying the alen bytes at data
+ * to the aligned end of message n and grow n->nlmsg_len accordingly.
+ * Returns -1 without changing the message if it would then exceed
+ * maxlen bytes, else 0.
+ */
+int addattr_l(struct nlmsghdr *n, int maxlen, int type, void *data, int alen)
+{
+	const int attr_len = RTA_LENGTH(alen);
+	struct rtattr *attr;
+
+	if (NLMSG_ALIGN(n->nlmsg_len) + attr_len > maxlen)
+		return -1;
+
+	attr = (struct rtattr*)((char*)n + NLMSG_ALIGN(n->nlmsg_len));
+	attr->rta_len = attr_len;
+	attr->rta_type = type;
+	memcpy(RTA_DATA(attr), data, alen);
+
+	n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + attr_len;
+	return 0;
+}
+
+/*
+ * Append a nested 32-bit attribute of the given type at the aligned end
+ * of attribute rta and grow rta->rta_len accordingly. Returns -1 without
+ * changing rta if it would then exceed maxlen bytes, else 0.
+ */
+int rta_addattr32(struct rtattr *rta, int maxlen, int type, __u32 data)
+{
+	int len = RTA_LENGTH(4);
+	struct rtattr *subrta;
+
+	if (RTA_ALIGN(rta->rta_len) + len > maxlen)
+		return -1;
+	subrta = (struct rtattr*)(((char*)rta) + RTA_ALIGN(rta->rta_len));
+	subrta->rta_type = type;
+	subrta->rta_len = len;
+	memcpy(RTA_DATA(subrta), &data, 4);
+	/* rta_len is an attribute length: use RTA_ALIGN here as well, to
+	 * match the bounds check and the placement of subrta above. */
+	rta->rta_len = RTA_ALIGN(rta->rta_len) + len;
+	return 0;
+}
+
+/*
+ * Append a nested attribute of the given type carrying the alen bytes at
+ * data to the aligned end of attribute rta and grow rta->rta_len
+ * accordingly. Returns -1 without changing rta if it would then exceed
+ * maxlen bytes, else 0.
+ */
+int rta_addattr_l(struct rtattr *rta, int maxlen, int type, void *data, int alen)
+{
+	struct rtattr *subrta;
+	int len = RTA_LENGTH(alen);
+
+	if (RTA_ALIGN(rta->rta_len) + len > maxlen)
+		return -1;
+	subrta = (struct rtattr*)(((char*)rta) + RTA_ALIGN(rta->rta_len));
+	subrta->rta_type = type;
+	subrta->rta_len = len;
+	memcpy(RTA_DATA(subrta), data, alen);
+	/* rta_len is an attribute length: use RTA_ALIGN here as well, to
+	 * match the bounds check and the placement of subrta above. */
+	rta->rta_len = RTA_ALIGN(rta->rta_len) + len;
+	return 0;
+}
+
+
+/*
+ * Walk the attribute list starting at rta (len bytes) and store a pointer
+ * to each attribute in tb[], indexed by attribute type. Types above max
+ * are ignored. tb[] is not cleared here. Leftover bytes are reported on
+ * stderr. Always returns 0.
+ */
+int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len)
+{
+	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta,len)) {
+		if (rta->rta_type > max)
+			continue;
+		tb[rta->rta_type] = rta;
+	}
+
+	if (len != 0)
+		fprintf(stderr, "!!!Deficit %d, rta_len=%d\n", len, rta->rta_len);
+
+	return 0;
+}
diff --git a/lib/ll_addr.c b/lib/ll_addr.c
index e69de29b..082cb3c4 100644
--- a/lib/ll_addr.c
+++ b/lib/ll_addr.c
@@ -0,0 +1,91 @@
+/*
+ * ll_addr.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+
+
+/*
+ * ll_addr_n2a - format a link-layer address as text.
+ *
+ * A 4-byte address of an ARPHRD_TUNNEL, ARPHRD_SIT or ARPHRD_IPGRE device
+ * is printed as a dotted quad through inet_ntop(). Everything else is
+ * printed as colon-separated two-digit hex bytes.
+ *
+ * At most blen bytes of buf are written. When the buffer is too small the
+ * output is cut short but remains NUL-terminated.
+ *
+ * Returns buf (or the result of inet_ntop() for the dotted-quad case).
+ */
+const char *ll_addr_n2a(unsigned char *addr, int alen, int type, char *buf, int blen)
+{
+	int i;
+	int l;
+
+	if (alen == 4 &&
+	    (type == ARPHRD_TUNNEL || type == ARPHRD_SIT || type == ARPHRD_IPGRE)) {
+		return inet_ntop(AF_INET, addr, buf, blen);
+	}
+
+	if (blen <= 0)
+		return buf;
+	buf[0] = '\0';		/* defined result for alen == 0 */
+
+	l = 0;
+	for (i=0; i<alen; i++) {
+		int n;
+
+		if (i==0)
+			n = snprintf(buf+l, blen, "%02x", addr[i]);
+		else
+			n = snprintf(buf+l, blen, ":%02x", addr[i]);
+		/*
+		 * Stop once the output no longer fits. Subtracting past
+		 * zero would hand snprintf() a negative size, which it
+		 * takes as a huge unsigned one and then writes past buf.
+		 */
+		if (n < 0 || n >= blen)
+			break;
+		blen -= n;
+		l += n;
+	}
+	return buf;
+}
+
+/*
+ * ll_addr_a2n - parse a textual link-layer address.
+ *
+ * A string containing '.' is parsed as an IPv4 address with get_addr_1()
+ * and yields 4 bytes. Anything else is parsed as ':'-separated hex bytes.
+ * Note that arg is modified while parsing: each ':' is overwritten with
+ * a NUL.
+ *
+ * Returns the number of bytes stored in lladdr, or -1 if the text is
+ * invalid or needs more than len bytes.
+ */
+int ll_addr_a2n(unsigned char *lladdr, int len, char *arg)
+{
+	if (strchr(arg, '.')) {
+		inet_prefix pfx;
+		if (get_addr_1(&pfx, arg, AF_INET)) {
+			fprintf(stderr, "\"%s\" is invalid lladdr.\n", arg);
+			return -1;
+		}
+		if (len < 4)
+			return -1;
+		memcpy(lladdr, pfx.data, 4);
+		return 4;
+	} else {
+		int i;
+
+		for (i=0; i<len; i++) {
+			unsigned int temp;	/* %x stores an unsigned int */
+			char *cp = strchr(arg, ':');
+			if (cp) {
+				*cp = 0;
+				cp++;
+			}
+			if (sscanf(arg, "%x", &temp) != 1) {
+				fprintf(stderr, "\"%s\" is invalid lladdr.\n", arg);
+				return -1;
+			}
+			if (temp > 255) {
+				fprintf(stderr, "\"%s\" is invalid lladdr.\n", arg);
+				return -1;
+			}
+			lladdr[i] = temp;
+			if (!cp)
+				return i+1;	/* last component consumed */
+			arg = cp;
+		}
+		/*
+		 * The text still has components left but lladdr is full.
+		 * Reporting len+1 stored bytes here would make callers read
+		 * one byte past their buffer.
+		 */
+		fprintf(stderr, "\"%s\" is invalid lladdr.\n", arg);
+		return -1;
+	}
+}
diff --git a/lib/ll_map.c b/lib/ll_map.c
index e69de29b..e5a95e6a 100644
--- a/lib/ll_map.c
+++ b/lib/ll_map.c
@@ -0,0 +1,169 @@
+/*
+ * ll_map.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <string.h>
+
+#include "libnetlink.h"
+#include "ll_map.h"
+
+struct idxmap /* one remembered interface; entries are filled in by ll_remember_index() */
+{
+ struct idxmap * next; /* next entry in the same hash bucket */
+ int index; /* interface index (ifi_index) */
+ int type; /* ifi_type of the interface */
+ int alen; /* payload length of the IFLA_ADDRESS attribute, 0 if absent */
+ unsigned flags; /* ifi_flags of the interface */
+ unsigned char addr[8]; /* leading bytes of the link-layer address */
+ char name[16]; /* interface name taken from IFLA_IFNAME */
+};
+
+static struct idxmap *idxmap[16]; /* hash table; the bucket is chosen by (index & 0xF) */
+
+/*
+ * ll_remember_index - netlink callback that records one interface.
+ *
+ * Only RTM_NEWLINK messages carrying an IFLA_IFNAME attribute are used;
+ * everything else is ignored. The entry for the interface index is
+ * created in the idxmap hash table if needed and then updated with the
+ * type, flags, link-layer address and name from the message.
+ *
+ * Returns -1 if the message is too short to hold an ifinfomsg, else 0
+ * (including when memory for a new entry cannot be allocated).
+ */
+int ll_remember_index(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	int h;
+	struct ifinfomsg *ifi = NLMSG_DATA(n);
+	struct idxmap *im, **imp;
+	struct rtattr *tb[IFLA_MAX+1];
+	size_t nlen;
+
+	if (n->nlmsg_type != RTM_NEWLINK)
+		return 0;
+
+	/* sizeof(*ifi): the whole structure, not the size of a pointer */
+	if (n->nlmsg_len < NLMSG_LENGTH(sizeof(*ifi)))
+		return -1;
+
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), IFLA_PAYLOAD(n));
+	if (tb[IFLA_IFNAME] == NULL)
+		return 0;
+
+	h = ifi->ifi_index&0xF;
+
+	/* Find the entry; on a miss imp is left at the end of the chain. */
+	for (imp=&idxmap[h]; (im=*imp)!=NULL; imp = &im->next)
+		if (im->index == ifi->ifi_index)
+			break;
+
+	if (im == NULL) {
+		im = malloc(sizeof(*im));
+		if (im == NULL)
+			return 0;
+		im->next = *imp;
+		im->index = ifi->ifi_index;
+		*imp = im;
+	}
+
+	im->type = ifi->ifi_type;
+	im->flags = ifi->ifi_flags;
+	if (tb[IFLA_ADDRESS]) {
+		int alen;
+		im->alen = alen = RTA_PAYLOAD(tb[IFLA_ADDRESS]);
+		if (alen > sizeof(im->addr))
+			alen = sizeof(im->addr);
+		memcpy(im->addr, RTA_DATA(tb[IFLA_ADDRESS]), alen);
+	} else {
+		im->alen = 0;
+		memset(im->addr, 0, sizeof(im->addr));
+	}
+
+	/*
+	 * Copy the name bounded by both the attribute payload and the size
+	 * of im->name, and always terminate it; an unbounded strcpy() would
+	 * overflow im->name on an over-long or unterminated attribute.
+	 */
+	nlen = RTA_PAYLOAD(tb[IFLA_IFNAME]);
+	if (nlen >= sizeof(im->name))
+		nlen = sizeof(im->name) - 1;
+	memcpy(im->name, RTA_DATA(tb[IFLA_IFNAME]), nlen);
+	im->name[nlen] = '\0';
+	return 0;
+}
+
+/*
+ * Name of the interface with index idx. Index 0 yields "*". A remembered
+ * interface yields the name stored in the table. Any other index is
+ * formatted as "if<idx>" into buf (at most 16 bytes) and buf is returned.
+ */
+const char *ll_idx_n2a(int idx, char *buf)
+{
+	struct idxmap *ent;
+
+	if (idx == 0)
+		return "*";
+
+	ent = idxmap[idx&0xF];
+	while (ent != NULL) {
+		if (ent->index == idx)
+			return ent->name;
+		ent = ent->next;
+	}
+
+	snprintf(buf, 16, "if%d", idx);
+	return buf;
+}
+
+
+const char *ll_index_to_name(int idx) /* ll_idx_n2a() with a static scratch buffer supplied for the caller */
+{
+ static char nbuf[16]; /* shared by all calls, so a later call may overwrite an earlier result */
+
+ return ll_idx_n2a(idx, nbuf);
+}
+
+/*
+ * Hardware type recorded for interface index idx, or -1 when idx is 0 or
+ * the interface has not been remembered.
+ */
+int ll_index_to_type(int idx)
+{
+	struct idxmap *ent;
+
+	if (idx != 0) {
+		ent = idxmap[idx&0xF];
+		while (ent != NULL) {
+			if (ent->index == idx)
+				return ent->type;
+			ent = ent->next;
+		}
+	}
+	return -1;
+}
+
+/*
+ * Flags recorded for interface index idx, or 0 when idx is 0 or the
+ * interface has not been remembered.
+ */
+unsigned ll_index_to_flags(int idx)
+{
+	struct idxmap *ent;
+
+	if (idx != 0) {
+		ent = idxmap[idx&0xF];
+		while (ent != NULL) {
+			if (ent->index == idx)
+				return ent->flags;
+			ent = ent->next;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Index of the remembered interface called name, or 0 when name is NULL
+ * or unknown. The last successful lookup is kept in a one-entry static
+ * cache that is consulted before the table is scanned.
+ */
+int ll_name_to_index(char *name)
+{
+	static char ncache[16];
+	static int icache;
+	struct idxmap *ent;
+	int bucket;
+
+	if (name == NULL)
+		return 0;
+
+	if (icache && strcmp(name, ncache) == 0)
+		return icache;
+
+	/* Names are not hashed, so every bucket has to be searched. */
+	for (bucket = 0; bucket < 16; bucket++) {
+		for (ent = idxmap[bucket]; ent != NULL; ent = ent->next) {
+			if (strcmp(ent->name, name) != 0)
+				continue;
+			icache = ent->index;
+			strcpy(ncache, name);
+			return ent->index;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Fill the interface table: request a dump of all links and feed every
+ * reply to ll_remember_index(). A failure of either step terminates the
+ * process with exit(1); otherwise 0 is returned.
+ */
+int ll_init_map(struct rtnl_handle *rth)
+{
+	int rc;
+
+	rc = rtnl_wilddump_request(rth, AF_UNSPEC, RTM_GETLINK);
+	if (rc < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	rc = rtnl_dump_filter(rth, ll_remember_index, &idxmap, NULL, NULL);
+	if (rc < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	return 0;
+}
diff --git a/lib/ll_proto.c b/lib/ll_proto.c
index e69de29b..71f149dc 100644
--- a/lib/ll_proto.c
+++ b/lib/ll_proto.c
@@ -0,0 +1,127 @@
+/*
+ * ll_proto.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+
+
+#define __PF(f,n) { ETH_P_##f, #n },
+static struct { /* table pairing each ETH_P_* protocol id with its textual name */
+ int id; /* protocol id; ll_proto_n2a() compares it after ntohs(), ll_proto_a2n() stores it through htons() */
+ char *name; /* name as printed and as accepted on input */
+} llproto_names[] = {
+__PF(LOOP,loop)
+__PF(PUP,pup)
+#ifdef ETH_P_PUPAT
+__PF(PUPAT,pupat)
+#endif
+__PF(IP,ip)
+__PF(X25,x25)
+__PF(ARP,arp)
+__PF(BPQ,bpq)
+#ifdef ETH_P_IEEEPUP
+__PF(IEEEPUP,ieeepup)
+#endif
+#ifdef ETH_P_IEEEPUPAT
+__PF(IEEEPUPAT,ieeepupat)
+#endif
+__PF(DEC,dec)
+__PF(DNA_DL,dna_dl)
+__PF(DNA_RC,dna_rc)
+__PF(DNA_RT,dna_rt)
+__PF(LAT,lat)
+__PF(DIAG,diag)
+__PF(CUST,cust)
+__PF(SCA,sca)
+__PF(RARP,rarp)
+__PF(ATALK,atalk)
+__PF(AARP,aarp)
+__PF(IPX,ipx)
+__PF(IPV6,ipv6)
+#ifdef ETH_P_PPP_DISC
+__PF(PPP_DISC,ppp_disc)
+#endif
+#ifdef ETH_P_PPP_SES
+__PF(PPP_SES,ppp_ses)
+#endif
+#ifdef ETH_P_ATMMPOA
+__PF(ATMMPOA,atmmpoa)
+#endif
+#ifdef ETH_P_ATMFATE
+__PF(ATMFATE,atmfate)
+#endif
+
+__PF(802_3,802_3)
+__PF(AX25,ax25)
+__PF(ALL,all)
+__PF(802_2,802_2)
+__PF(SNAP,snap)
+__PF(DDCMP,ddcmp)
+__PF(WAN_PPP,wan_ppp)
+__PF(PPP_MP,ppp_mp)
+__PF(LOCALTALK,localtalk)
+__PF(PPPTALK,ppptalk)
+__PF(TR_802_2,tr_802_2)
+__PF(MOBITEX,mobitex)
+__PF(CONTROL,control)
+__PF(IRDA,irda)
+#ifdef ETH_P_ECONET
+__PF(ECONET,econet)
+#endif
+
+{ 0x8100, "802.1Q" }, /* given by raw value rather than through an ETH_P_* constant */
+{ ETH_P_IP, "ipv4" }, /* second name for ETH_P_IP; the "ip" entry above comes first in the table */
+};
+#undef __PF
+
+
+/*
+ * Name of the link-layer protocol id (given in network byte order). The
+ * first table entry with a matching id wins. An unknown id is formatted
+ * as "[<decimal>]" into buf (at most len bytes) and buf is returned.
+ */
+char * ll_proto_n2a(unsigned short id, char *buf, int len)
+{
+	const unsigned short host_id = ntohs(id);
+	const int count = sizeof(llproto_names)/sizeof(llproto_names[0]);
+	int idx = 0;
+
+	while (idx < count) {
+		if (llproto_names[idx].id == host_id)
+			return llproto_names[idx].name;
+		idx++;
+	}
+
+	snprintf(buf, len, "[%d]", host_id);
+	return buf;
+}
+
+/*
+ * Translate a protocol name (matched case-insensitively against the
+ * table) or a number parsed by get_u16() into *id, stored in network
+ * byte order. Returns 0 on success, -1 if buf is neither.
+ */
+int ll_proto_a2n(unsigned short *id, char *buf)
+{
+	const int count = sizeof(llproto_names)/sizeof(llproto_names[0]);
+	int idx;
+
+	for (idx = 0; idx < count; idx++) {
+		if (strcasecmp(llproto_names[idx].name, buf) != 0)
+			continue;
+		*id = htons(llproto_names[idx].id);
+		return 0;
+	}
+
+	/* not a known name: try it as a number */
+	if (get_u16(id, buf, 0) != 0)
+		return -1;
+
+	*id = htons(*id);
+	return 0;
+}
diff --git a/lib/ll_types.c b/lib/ll_types.c
index e69de29b..165ecfa6 100644
--- a/lib/ll_types.c
+++ b/lib/ll_types.c
@@ -0,0 +1,128 @@
+/*
+ * ll_types.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+char * ll_type_n2a(int type, char *buf, int len)
+{ /*
+ * Return the name of the ARPHRD_* hardware type `type'. The name comes
+ * from the static table below. A type that is not listed is formatted
+ * as "[<type>]" into buf (at most len bytes) and buf is returned.
+ */
+#define __PF(f,n) { ARPHRD_##f, #n },
+static struct {
+ int type; /* ARPHRD_* value */
+ char *name; /* name returned for that value */
+} arphrd_names[] = {
+{ 0, "generic" },
+__PF(ETHER,ether)
+__PF(EETHER,eether)
+__PF(AX25,ax25)
+__PF(PRONET,pronet)
+__PF(CHAOS,chaos)
+#ifdef ARPHRD_IEEE802_TR
+__PF(IEEE802,ieee802)
+#else
+__PF(IEEE802,tr)
+#endif
+__PF(ARCNET,arcnet)
+__PF(APPLETLK,atalk)
+__PF(DLCI,dlci)
+#ifdef ARPHRD_ATM
+__PF(ATM,atm)
+#endif
+__PF(METRICOM,metricom)
+#ifdef ARPHRD_IEEE1394
+__PF(IEEE1394,ieee1394)
+#endif
+
+__PF(SLIP,slip)
+__PF(CSLIP,cslip)
+__PF(SLIP6,slip6)
+__PF(CSLIP6,cslip6)
+__PF(RSRVD,rsrvd)
+__PF(ADAPT,adapt)
+__PF(ROSE,rose)
+__PF(X25,x25)
+#ifdef ARPHRD_HWX25
+__PF(HWX25,hwx25)
+#endif
+__PF(PPP,ppp)
+__PF(HDLC,hdlc)
+__PF(LAPB,lapb)
+#ifdef ARPHRD_DDCMP
+__PF(DDCMP,ddcmp)
+#endif
+#ifdef ARPHRD_RAWHDLC
+__PF(RAWHDLC,rawhdlc)
+#endif
+
+__PF(TUNNEL,ipip)
+__PF(TUNNEL6,tunnel6)
+__PF(FRAD,frad)
+__PF(SKIP,skip)
+__PF(LOOPBACK,loopback)
+__PF(LOCALTLK,ltalk)
+__PF(FDDI,fddi)
+__PF(BIF,bif)
+__PF(SIT,sit)
+__PF(IPDDP,ip/ddp)
+__PF(IPGRE,gre)
+__PF(PIMREG,pimreg)
+__PF(HIPPI,hippi)
+__PF(ASH,ash)
+__PF(ECONET,econet)
+__PF(IRDA,irda)
+__PF(FCPP,fcpp)
+__PF(FCAL,fcal)
+__PF(FCPL,fcpl)
+__PF(FCFABRIC,fcfb0)
+__PF(FCFABRIC+1,fcfb1)
+__PF(FCFABRIC+2,fcfb2)
+__PF(FCFABRIC+3,fcfb3)
+__PF(FCFABRIC+4,fcfb4)
+__PF(FCFABRIC+5,fcfb5)
+__PF(FCFABRIC+6,fcfb6)
+__PF(FCFABRIC+7,fcfb7)
+__PF(FCFABRIC+8,fcfb8)
+__PF(FCFABRIC+9,fcfb9)
+__PF(FCFABRIC+10,fcfb10)
+__PF(FCFABRIC+11,fcfb11)
+__PF(FCFABRIC+12,fcfb12)
+#ifdef ARPHRD_IEEE802_TR
+__PF(IEEE802_TR,tr)
+#endif
+#ifdef ARPHRD_IEEE80211
+__PF(IEEE80211,ieee802.11)
+#endif
+#ifdef ARPHRD_VOID
+__PF(VOID,void)
+#endif
+};
+#undef __PF
+
+ int i;
+ for (i=0; i<sizeof(arphrd_names)/sizeof(arphrd_names[0]); i++) { /* linear search; the first match wins */
+ if (arphrd_names[i].type == type)
+ return arphrd_names[i].name;
+ }
+ snprintf(buf, len, "[%d]", type); /* not in the table: print the number */
+ return buf;
+}
diff --git a/lib/rt_names.c b/lib/rt_names.c
index e69de29b..429f73e9 100644
--- a/lib/rt_names.c
+++ b/lib/rt_names.c
@@ -0,0 +1,388 @@
+/*
+ * rt_names.c rtnetlink names DB.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+
+/*
+ * Load "<id> <name>" pairs from a database file into a name table.
+ *
+ * file: path of the database; a file that cannot be opened is ignored.
+ * tab:  array of 'size' name pointers indexed by id.
+ * size: number of slots in tab; ids outside [0, size) are skipped.
+ *
+ * Ids may be decimal or 0x-prefixed hexadecimal.  Empty lines and lines
+ * whose first non-blank character is '#' are skipped.  Parsing stops at
+ * the first malformed line, with a message on stderr.  Names are stored
+ * as strdup() copies that are never freed.
+ */
+static void rtnl_tab_initialize(char *file, char **tab, int size)
+{
+	char buf[512];
+	FILE *fp;
+
+	fp = fopen(file, "r");
+	if (!fp)
+		return;
+	while (fgets(buf, sizeof(buf), fp)) {
+		char *p = buf;
+		int id;
+		char namebuf[512];
+
+		while (*p == ' ' || *p == '\t')
+			p++;
+		if (*p == '#' || *p == '\n' || *p == 0)
+			continue;
+		if (sscanf(p, "0x%x %s\n", &id, namebuf) != 2 &&
+		    sscanf(p, "0x%x %s #", &id, namebuf) != 2 &&
+		    sscanf(p, "%d %s\n", &id, namebuf) != 2 &&
+		    sscanf(p, "%d %s #", &id, namebuf) != 2) {
+			fprintf(stderr, "Database %s is corrupted at %s\n",
+				file, p);
+			/* Leave the loop so the stream is closed below. */
+			break;
+		}
+
+		/* tab has exactly 'size' slots, so id == size is out of range. */
+		if (id<0 || id>=size)
+			continue;
+
+		tab[id] = strdup(namebuf);
+	}
+	fclose(fp);
+}
+
+
+/* Built-in routing protocol names, indexed by protocol id (0..255).
+ * Slots left NULL have no built-in name. */
+static char * rtnl_rtprot_tab[256] = {
+ "none",
+ "redirect",
+ "kernel",
+ "boot",
+ "static",
+ NULL,
+ NULL,
+ NULL,
+ "gated",
+ "ra",
+ "mrt",
+ "zebra",
+ "bird",
+};
+
+
+
+/* Non-zero once /etc/iproute2/rt_protos has been consulted. */
+static int rtnl_rtprot_init;
+
+/* Merge the entries of /etc/iproute2/rt_protos into rtnl_rtprot_tab. */
+static void rtnl_rtprot_initialize(void)
+{
+ rtnl_rtprot_init = 1;
+ rtnl_tab_initialize("/etc/iproute2/rt_protos",
+ rtnl_rtprot_tab, 256);
+}
+
+/*
+ * Translate a routing protocol id to its name.
+ *
+ * Returns the table entry when one exists for id; otherwise formats id
+ * as decimal into buf (at most len bytes) and returns buf.
+ */
+char * rtnl_rtprot_n2a(int id, char *buf, int len)
+{
+	char *name = NULL;
+
+	if (id >= 0 && id < 256) {
+		/* Read the database lazily, only when the slot is still empty. */
+		if (!rtnl_rtprot_tab[id] && !rtnl_rtprot_init)
+			rtnl_rtprot_initialize();
+		name = rtnl_rtprot_tab[id];
+	}
+	if (name)
+		return name;
+
+	snprintf(buf, len, "%d", id);
+	return buf;
+}
+
+/*
+ * Translate a routing protocol name or number to its id.
+ *
+ * arg is first compared with the name matched by the previous successful
+ * name lookup, then with every table entry, and finally parsed as a
+ * number (base chosen by strtoul prefix rules) in the range 0..255.
+ *
+ * Returns 0 and stores the id in *id, or -1 if arg is neither a known
+ * name nor a valid number.
+ */
+int rtnl_rtprot_a2n(__u32 *id, char *arg)
+{
+	static char *cache = NULL;
+	static unsigned long res;
+	unsigned long num;
+	char *end;
+	int i;
+
+	if (cache && strcmp(cache, arg) == 0) {
+		*id = res;
+		return 0;
+	}
+
+	if (!rtnl_rtprot_init)
+		rtnl_rtprot_initialize();
+
+	for (i=0; i<256; i++) {
+		if (rtnl_rtprot_tab[i] &&
+		    strcmp(rtnl_rtprot_tab[i], arg) == 0) {
+			cache = rtnl_rtprot_tab[i];
+			res = i;
+			*id = res;
+			return 0;
+		}
+	}
+
+	/* Parse into a local: 'res' must keep describing the name in 'cache',
+	 * otherwise the next cache hit would return this number instead. */
+	num = strtoul(arg, &end, 0);
+	if (!end || end == arg || *end || num > 255)
+		return -1;
+	*id = num;
+	return 0;
+}
+
+
+
+/* Built-in route scope names, indexed by scope id (0..255).  Only id 0
+ * is named statically; the remaining built-ins are filled in by
+ * rtnl_rtscope_initialize(). */
+static char * rtnl_rtscope_tab[256] = {
+ "global",
+};
+
+/* Non-zero once rtnl_rtscope_initialize() has run. */
+static int rtnl_rtscope_init;
+
+/* Install the remaining built-in scope names, then merge the entries of
+ * /etc/iproute2/rt_scopes on top of them. */
+static void rtnl_rtscope_initialize(void)
+{
+ rtnl_rtscope_init = 1;
+ rtnl_rtscope_tab[255] = "nowhere";
+ rtnl_rtscope_tab[254] = "host";
+ rtnl_rtscope_tab[253] = "link";
+ rtnl_rtscope_tab[200] = "site";
+ rtnl_tab_initialize("/etc/iproute2/rt_scopes",
+ rtnl_rtscope_tab, 256);
+}
+
+/*
+ * Translate a route scope id to its name.
+ *
+ * Returns the table entry when one exists for id; otherwise formats id
+ * as decimal into buf (at most len bytes) and returns buf.
+ */
+char * rtnl_rtscope_n2a(int id, char *buf, int len)
+{
+	char *name = NULL;
+
+	if (id >= 0 && id < 256) {
+		/* Initialize lazily, only when the slot is still empty. */
+		if (!rtnl_rtscope_tab[id] && !rtnl_rtscope_init)
+			rtnl_rtscope_initialize();
+		name = rtnl_rtscope_tab[id];
+	}
+	if (name)
+		return name;
+
+	snprintf(buf, len, "%d", id);
+	return buf;
+}
+
+/*
+ * Translate a route scope name or number to its id.
+ *
+ * arg is first compared with the name matched by the previous successful
+ * name lookup, then with every table entry, and finally parsed as a
+ * number (base chosen by strtoul prefix rules) in the range 0..255.
+ *
+ * Returns 0 and stores the id in *id, or -1 if arg is neither a known
+ * name nor a valid number.
+ */
+int rtnl_rtscope_a2n(__u32 *id, char *arg)
+{
+	static char *cache = NULL;
+	static unsigned long res;
+	unsigned long num;
+	char *end;
+	int i;
+
+	if (cache && strcmp(cache, arg) == 0) {
+		*id = res;
+		return 0;
+	}
+
+	if (!rtnl_rtscope_init)
+		rtnl_rtscope_initialize();
+
+	for (i=0; i<256; i++) {
+		if (rtnl_rtscope_tab[i] &&
+		    strcmp(rtnl_rtscope_tab[i], arg) == 0) {
+			cache = rtnl_rtscope_tab[i];
+			res = i;
+			*id = res;
+			return 0;
+		}
+	}
+
+	/* Parse into a local: 'res' must keep describing the name in 'cache',
+	 * otherwise the next cache hit would return this number instead. */
+	num = strtoul(arg, &end, 0);
+	if (!end || end == arg || *end || num > 255)
+		return -1;
+	*id = num;
+	return 0;
+}
+
+
+
+/* Realm names, indexed by realm id (0..255); only id 0 has a built-in name. */
+static char * rtnl_rtrealm_tab[256] = {
+ "unknown",
+};
+
+/* Non-zero once /etc/iproute2/rt_realms has been consulted. */
+static int rtnl_rtrealm_init;
+
+/* Merge the entries of /etc/iproute2/rt_realms into rtnl_rtrealm_tab. */
+static void rtnl_rtrealm_initialize(void)
+{
+ rtnl_rtrealm_init = 1;
+ rtnl_tab_initialize("/etc/iproute2/rt_realms",
+ rtnl_rtrealm_tab, 256);
+}
+
+/*
+ * Translate a realm id to its name.
+ *
+ * Returns the table entry when one exists for id; otherwise formats id
+ * as decimal into buf (at most len bytes) and returns buf.
+ */
+char * rtnl_rtrealm_n2a(int id, char *buf, int len)
+{
+	char *name = NULL;
+
+	if (id >= 0 && id < 256) {
+		/* Read the database lazily, only when the slot is still empty. */
+		if (!rtnl_rtrealm_tab[id] && !rtnl_rtrealm_init)
+			rtnl_rtrealm_initialize();
+		name = rtnl_rtrealm_tab[id];
+	}
+	if (name)
+		return name;
+
+	snprintf(buf, len, "%d", id);
+	return buf;
+}
+
+
+/*
+ * Translate a realm name or number to its id.
+ *
+ * arg is first compared with the name matched by the previous successful
+ * name lookup, then with every table entry, and finally parsed as a
+ * number (base chosen by strtoul prefix rules) in the range 0..255.
+ *
+ * Returns 0 and stores the id in *id, or -1 if arg is neither a known
+ * name nor a valid number.
+ */
+int rtnl_rtrealm_a2n(__u32 *id, char *arg)
+{
+	static char *cache = NULL;
+	static unsigned long res;
+	unsigned long num;
+	char *end;
+	int i;
+
+	if (cache && strcmp(cache, arg) == 0) {
+		*id = res;
+		return 0;
+	}
+
+	if (!rtnl_rtrealm_init)
+		rtnl_rtrealm_initialize();
+
+	for (i=0; i<256; i++) {
+		if (rtnl_rtrealm_tab[i] &&
+		    strcmp(rtnl_rtrealm_tab[i], arg) == 0) {
+			cache = rtnl_rtrealm_tab[i];
+			res = i;
+			*id = res;
+			return 0;
+		}
+	}
+
+	/* Parse into a local: 'res' must keep describing the name in 'cache',
+	 * otherwise the next cache hit would return this number instead. */
+	num = strtoul(arg, &end, 0);
+	if (!end || end == arg || *end || num > 255)
+		return -1;
+	*id = num;
+	return 0;
+}
+
+
+
+/* Routing table names, indexed by table id (0..255).  Only id 0 is named
+ * statically; the remaining built-ins are filled in by
+ * rtnl_rttable_initialize(). */
+static char * rtnl_rttable_tab[256] = {
+ "unspec",
+};
+
+/* Non-zero once rtnl_rttable_initialize() has run. */
+static int rtnl_rttable_init;
+
+/* Install the remaining built-in table names, then merge the entries of
+ * /etc/iproute2/rt_tables on top of them. */
+static void rtnl_rttable_initialize(void)
+{
+ rtnl_rttable_init = 1;
+ rtnl_rttable_tab[255] = "local";
+ rtnl_rttable_tab[254] = "main";
+ rtnl_tab_initialize("/etc/iproute2/rt_tables",
+ rtnl_rttable_tab, 256);
+}
+
+/*
+ * Translate a routing table id to its name.
+ *
+ * Returns the table entry when one exists for id; otherwise formats id
+ * as decimal into buf (at most len bytes) and returns buf.
+ */
+char * rtnl_rttable_n2a(int id, char *buf, int len)
+{
+	char *name = NULL;
+
+	if (id >= 0 && id < 256) {
+		/* Initialize lazily, only when the slot is still empty. */
+		if (!rtnl_rttable_tab[id] && !rtnl_rttable_init)
+			rtnl_rttable_initialize();
+		name = rtnl_rttable_tab[id];
+	}
+	if (name)
+		return name;
+
+	snprintf(buf, len, "%d", id);
+	return buf;
+}
+
+/*
+ * Translate a routing table name or number to its id.
+ *
+ * arg is first compared with the name matched by the previous successful
+ * name lookup, then with every table entry, and finally parsed as a
+ * number (base chosen by strtoul prefix rules) in the range 0..255.
+ *
+ * Returns 0 and stores the id in *id, or -1 if arg is neither a known
+ * name nor a valid number.
+ */
+int rtnl_rttable_a2n(__u32 *id, char *arg)
+{
+	static char *cache = NULL;
+	static unsigned long res;
+	unsigned long num;
+	char *end;
+	int i;
+
+	if (cache && strcmp(cache, arg) == 0) {
+		*id = res;
+		return 0;
+	}
+
+	if (!rtnl_rttable_init)
+		rtnl_rttable_initialize();
+
+	for (i=0; i<256; i++) {
+		if (rtnl_rttable_tab[i] &&
+		    strcmp(rtnl_rttable_tab[i], arg) == 0) {
+			cache = rtnl_rttable_tab[i];
+			res = i;
+			*id = res;
+			return 0;
+		}
+	}
+
+	/* Keep the full unsigned result: narrowing it to int let wrapped or
+	 * negative inputs slip past the range check.  'res' is left alone so
+	 * it keeps describing the name in 'cache'. */
+	num = strtoul(arg, &end, 0);
+	if (!end || end == arg || *end || num > 255)
+		return -1;
+	*id = num;
+	return 0;
+}
+
+
+/* DS field names, indexed by value (0..255); only value 0 has a built-in name. */
+static char * rtnl_rtdsfield_tab[256] = {
+ "0",
+};
+
+/* Non-zero once /etc/iproute2/rt_dsfield has been consulted. */
+static int rtnl_rtdsfield_init;
+
+/* Merge the entries of /etc/iproute2/rt_dsfield into rtnl_rtdsfield_tab. */
+static void rtnl_rtdsfield_initialize(void)
+{
+ rtnl_rtdsfield_init = 1;
+ rtnl_tab_initialize("/etc/iproute2/rt_dsfield",
+ rtnl_rtdsfield_tab, 256);
+}
+
+/*
+ * Translate a DS field value to its name.
+ *
+ * Values outside 0..255 are formatted as decimal; in-range values with
+ * no table entry are formatted as two-digit hexadecimal ("0x%02x").
+ * Formatted text is written to buf (at most len bytes) and buf is
+ * returned; otherwise the table entry is returned.
+ */
+char * rtnl_dsfield_n2a(int id, char *buf, int len)
+{
+	char *name;
+
+	if (id < 0 || id >= 256) {
+		snprintf(buf, len, "%d", id);
+		return buf;
+	}
+
+	/* Read the database lazily, only when the slot is still empty. */
+	if (!rtnl_rtdsfield_tab[id] && !rtnl_rtdsfield_init)
+		rtnl_rtdsfield_initialize();
+
+	name = rtnl_rtdsfield_tab[id];
+	if (!name) {
+		snprintf(buf, len, "0x%02x", id);
+		name = buf;
+	}
+	return name;
+}
+
+
+/*
+ * Translate a DS field name or number to its value.
+ *
+ * arg is first compared with the name matched by the previous successful
+ * name lookup, then with every table entry, and finally parsed as a
+ * hexadecimal number in the range 0..255.
+ *
+ * Returns 0 and stores the value in *id, or -1 if arg is neither a known
+ * name nor a valid number.
+ */
+int rtnl_dsfield_a2n(__u32 *id, char *arg)
+{
+	static char *cache = NULL;
+	static unsigned long res;
+	unsigned long num;
+	char *end;
+	int i;
+
+	if (cache && strcmp(cache, arg) == 0) {
+		*id = res;
+		return 0;
+	}
+
+	if (!rtnl_rtdsfield_init)
+		rtnl_rtdsfield_initialize();
+
+	for (i=0; i<256; i++) {
+		if (rtnl_rtdsfield_tab[i] &&
+		    strcmp(rtnl_rtdsfield_tab[i], arg) == 0) {
+			cache = rtnl_rtdsfield_tab[i];
+			res = i;
+			*id = res;
+			return 0;
+		}
+	}
+
+	/* Parse into a local: 'res' must keep describing the name in 'cache',
+	 * otherwise the next cache hit would return this number instead. */
+	num = strtoul(arg, &end, 16);
+	if (!end || end == arg || *end || num > 255)
+		return -1;
+	*id = num;
+	return 0;
+}
+
diff --git a/lib/utils.c b/lib/utils.c
index e69de29b..6763be2a 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -0,0 +1,528 @@
+/*
+ * utils.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *
+ * Changes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <string.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <resolv.h>
+#include <linux/pkt_sched.h>
+
+#include "utils.h"
+
+/*
+ * Parse arg as a signed integer in the given base.
+ * Returns 0 and stores the value in *val; returns -1 if arg is NULL or
+ * empty, is not entirely a number, or does not fit in an int.
+ */
+int get_integer(int *val, char *arg, int base)
+{
+	char *end = NULL;
+	long parsed;
+
+	if (arg == NULL || *arg == '\0')
+		return -1;
+
+	parsed = strtol(arg, &end, base);
+	/* Require at least one digit and nothing after the number. */
+	if (end == NULL || end == arg || *end != '\0')
+		return -1;
+	if (parsed > INT_MAX || parsed < INT_MIN)
+		return -1;
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Parse arg as an unsigned integer in the given base.
+ * Returns 0 and stores the value in *val; returns -1 if arg is NULL or
+ * empty, is not entirely a number, or exceeds UINT_MAX.
+ */
+int get_unsigned(unsigned *val, char *arg, int base)
+{
+	char *end = NULL;
+	unsigned long parsed;
+
+	if (arg == NULL || *arg == '\0')
+		return -1;
+
+	parsed = strtoul(arg, &end, base);
+	/* Require at least one digit and nothing after the number. */
+	if (end == NULL || end == arg || *end != '\0')
+		return -1;
+	if (parsed > UINT_MAX)
+		return -1;
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Parse arg as an unsigned 32-bit integer in the given base.
+ * Returns 0 and stores the value in *val; returns -1 if arg is NULL or
+ * empty, is not entirely a number, or exceeds 0xFFFFFFFF.
+ */
+int get_u32(__u32 *val, char *arg, int base)
+{
+	char *end = NULL;
+	unsigned long parsed;
+
+	if (arg == NULL || *arg == '\0')
+		return -1;
+
+	parsed = strtoul(arg, &end, base);
+	/* Require at least one digit and nothing after the number. */
+	if (end == NULL || end == arg || *end != '\0')
+		return -1;
+	if (parsed > 0xFFFFFFFFUL)
+		return -1;
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Parse arg as an unsigned 16-bit integer in the given base.
+ * Returns 0 and stores the value in *val; returns -1 if arg is NULL or
+ * empty, is not entirely a number, or exceeds 0xFFFF.
+ */
+int get_u16(__u16 *val, char *arg, int base)
+{
+	char *end = NULL;
+	unsigned long parsed;
+
+	if (arg == NULL || *arg == '\0')
+		return -1;
+
+	parsed = strtoul(arg, &end, base);
+	/* Require at least one digit and nothing after the number. */
+	if (end == NULL || end == arg || *end != '\0')
+		return -1;
+	if (parsed > 0xFFFF)
+		return -1;
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Parse arg as an unsigned 8-bit integer in the given base.
+ * Returns 0 and stores the value in *val; returns -1 if arg is NULL or
+ * empty, is not entirely a number, or exceeds 0xFF.
+ */
+int get_u8(__u8 *val, char *arg, int base)
+{
+	char *end = NULL;
+	unsigned long parsed;
+
+	if (arg == NULL || *arg == '\0')
+		return -1;
+
+	parsed = strtoul(arg, &end, base);
+	/* Require at least one digit and nothing after the number. */
+	if (end == NULL || end == arg || *end != '\0')
+		return -1;
+	if (parsed > 0xFF)
+		return -1;
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Parse arg as a signed 16-bit integer in the given base.
+ * Returns 0 and stores the value in *val; returns -1 if arg is NULL or
+ * empty, is not entirely a number, or lies outside -0x8000..0x7FFF.
+ */
+int get_s16(__s16 *val, char *arg, int base)
+{
+	char *end = NULL;
+	long parsed;
+
+	if (arg == NULL || *arg == '\0')
+		return -1;
+
+	parsed = strtol(arg, &end, base);
+	/* Require at least one digit and nothing after the number. */
+	if (end == NULL || end == arg || *end != '\0')
+		return -1;
+	if (parsed > 0x7FFF || parsed < -0x8000)
+		return -1;
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Parse arg as a signed 8-bit integer in the given base.
+ * Returns 0 and stores the value in *val; returns -1 if arg is NULL or
+ * empty, is not entirely a number, or lies outside -0x80..0x7F.
+ */
+int get_s8(__s8 *val, char *arg, int base)
+{
+	char *end = NULL;
+	long parsed;
+
+	if (arg == NULL || *arg == '\0')
+		return -1;
+
+	parsed = strtol(arg, &end, base);
+	/* Require at least one digit and nothing after the number. */
+	if (end == NULL || end == arg || *end != '\0')
+		return -1;
+	if (parsed > 0x7F || parsed < -0x80)
+		return -1;
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Parse a textual address into *addr.
+ *
+ * addr:   result; zeroed first, then family, bytelen, bitlen (-1) and
+ *         data are filled in.
+ * name:   "default", "all" or "any" (all-zero address of the requested
+ *         family), an IPv6 address (anything containing ':'), a DECnet
+ *         address when family is AF_DECnet, or a dotted IPv4 address.
+ *         Fewer than four IPv4 components are accepted; the missing
+ *         trailing bytes stay zero.
+ * family: AF_UNSPEC or the family the caller requires.
+ *
+ * Returns 0 on success, -1 if name cannot be parsed or does not belong
+ * to the requested family.
+ */
+int get_addr_1(inet_prefix *addr, char *name, int family)
+{
+	char *cp;
+	unsigned char *ap = (unsigned char*)addr->data;
+	int i;
+
+	memset(addr, 0, sizeof(*addr));
+
+	if (strcmp(name, "default") == 0 ||
+	    strcmp(name, "all") == 0 ||
+	    strcmp(name, "any") == 0) {
+		if (family == AF_DECnet)
+			return -1;
+		addr->family = family;
+		addr->bytelen = (family == AF_INET6 ? 16 : 4);
+		addr->bitlen = -1;
+		return 0;
+	}
+
+	if (strchr(name, ':')) {
+		addr->family = AF_INET6;
+		if (family != AF_UNSPEC && family != AF_INET6)
+			return -1;
+		if (inet_pton(AF_INET6, name, addr->data) <= 0)
+			return -1;
+		addr->bytelen = 16;
+		addr->bitlen = -1;
+		return 0;
+	}
+
+	if (family == AF_DECnet) {
+		struct dn_naddr dna;
+		addr->family = AF_DECnet;
+		if (dnet_pton(AF_DECnet, name, &dna) <= 0)
+			return -1;
+		memcpy(addr->data, dna.a_addr, 2);
+		addr->bytelen = 2;
+		addr->bitlen = -1;
+		return 0;
+	}
+
+	addr->family = AF_INET;
+	if (family != AF_UNSPEC && family != AF_INET)
+		return -1;
+	addr->bytelen = 4;
+	addr->bitlen = -1;
+	for (cp=name, i=0; *cp; cp++) {
+		if (*cp <= '9' && *cp >= '0') {
+			unsigned octet = 10*ap[i] + (*cp-'0');
+
+			/* Each component is one byte: reject values that
+			 * would otherwise wrap silently (e.g. "300"). */
+			if (octet > 255)
+				return -1;
+			ap[i] = octet;
+			continue;
+		}
+		if (*cp == '.' && ++i <= 3)
+			continue;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Parse "ADDRESS[/LENGTH]" into *dst.
+ *
+ * "default", "any" and "all" yield a zero-length prefix of the requested
+ * family.  Without "/LENGTH" the prefix length is the full address width
+ * (128 for AF_INET6, 16 for AF_DECnet, 32 otherwise).  arg is modified
+ * temporarily while parsing and restored before returning.
+ *
+ * Returns 0 on success, -1 if the address is invalid or the length is
+ * not a number in 0..address width.
+ */
+int get_prefix_1(inet_prefix *dst, char *arg, int family)
+{
+	int err;
+	unsigned plen;
+	char *slash;
+
+	memset(dst, 0, sizeof(*dst));
+
+	if (strcmp(arg, "default") == 0 ||
+	    strcmp(arg, "any") == 0 ||
+	    strcmp(arg, "all") == 0) {
+		if (family == AF_DECnet)
+			return -1;
+		dst->family = family;
+		dst->bytelen = 0;
+		dst->bitlen = 0;
+		return 0;
+	}
+
+	slash = strchr(arg, '/');
+	if (slash)
+		*slash = 0;
+	err = get_addr_1(dst, arg, family);
+	if (err == 0) {
+		switch(dst->family) {
+		case AF_INET6:
+			dst->bitlen = 128;
+			break;
+		case AF_DECnet:
+			dst->bitlen = 16;
+			break;
+		default:
+		case AF_INET:
+			dst->bitlen = 32;
+		}
+		if (slash) {
+			/* plen is unsigned, so use the unsigned parser rather
+			 * than handing get_integer() a mismatched pointer. */
+			if (get_unsigned(&plen, slash+1, 0) || plen > dst->bitlen) {
+				err = -1;
+				goto done;
+			}
+			dst->bitlen = plen;
+		}
+	}
+done:
+	if (slash)
+		*slash = '/';
+	return err;
+}
+
+/*
+ * Parse an address with get_addr_1(), terminating the program with a
+ * message on stderr if family is AF_PACKET or arg cannot be parsed.
+ * Returns 0 whenever it returns.
+ */
+int get_addr(inet_prefix *dst, char *arg, int family)
+{
+	int rc;
+
+	if (family == AF_PACKET) {
+		fprintf(stderr, "Error: \"%s\" may be inet address, but it is not allowed in this context.\n", arg);
+		exit(1);
+	}
+
+	rc = get_addr_1(dst, arg, family);
+	if (rc != 0) {
+		fprintf(stderr, "Error: an inet address is expected rather than \"%s\".\n", arg);
+		exit(1);
+	}
+	return 0;
+}
+
+/*
+ * Parse a prefix with get_prefix_1(), terminating the program with a
+ * message on stderr if family is AF_PACKET or arg cannot be parsed.
+ * Returns 0 whenever it returns.
+ */
+int get_prefix(inet_prefix *dst, char *arg, int family)
+{
+	int rc;
+
+	if (family == AF_PACKET) {
+		fprintf(stderr, "Error: \"%s\" may be inet prefix, but it is not allowed in this context.\n", arg);
+		exit(1);
+	}
+
+	rc = get_prefix_1(dst, arg, family);
+	if (rc != 0) {
+		fprintf(stderr, "Error: an inet prefix is expected rather than \"%s\".\n", arg);
+		exit(1);
+	}
+	return 0;
+}
+
+/*
+ * Parse name as an AF_INET address and return the first 32-bit word of
+ * the parsed data.  Terminates the program with a message on stderr if
+ * name is not a valid address.
+ */
+__u32 get_addr32(char *name)
+{
+	inet_prefix parsed;
+	int rc = get_addr_1(&parsed, name, AF_INET);
+
+	if (rc != 0) {
+		fprintf(stderr, "Error: an IP address is expected rather than \"%s\"\n", name);
+		exit(1);
+	}
+	return parsed.data[0];
+}
+
+/* Report a truncated command line on stderr and exit with status -1. */
+void incomplete_command()
+{
+ fprintf(stderr, "Command line is not complete. Try option \"help\"\n");
+ exit(-1);
+}
+
+/* Report that argument arg is wrong, with explanation msg, and exit with status -1. */
+void invarg(char *msg, char *arg)
+{
+ fprintf(stderr, "Error: argument \"%s\" is wrong: %s\n", arg, msg);
+ exit(-1);
+}
+
+/* Report that key was given twice, arg being the second value, and exit with status -1. */
+void duparg(char *key, char *arg)
+{
+ fprintf(stderr, "Error: duplicate \"%s\": \"%s\" is the second value.\n", key, arg);
+ exit(-1);
+}
+
+/* Report that key is either duplicated or arg is unexpected, and exit with status -1. */
+void duparg2(char *key, char *arg)
+{
+ fprintf(stderr, "Error: either \"%s\" is duplicate, or \"%s\" is a garbage.\n", key, arg);
+ exit(-1);
+}
+
+/*
+ * Test whether cmd is a prefix of pattern.
+ * Returns 0 if it is (an empty cmd matches anything), -1 if cmd is
+ * longer than pattern, otherwise the non-zero memcmp() result.
+ */
+int matches(char *cmd, char *pattern)
+{
+	int cmdlen = strlen(cmd);
+
+	/* cmd can only be a prefix if it is no longer than pattern. */
+	if (strlen(pattern) < cmdlen)
+		return -1;
+
+	return memcmp(pattern, cmd, cmdlen);
+}
+
+/*
+ * Compare the leading 'bits' bits of two addresses.
+ * Returns 0 if they agree, -1 if a whole 32-bit word differs, and 1 if
+ * only the trailing partial word differs.
+ */
+int inet_addr_match(inet_prefix *a, inet_prefix *b, int bits)
+{
+	__u32 *lhs = a->data;
+	__u32 *rhs = b->data;
+	int whole = bits >> 0x05;	/* complete 32-bit words */
+	int rest = bits & 0x1f;		/* leftover bits in the next word */
+	__u32 mask;
+
+	if (whole != 0 && memcmp(lhs, rhs, whole << 2) != 0)
+		return -1;
+
+	if (rest == 0)
+		return 0;
+
+	/* The mask selects the top 'rest' bits of the word as stored. */
+	mask = htonl((0xffffffff) << (0x20 - rest));
+	if ((lhs[whole] ^ rhs[whole]) & mask)
+		return 1;
+
+	return 0;
+}
+
+int __iproute2_hz_internal;
+
+/*
+ * Determine the tick rate to use.
+ *
+ * The HZ environment variable wins when set (a value that atoi() turns
+ * into 0 falls back to the compile-time HZ).  Otherwise the psched file
+ * is read: its path is $PROC_NET_PSCHED, else $PROC_ROOT/net/psched,
+ * else /proc/net/psched.  If its third field is 1000000 the fourth
+ * field is returned when non-zero; in every other case HZ is returned.
+ */
+int __get_hz(void)
+{
+	char name[1024];
+	const char *env;
+	unsigned nom, denom;
+	int hz = 0;
+	FILE *fp;
+
+	env = getenv("HZ");
+	if (env) {
+		int forced = atoi(env);
+
+		return forced ? forced : HZ;
+	}
+
+	if ((env = getenv("PROC_NET_PSCHED")) != NULL)
+		snprintf(name, sizeof(name)-1, "%s", env);
+	else if ((env = getenv("PROC_ROOT")) != NULL)
+		snprintf(name, sizeof(name)-1, "%s/net/psched", env);
+	else
+		strcpy(name, "/proc/net/psched");
+
+	fp = fopen(name, "r");
+	if (fp) {
+		if (fscanf(fp, "%*08x%*08x%08x%08x", &nom, &denom) == 2 &&
+		    nom == 1000000)
+			hz = denom;
+		fclose(fp);
+	}
+
+	return hz ? hz : HZ;
+}
+
+/*
+ * Format a binary address of family af into buf (buflen bytes).
+ * AF_INET and AF_INET6 go through inet_ntop(), AF_IPX through
+ * ipx_ntop(), AF_DECnet through dnet_ntop(); any other family yields
+ * the constant string "???".  The len argument is not used.
+ */
+const char *rt_addr_n2a(int af, int len, void *addr, char *buf, int buflen)
+{
+	if (af == AF_INET || af == AF_INET6)
+		return inet_ntop(af, addr, buf, buflen);
+
+	if (af == AF_IPX)
+		return ipx_ntop(af, addr, buf, buflen);
+
+	if (af == AF_DECnet) {
+		/* Wrap the two address bytes in the structure dnet_ntop() takes. */
+		struct dn_naddr dna = { 2, { 0, 0, }};
+
+		memcpy(dna.a_addr, addr, 2);
+		return dnet_ntop(af, &dna, buf, buflen);
+	}
+
+	return "???";
+}
+
+#ifdef RESOLVE_HOSTNAMES
+/* One cached reverse lookup; name stays NULL when the lookup failed. */
+struct namerec
+{
+	struct namerec *next;
+	inet_prefix addr;
+	char *name;
+};
+
+/* Cache of lookups, chained per bucket; the index is an 8-bit hash. */
+static struct namerec *nht[256];
+
+/*
+ * Resolve a binary address to a host name, caching the result.
+ *
+ * addr: address bytes; len: their count; af: address family.
+ * An IPv4-mapped IPv6 address is looked up as its embedded IPv4 address.
+ *
+ * Returns the cached or freshly resolved name, or NULL if the address
+ * has no name, len is unusable, or memory is exhausted.  Failed lookups
+ * are cached too, so each address is queried at most once.
+ */
+char *resolve_address(char *addr, int len, int af)
+{
+	const unsigned char *bytes;
+	struct namerec *n;
+	struct hostent *h_ent;
+	unsigned hash;
+	int i;
+	static int notfirst;
+
+
+	if (af == AF_INET6 && ((__u32*)addr)[0] == 0 &&
+	    ((__u32*)addr)[1] == 0 && ((__u32*)addr)[2] == htonl(0xffff)) {
+		af = AF_INET;
+		addr += 12;
+		len = 4;
+	}
+
+	/* The address is copied into the cache record below; refuse
+	 * lengths that cannot be stored there. */
+	if (len <= 0 || len > (int)sizeof(n->addr.data))
+		return NULL;
+
+	/* XOR the last (up to) four bytes as unsigned values so the result
+	 * always indexes nht[]; plain char may be negative, and addresses
+	 * shorter than four bytes must not be read before their start. */
+	bytes = (const unsigned char *)addr;
+	hash = 0;
+	for (i = (len > 4 ? len - 4 : 0); i < len; i++)
+		hash ^= bytes[i];
+
+	for (n = nht[hash]; n; n = n->next) {
+		if (n->addr.family == af &&
+		    n->addr.bytelen == len &&
+		    memcmp(n->addr.data, addr, len) == 0)
+			return n->name;
+	}
+	if ((n = malloc(sizeof(*n))) == NULL)
+		return NULL;
+	n->addr.family = af;
+	n->addr.bytelen = len;
+	n->name = NULL;
+	memcpy(n->addr.data, addr, len);
+	n->next = nht[hash];
+	nht[hash] = n;
+	if (++notfirst == 1)
+		sethostent(1);
+	fflush(stdout);
+
+	if ((h_ent = gethostbyaddr(addr, len, af)) != NULL)
+		n->name = strdup(h_ent->h_name);
+
+	/* Even if we fail, "negative" entry is remembered. */
+	return n->name;
+}
+#endif
+
+
+/*
+ * Format an address for display.
+ *
+ * When built with RESOLVE_HOSTNAMES and resolve_hosts is set, a host name
+ * from resolve_address() is returned if one is found; otherwise the
+ * address is formatted by rt_addr_n2a() into buf (buflen bytes).
+ * A len <= 0 is replaced by the default length of the address family.
+ */
+const char *format_host(int af, int len, void *addr, char *buf, int buflen)
+{
+#ifdef RESOLVE_HOSTNAMES
+ if (resolve_hosts) {
+ char *n;
+ /* Caller did not supply a length: derive it from the family. */
+ if (len <= 0) {
+ switch (af) {
+ case AF_INET:
+ len = 4;
+ break;
+ case AF_INET6:
+ len = 16;
+ break;
+ case AF_IPX:
+ len = 10;
+ break;
+#ifdef AF_DECnet
+ /* There is no apparent reason why host name lookup
+ should not work for DECnet as well. */
+ case AF_DECnet:
+ len = 2;
+ break;
+#endif
+ default: ;
+ }
+ }
+ /* Unknown family with no length: skip the lookup entirely. */
+ if (len > 0 &&
+ (n = resolve_address(addr, len, af)) != NULL)
+ return n;
+ }
+#endif
+ return rt_addr_n2a(af, len, addr, buf, buflen);
+}
+
+
+/*
+ * Format len bytes of str as colon-separated lowercase hex ("aa:bb:cc").
+ *
+ * buf/blen: output buffer and its size.  Output is truncated to as many
+ * whole bytes as fit and, when blen > 0, is always NUL-terminated; it
+ * never ends in a dangling ':'.
+ *
+ * Returns buf.
+ */
+__u8* hexstring_n2a(const __u8 *str, int len, __u8 *buf, int blen)
+{
+	__u8 *ptr = buf;
+	int i;
+
+	/* Yield an empty string when nothing gets formatted. */
+	if (blen > 0)
+		*ptr = 0;
+
+	for (i=0; i<len; i++) {
+		/* Room needed from ptr on: two digits and the terminator,
+		 * plus the separator for every byte but the first. */
+		int need = (i > 0) ? 4 : 3;
+
+		if (blen < need)
+			break;
+		if (i > 0) {
+			*ptr++ = ':';
+			blen--;
+		}
+		sprintf((char *)ptr, "%02x", str[i]);
+		ptr += 2;
+		blen -= 2;
+	}
+	return buf;
+}
+
+/*
+ * Parse colon-separated hex bytes ("aa:bb:cc") into buf.
+ *
+ * At most blen bytes are stored; extra components are parsed but
+ * dropped, and unused bytes of buf are zero-filled.  An empty component
+ * counts as 0.
+ *
+ * Returns buf, or NULL if str contains a non-hex character or a
+ * component larger than 0xff (buf may then be partially written).
+ */
+__u8* hexstring_a2n(const __u8 *str, __u8 *buf, int blen)
+{
+	int cnt = 0;
+
+	for (;;) {
+		unsigned acc;
+		char ch;
+
+		acc = 0;
+
+		while ((ch = *str) != ':' && ch != 0) {
+			if (ch >= '0' && ch <= '9')
+				ch -= '0';
+			else if (ch >= 'a' && ch <= 'f')
+				ch -= 'a'-10;
+			else if (ch >= 'A' && ch <= 'F')
+				ch -= 'A'-10;
+			else
+				return NULL;
+			acc = (acc<<4) + ch;
+			/* Check per digit: a long run of digits would wrap
+			 * acc and could slip past a check made afterwards. */
+			if (acc > 255)
+				return NULL;
+			str++;
+		}
+
+		if (cnt < blen) {
+			buf[cnt] = acc;
+			cnt++;
+		}
+		if (ch == 0)
+			break;
+		++str;
+	}
+	if (cnt < blen)
+		memset(buf+cnt, 0, blen-cnt);
+	return buf;
+}
diff --git a/misc/Makefile b/misc/Makefile
index e69de29b..685b0044 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -0,0 +1,37 @@
+# Object files of each tool built in this directory.
+SSOBJ=ss.o ssfilter.o
+NSTATOBJ=nstat.o
+IFSTATOBJ=ifstat.o
+RTACCTOBJ=rtacct.o
+ARPDOBJ=arpd.o
+RTSTATOBJ=rtstat.o
+
+ALLOBJ=$(SSOBJ) $(NSTATOBJ) $(IFSTATOBJ) $(RTACCTOBJ) $(ARPDOBJ) $(RTSTATOBJ)
+TARGETS=ss nstat ifstat rtacct arpd rtstat
+
+# CC, CFLAGS, LDFLAGS, LIBUTIL, LIBNETLINK, DESTDIR and SBINDIR are not set
+# here; presumably they come from the including/parent Makefile.
+all: $(TARGETS)
+
+# No recipe given for ss: presumably linked by make's built-in rule.
+ss: $(SSOBJ) $(LIBUTIL)
+
+nstat: $(NSTATOBJ)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o nstat $(NSTATOBJ) -lm
+
+ifstat: $(IFSTATOBJ)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o ifstat $(IFSTATOBJ) $(LIBNETLINK) -lm
+
+rtacct: $(RTACCTOBJ)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o rtacct $(RTACCTOBJ) $(LIBNETLINK) -lm
+
+# arpd additionally links against libdb (-ldb).
+arpd: $(ARPDOBJ)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o arpd $(ARPDOBJ) $(LIBNETLINK) -ldb
+
+rtstat: $(RTSTATOBJ)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o rtstat $(RTSTATOBJ)
+
+# ssfilter.c is generated from the grammar; "clean" removes it again.
+ssfilter.c: ssfilter.y
+ bison ssfilter.y -o ssfilter.c
+
+# Install the stripped binaries into $(DESTDIR)$(SBINDIR).
+install: all
+ install -m 0755 -s $(TARGETS) $(DESTDIR)$(SBINDIR)
+
+clean:
+ rm -f $(ALLOBJ) $(TARGETS) ssfilter.c
diff --git a/misc/arpd.c b/misc/arpd.c
index e69de29b..4590dafc 100644
--- a/misc/arpd.c
+++ b/misc/arpd.c
@@ -0,0 +1,846 @@
+/*
+ * arpd.c ARP helper daemon.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <syslog.h>
+#include <malloc.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <netdb.h>
+#include <db.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/uio.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <linux/if_packet.h>
+#include <linux/filter.h>
+
+#include "libnetlink.h"
+#include "utils.h"
+
+int resolve_hosts;
+
+/* Database handle and the default path of the database file. */
+DB *dbase;
+char *dbname = "/var/lib/arpd/arpd.db";
+
+/* Interfaces named on the command line: count, indices and names.
+ * ifnum == 0 means no restriction (see handle_if). */
+int ifnum;
+int *ifvec;
+char **ifnames;
+
+/* Database key: interface index plus IPv4 address. */
+struct dbkey
+{
+ __u32 iface;
+ __u32 addr;
+};
+
+/* Negative ("host did not answer") entries are 6-byte records:
+ * byte 0 is 0xFF, byte 1 a counter, bytes 2..5 a big-endian timestamp
+ * in seconds (see prepare_neg_entry). */
+#define IS_NEG(x) (((__u8*)(x))[0] == 0xFF)
+#define NEG_TIME(x) (((x)[2]<<24)|((x)[3]<<16)|((x)[4]<<8)|(x)[5])
+#define NEG_AGE(x) ((__u32)time(NULL) - NEG_TIME((__u8*)x))
+#define NEG_VALID(x) (NEG_AGE(x) < negative_timeout)
+#define NEG_CNT(x) (((__u8*)(x))[1])
+
+struct rtnl_handle rth;
+
+/* pset[0].fd is the socket used to send and receive ARP packets. */
+struct pollfd pset[2];
+int udp_sock = -1;
+
+/* Flags set by the signal handlers. */
+volatile int do_exit;
+volatile int do_sync;
+volatile int do_stats;
+
+/* Event counters reported through syslog by send_stats(). */
+struct {
+ unsigned long arp_new;
+ unsigned long arp_change;
+
+ unsigned long app_recv;
+ unsigned long app_success;
+ unsigned long app_bad;
+ unsigned long app_neg;
+ unsigned long app_suppressed;
+
+ unsigned long kern_neg;
+ unsigned long kern_new;
+ unsigned long kern_change;
+
+ unsigned long probes_sent;
+ unsigned long probes_suppressed;
+} stats;
+
+int active_probing; /* -a N */
+int negative_timeout = 60; /* -n N; compared with NEG_AGE, i.e. seconds */
+int no_kernel_broadcasts; /* -k */
+int broadcast_rate = 1000; /* -R: stored as 1000/N; cost of one probe in queue_active_probe */
+int broadcast_burst = 3000; /* -B: stored as 1000*N; bucket cap in queue_active_probe */
+
+/* Print the command-line synopsis on stderr and exit with status 1. */
+void usage(void)
+{
+ fprintf(stderr,
+"Usage: arpd [ -lk ] [ -a N ] [ -b dbase ] [ -f file ] [ interfaces ]\n");
+ exit(1);
+}
+
+/*
+ * Tell whether interface ifindex is one arpd should handle.
+ * Returns 1 when no interface list was configured or ifindex is in
+ * ifvec, 0 otherwise.
+ */
+int handle_if(int ifindex)
+{
+	int idx = 0;
+
+	/* An empty list means "all interfaces". */
+	if (!ifnum)
+		return 1;
+
+	while (idx < ifnum) {
+		if (ifvec[idx] == ifindex)
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+int sysctl_adjusted;
+
+/*
+ * Tune the per-interface neighbour sysctls for every configured
+ * interface and mark them as adjusted.
+ *
+ * With active probing enabled, mcast_solicit is set to 0 when kernel
+ * broadcasts are disabled, else to 1 (active_probing >= 2) or
+ * 3 - active_probing.  app_solicit is set to 1 when active_probing <= 1,
+ * else to active_probing.  Files that cannot be opened are skipped.
+ * Does nothing when no interfaces were given.
+ */
+void do_sysctl_adjustments(void)
+{
+	int i;
+
+	if (!ifnum)
+		return;
+
+	for (i=0; i<ifnum; i++) {
+		char path[128];
+		FILE *fp;
+		int n;
+
+		if (active_probing) {
+			/* Names come from the command line: bound the write
+			 * and skip a path that does not fit. */
+			n = snprintf(path, sizeof(path), "/proc/sys/net/ipv4/neigh/%s/mcast_solicit", ifnames[i]);
+			if (n > 0 && n < (int)sizeof(path) &&
+			    (fp = fopen(path, "w")) != NULL) {
+				if (no_kernel_broadcasts)
+					fputs("0\n", fp);
+				else
+					fprintf(fp, "%d\n", active_probing>=2 ? 1 : 3-active_probing);
+				fclose(fp);
+			}
+		}
+
+		n = snprintf(path, sizeof(path), "/proc/sys/net/ipv4/neigh/%s/app_solicit", ifnames[i]);
+		if (n > 0 && n < (int)sizeof(path) &&
+		    (fp = fopen(path, "w")) != NULL) {
+			fprintf(fp, "%d\n", active_probing<=1 ? 1 : active_probing);
+			fclose(fp);
+		}
+	}
+	sysctl_adjusted = 1;
+}
+
+/*
+ * Revert do_sysctl_adjustments(): for every configured interface write
+ * 3 to mcast_solicit (only when active probing is enabled) and 0 to
+ * app_solicit, then clear the adjusted flag.  Does nothing if no
+ * adjustment was made.  Files that cannot be opened are skipped.
+ */
+void undo_sysctl_adjustments(void)
+{
+	int i;
+
+	if (!sysctl_adjusted)
+		return;
+
+	for (i=0; i<ifnum; i++) {
+		char path[128];
+		FILE *fp;
+		int n;
+
+		if (active_probing) {
+			/* Names come from the command line: bound the write
+			 * and skip a path that does not fit. */
+			n = snprintf(path, sizeof(path), "/proc/sys/net/ipv4/neigh/%s/mcast_solicit", ifnames[i]);
+			if (n > 0 && n < (int)sizeof(path) &&
+			    (fp = fopen(path, "w")) != NULL) {
+				fputs("3\n", fp);
+				fclose(fp);
+			}
+		}
+		n = snprintf(path, sizeof(path), "/proc/sys/net/ipv4/neigh/%s/app_solicit", ifnames[i]);
+		if (n > 0 && n < (int)sizeof(path) &&
+		    (fp = fopen(path, "w")) != NULL) {
+			fputs("0\n", fp);
+			fclose(fp);
+		}
+	}
+	sysctl_adjusted = 0;
+}
+
+
+/*
+ * Broadcast an ARP request for IPv4 address addr on interface ifindex.
+ *
+ * The interface must be of type ARPHRD_ETHER.  The sender IP address is
+ * obtained by binding udp_sock to the interface, connecting it to addr
+ * and reading back the local address; the packet itself is sent on
+ * pset[0].fd.
+ *
+ * Returns 0 and counts the probe on success, -1 on any failure.
+ */
+int send_probe(int ifindex, __u32 addr)
+{
+	struct ifreq ifr;
+	struct sockaddr_in dst;
+	socklen_t dstlen;
+	int len;
+	unsigned char buf[256];
+	struct arphdr *ah = (struct arphdr*)buf;
+	unsigned char *p = (unsigned char *)(ah+1);
+	struct sockaddr_ll sll;
+
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_ifindex = ifindex;
+	if (ioctl(udp_sock, SIOCGIFNAME, &ifr))
+		return -1;
+	if (ioctl(udp_sock, SIOCGIFHWADDR, &ifr))
+		return -1;
+	if (ifr.ifr_hwaddr.sa_family != ARPHRD_ETHER)
+		return -1;
+	if (setsockopt(udp_sock, SOL_SOCKET, SO_BINDTODEVICE, ifr.ifr_name, strlen(ifr.ifr_name)+1) < 0)
+		return -1;
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sin_family = AF_INET;
+	dst.sin_port = htons(1025);
+	dst.sin_addr.s_addr = addr;
+	if (connect(udp_sock, (struct sockaddr*)&dst, sizeof(dst)) < 0)
+		return -1;
+	/* getsockname() takes a socklen_t, not an int. */
+	dstlen = sizeof(dst);
+	if (getsockname(udp_sock, (struct sockaddr*)&dst, &dstlen) < 0)
+		return -1;
+
+	ah->ar_hrd = htons(ifr.ifr_hwaddr.sa_family);
+	ah->ar_pro = htons(ETH_P_IP);
+	ah->ar_hln = 6;
+	ah->ar_pln = 4;
+	ah->ar_op = htons(ARPOP_REQUEST);
+
+	/* Sender hardware address, then sender IP address. */
+	memcpy(p, ifr.ifr_hwaddr.sa_data, ah->ar_hln);
+	p += ah->ar_hln;
+
+	memcpy(p, &dst.sin_addr, 4);
+	p+=4;
+
+	/* Zero the whole link-level address first so that no field is
+	 * handed to sendto() uninitialised. */
+	memset(&sll, 0, sizeof(sll));
+	sll.sll_family = AF_PACKET;
+	memset(sll.sll_addr, 0xFF, sizeof(sll.sll_addr));
+	sll.sll_halen = ah->ar_hln;
+	sll.sll_ifindex = ifindex;
+	sll.sll_protocol = htons(ETH_P_ARP);
+
+	/* Target hardware address (all ones), then target IP address. */
+	memcpy(p, &sll.sll_addr, ah->ar_hln);
+	p+=ah->ar_hln;
+
+	memcpy(p, &addr, 4);
+	p+=4;
+
+	len = sendto(pset[0].fd, buf, p-buf, 0, (struct sockaddr*)&sll, sizeof(sll));
+	if (len < 0)
+		return -1;
+	stats.probes_sent++;
+	return 0;
+}
+
+/* Be very tough on sending probes: 1 per second with burst of 3. */
+
+/*
+ * Send an ARP probe for addr on ifindex, subject to a token-bucket rate
+ * limit: elapsed milliseconds are credited to 'buckets' (capped at
+ * broadcast_burst) and every probe sent costs broadcast_rate.
+ *
+ * Returns 0 if a probe was sent, -1 if it was suppressed or sending
+ * failed (both are counted as probes_suppressed).
+ */
+int queue_active_probe(int ifindex, __u32 addr)
+{
+ static struct timeval prev;
+ static int buckets;
+ struct timeval now;
+
+ gettimeofday(&now, NULL);
+ if (prev.tv_sec) {
+ /* Milliseconds since the last probe that was actually sent. */
+ int diff = (now.tv_sec-prev.tv_sec)*1000+(now.tv_usec-prev.tv_usec)/1000;
+ buckets += diff;
+ } else {
+ /* No probe sent yet: start with a full bucket. */
+ buckets = broadcast_burst;
+ }
+ if (buckets > broadcast_burst)
+ buckets = broadcast_burst;
+ if (buckets >= broadcast_rate && !send_probe(ifindex, addr)) {
+ buckets -= broadcast_rate;
+ prev = now;
+ return 0;
+ }
+ /* NOTE(review): prev is not advanced on this path, so the same elapsed
+ * interval is credited again on the next call - confirm this is intended. */
+ stats.probes_suppressed++;
+ return -1;
+}
+
+/*
+ * Send the kernel an RTM_NEWNEIGH message that sets the link-layer
+ * address lla (llalen bytes) for IPv4 address addr on ifindex, in state
+ * NUD_STALE.
+ *
+ * Returns 0 if rtnl_send() reported a positive result, 1 otherwise.
+ */
+int respond_to_kernel(int ifindex, __u32 addr, char *lla, int llalen)
+{
+	struct {
+		struct nlmsghdr n;
+		struct ndmsg ndm;
+		char buf[256];
+	} req;
+	int sent;
+
+	/* Only the two headers are cleared; attributes are appended below. */
+	memset(&req.n, 0, sizeof(req.n));
+	memset(&req.ndm, 0, sizeof(req.ndm));
+
+	req.n.nlmsg_type = RTM_NEWNEIGH;
+	req.n.nlmsg_flags = NLM_F_REQUEST;
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
+
+	req.ndm.ndm_family = AF_INET;
+	req.ndm.ndm_ifindex = ifindex;
+	req.ndm.ndm_state = NUD_STALE;
+	req.ndm.ndm_type = RTN_UNICAST;
+
+	addattr_l(&req.n, sizeof(req), NDA_DST, &addr, 4);
+	addattr_l(&req.n, sizeof(req), NDA_LLADDR, lla, llalen);
+
+	sent = rtnl_send(&rth, (char*)&req, req.n.nlmsg_len);
+	return sent <= 0;
+}
+
+/*
+ * Fill the 6-byte buffer ndata with a negative entry: marker 0xFF,
+ * counter 0, then stamp as four bytes, most significant first.
+ */
+void prepare_neg_entry(__u8 *ndata, __u32 stamp)
+{
+	int shift;
+	int pos = 2;
+
+	ndata[0] = 0xFF;
+	ndata[1] = 0;
+
+	/* Bytes 2..5: stamp in big-endian order. */
+	for (shift = 24; shift >= 0; shift -= 8)
+		ndata[pos++] = stamp >> shift;
+}
+
+
+/*
+ * Process one netlink message from the kernel.
+ *
+ * NLMSG_DONE ends the initial dump: the database is synced and the
+ * sysctls are adjusted.  RTM_GETNEIGH requests are answered from the
+ * database or trigger an active probe; RTM_NEWNEIGH notifications update
+ * the database.  Anything else, and neighbours that are not unicast
+ * AF_INET entries on a handled interface, are ignored.
+ *
+ * Returns -1 if the message is too short to hold a struct ndmsg,
+ * 0 otherwise.
+ */
+int do_one_request(struct nlmsghdr *n)
+{
+ struct ndmsg *ndm = NLMSG_DATA(n);
+ int len = n->nlmsg_len;
+ struct rtattr * tb[NDA_MAX+1];
+ struct dbkey key;
+ DBT dbkey, dbdat;
+ int do_acct = 0;
+
+ if (n->nlmsg_type == NLMSG_DONE) {
+ dbase->sync(dbase, 0);
+
+ /* Now we have at least mirror of kernel db, so that
+ * may start real resolution.
+ */
+ do_sysctl_adjustments();
+ return 0;
+ }
+
+ if (n->nlmsg_type != RTM_GETNEIGH && n->nlmsg_type != RTM_NEWNEIGH)
+ return 0;
+
+ /* len becomes the length of the attributes following the ndmsg. */
+ len -= NLMSG_LENGTH(sizeof(*ndm));
+ if (len < 0)
+ return -1;
+
+ if (ndm->ndm_family != AF_INET ||
+ (ifnum && !handle_if(ndm->ndm_ifindex)) ||
+ ndm->ndm_flags ||
+ ndm->ndm_type != RTN_UNICAST ||
+ !(ndm->ndm_state&~NUD_NOARP))
+ return 0;
+
+ memset(tb, 0, sizeof(tb));
+ parse_rtattr(tb, NDA_MAX, NDA_RTA(ndm), len);
+
+ if (!tb[NDA_DST])
+ return 0;
+
+ key.iface = ndm->ndm_ifindex;
+ memcpy(&key.addr, RTA_DATA(tb[NDA_DST]), 4);
+ dbkey.data = &key;
+ dbkey.size = sizeof(key);
+
+ /* A failed lookup is represented by dbdat.data == NULL below. */
+ if (dbase->get(dbase, &dbkey, &dbdat, 0) != 0) {
+ dbdat.data = 0;
+ dbdat.size = 0;
+ }
+
+ if (n->nlmsg_type == RTM_GETNEIGH) {
+ if (!(n->nlmsg_flags&NLM_F_REQUEST))
+ return 0;
+
+ if (!(ndm->ndm_state&(NUD_PROBE|NUD_INCOMPLETE))) {
+ stats.app_bad++;
+ return 0;
+ }
+
+ if (ndm->ndm_state&NUD_PROBE) {
+ /* If we get this, kernel still has some valid
+ * address, but unicast probing failed and host
+ * is either dead or changed its mac address.
+ * Kernel is going to initiate broadcast resolution.
+ * OK, we invalidate our information as well.
+ */
+ if (dbdat.data && !IS_NEG(dbdat.data))
+ stats.app_neg++;
+
+ dbase->del(dbase, &dbkey, 0);
+ } else {
+ /* If we get this, the kernel does not have any information.
+ * If we have something, tell it to the kernel. */
+ stats.app_recv++;
+ if (dbdat.data && !IS_NEG(dbdat.data)) {
+ stats.app_success++;
+ respond_to_kernel(key.iface, key.addr, dbdat.data, dbdat.size);
+ return 0;
+ }
+
+ /* We have no address to report. */
+ /* If we have a recent negative entry, stay silent. */
+ if (dbdat.data && NEG_VALID(dbdat.data)) {
+ if (NEG_CNT(dbdat.data) >= active_probing) {
+ stats.app_suppressed++;
+ return 0;
+ }
+ do_acct = 1;
+ }
+ }
+
+ /* Probe ourselves; when a valid negative entry exists, count the
+ * probe in it and store the entry back. */
+ if (active_probing &&
+ queue_active_probe(ndm->ndm_ifindex, key.addr) == 0 &&
+ do_acct) {
+ NEG_CNT(dbdat.data)++;
+ dbase->put(dbase, &dbkey, &dbdat, 0);
+ }
+ } else if (n->nlmsg_type == RTM_NEWNEIGH) {
+ if (n->nlmsg_flags&NLM_F_REQUEST)
+ return 0;
+
+ if (ndm->ndm_state&NUD_FAILED) {
+ /* Kernel was not able to resolve. Host is dead.
+ * Create negative entry if it is not present
+ * or renew it if it is too old. */
+ if (!dbdat.data ||
+ !IS_NEG(dbdat.data) ||
+ !NEG_VALID(dbdat.data)) {
+ __u8 ndata[6];
+ stats.kern_neg++;
+ prepare_neg_entry(ndata, time(NULL));
+ dbdat.data = ndata;
+ dbdat.size = sizeof(ndata);
+ dbase->put(dbase, &dbkey, &dbdat, 0);
+ }
+ } else if (tb[NDA_LLADDR]) {
+ if (dbdat.data && !IS_NEG(dbdat.data)) {
+ /* NOTE(review): compares dbdat.size bytes without checking the
+ * attribute payload is that long - confirm sizes always match. */
+ if (memcmp(RTA_DATA(tb[NDA_LLADDR]), dbdat.data, dbdat.size) == 0)
+ return 0;
+ stats.kern_change++;
+ } else {
+ stats.kern_new++;
+ }
+ dbdat.data = RTA_DATA(tb[NDA_LLADDR]);
+ dbdat.size = RTA_PAYLOAD(tb[NDA_LLADDR]);
+ dbase->put(dbase, &dbkey, &dbdat, 0);
+ }
+ }
+ return 0;
+}
+
+/* Ask the kernel for a dump of all AF_INET neighbour entries. */
+void load_initial_table(void)
+{
+ rtnl_wilddump_request(&rth, AF_INET, RTM_GETNEIGH);
+}
+
+/*
+ * Read one datagram from the netlink socket without blocking and hand
+ * each netlink message in it to do_one_request().  Datagrams whose
+ * sender address has an unexpected size or a non-zero nl_pid are
+ * dropped; processing stops at the first malformed or rejected message.
+ */
+void get_kern_msg(void)
+{
+ int status;
+ struct nlmsghdr *h;
+ struct sockaddr_nl nladdr;
+ struct iovec iov;
+ char buf[8192];
+ /* Positional initializer: name, namelen, iov, iovlen, control,
+ * controllen, flags. */
+ struct msghdr msg = {
+ (void*)&nladdr, sizeof(nladdr),
+ &iov, 1,
+ NULL, 0,
+ 0
+ };
+
+ memset(&nladdr, 0, sizeof(nladdr));
+
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+
+ status = recvmsg(rth.fd, &msg, MSG_DONTWAIT);
+
+ if (status <= 0)
+ return;
+
+ if (msg.msg_namelen != sizeof(nladdr))
+ return;
+
+ /* Accept only messages whose sender nl_pid is zero. */
+ if (nladdr.nl_pid)
+ return;
+
+ for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) {
+ int len = h->nlmsg_len;
+ int l = len - sizeof(*h);
+
+ /* Header claims less than itself or more than was received. */
+ if (l < 0 || len > status)
+ return;
+
+ if (do_one_request(h) < 0)
+ return;
+
+ /* Step to the next aligned message. */
+ status -= NLMSG_ALIGN(len);
+ h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+ }
+}
+
+/*
+ * Receive one ARP packet from pset[0].fd without blocking and record the
+ * sender's hardware address in the database under (interface, sender
+ * IP).  Packets from unhandled interfaces, malformed packets and packets
+ * with a zero sender IP are ignored.
+ */
+void get_arp_pkt(void)
+{
+ unsigned char buf[1024];
+ struct sockaddr_ll sll;
+ int sll_len = sizeof(sll);
+ struct arphdr *a = (struct arphdr*)buf;
+ struct dbkey key;
+ DBT dbkey, dbdat;
+ int n;
+
+ n = recvfrom(pset[0].fd, buf, sizeof(buf), MSG_DONTWAIT, (struct sockaddr*)&sll, &sll_len);
+ if (n < 0) {
+ if (errno != EINTR && errno != EAGAIN)
+ syslog(LOG_ERR, "recvfrom: %m");
+ return;
+ }
+
+ if (ifnum && !handle_if(sll.sll_ifindex))
+ return;
+
+ /* Sanity checks: an IPv4 ARP request or reply whose hardware address
+ * length matches the link and whose four addresses fit in the packet. */
+
+ if (n < sizeof(*a) ||
+ (a->ar_op != htons(ARPOP_REQUEST) &&
+ a->ar_op != htons(ARPOP_REPLY)) ||
+ a->ar_pln != 4 ||
+ a->ar_pro != htons(ETH_P_IP) ||
+ a->ar_hln != sll.sll_halen ||
+ sizeof(*a) + 2*4 + 2*a->ar_hln > n)
+ return;
+
+ key.iface = sll.sll_ifindex;
+ /* The sender IP address follows the sender hardware address. */
+ memcpy(&key.addr, (char*)(a+1) + a->ar_hln, 4);
+
+ /* DAD message, ignore. */
+ if (key.addr == 0)
+ return;
+
+ dbkey.data = &key;
+ dbkey.size = sizeof(key);
+
+ if (dbase->get(dbase, &dbkey, &dbdat, 0) == 0 && !IS_NEG(dbdat.data)) {
+ /* NOTE(review): compares dbdat.size bytes, which is not checked
+ * against a->ar_hln - confirm stored entries always match. */
+ if (memcmp(dbdat.data, a+1, dbdat.size) == 0)
+ return;
+ stats.arp_change++;
+ } else {
+ stats.arp_new++;
+ }
+
+ /* Store the sender hardware address, which starts right after the header. */
+ dbdat.data = a+1;
+ dbdat.size = a->ar_hln;
+ dbase->put(dbase, &dbkey, &dbdat, 0);
+}
+
+/*
+ * Install handler for signal sig via sigaction(), with an empty mask and
+ * with SA_INTERRUPT set where the platform defines it.  The result of
+ * sigaction() is not checked.
+ */
+void catch_signal(int sig, void (*handler)(int))
+{
+ struct sigaction sa;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = handler;
+#ifdef SA_INTERRUPT
+ sa.sa_flags = SA_INTERRUPT;
+#endif
+ sigaction(sig, &sa, NULL);
+}
+
+#include <setjmp.h>
+/* Jump target used by the handlers below when in_poll is set; it is
+ * presumably armed with sigsetjmp() around poll() in main - confirm. */
+sigjmp_buf env;
+volatile int in_poll;
+
+/* Signal handler: request termination. */
+void sig_exit(int signo)
+{
+ do_exit = 1;
+ if (in_poll)
+ siglongjmp(env, 1);
+}
+
+/* Signal handler: request a database sync. */
+void sig_sync(int signo)
+{
+ do_sync = 1;
+ if (in_poll)
+ siglongjmp(env, 1);
+}
+
+/* Signal handler: request a database sync and a statistics report. */
+void sig_stats(int signo)
+{
+ do_sync = 1;
+ do_stats = 1;
+ if (in_poll)
+ siglongjmp(env, 1);
+}
+
+/* Log all counters of 'stats' to syslog at LOG_INFO in two messages and
+ * clear the do_stats request flag. */
+void send_stats(void)
+{
+ syslog(LOG_INFO, "arp_rcv: n%lu c%lu app_rcv: tot %lu hits %lu bad %lu neg %lu sup %lu",
+ stats.arp_new, stats.arp_change,
+
+ stats.app_recv, stats.app_success,
+ stats.app_bad, stats.app_neg, stats.app_suppressed
+ );
+ syslog(LOG_INFO, "kern: n%lu c%lu neg %lu arp_send: %lu rlim %lu",
+ stats.kern_new, stats.kern_change, stats.kern_neg,
+
+ stats.probes_sent, stats.probes_suppressed
+ );
+ do_stats = 0;
+}
+
+
+/*
+ * arpd entry point.
+ *
+ * Parses the command line, resolves any interface names given as positional
+ * arguments to interface indices (ifvec), opens the database and then either
+ *   - loads entries from a text file (-f) and/or lists the database (-l)
+ *     and exits, or
+ *   - binds a packet socket for ARP, opens an rtnetlink socket for neighbour
+ *     events, detaches from the terminal and serves events until a signal
+ *     sets do_exit.
+ *
+ * Exits with status 0 on normal termination and -1 on any setup error.
+ */
+int main(int argc, char **argv)
+{
+	int opt;
+	int do_list = 0;
+	char *do_load = NULL;
+
+	while ((opt = getopt(argc, argv, "h?b:lf:a:n:kR:B:")) != EOF) {
+		switch (opt) {
+		case 'b':
+			dbname = optarg;
+			break;
+		case 'f':
+			if (do_load) {
+				fprintf(stderr, "Duplicate option -f\n");
+				usage();
+			}
+			do_load = optarg;
+			break;
+		case 'l':
+			do_list = 1;
+			break;
+		case 'a':
+			active_probing = atoi(optarg);
+			break;
+		case 'n':
+			negative_timeout = atoi(optarg);
+			break;
+		case 'k':
+			no_kernel_broadcasts = 1;
+			break;
+		case 'R':
+			/* the option value is converted to 1000/value and both must be positive */
+			if ((broadcast_rate = atoi(optarg)) <= 0 ||
+			    (broadcast_rate = 1000/broadcast_rate) <= 0) {
+				fprintf(stderr, "Invalid ARP rate\n");
+				exit(-1);
+			}
+			break;
+		case 'B':
+			/* the option value is scaled by 1000; a non-positive result is rejected */
+			if ((broadcast_burst = atoi(optarg)) <= 0 ||
+			    (broadcast_burst = 1000*broadcast_burst) <= 0) {
+				fprintf(stderr, "Invalid ARP burst\n");
+				exit(-1);
+			}
+			break;
+		case 'h':
+		case '?':
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* Remaining arguments are interface names. */
+	if (argc > 0) {
+		ifnum = argc;
+		ifnames = argv;
+		ifvec = malloc(argc*sizeof(int));
+		if (!ifvec) {
+			perror("malloc");
+			exit(-1);
+		}
+	}
+
+	if ((udp_sock = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
+		perror("socket");
+		exit(-1);
+	}
+
+	if (ifnum) {
+		int i;
+		struct ifreq ifr;
+		memset(&ifr, 0, sizeof(ifr));
+		for (i=0; i<ifnum; i++) {
+			/*
+			 * strncpy() does not terminate the destination when the
+			 * source fills it, so copy at most IFNAMSIZ-1 bytes and
+			 * set the final byte explicitly.
+			 */
+			strncpy(ifr.ifr_name, ifnames[i], IFNAMSIZ-1);
+			ifr.ifr_name[IFNAMSIZ-1] = 0;
+			if (ioctl(udp_sock, SIOCGIFINDEX, &ifr)) {
+				perror("ioctl(SIOCGIFINDEX)");
+				exit(-1);
+			}
+			ifvec[i] = ifr.ifr_ifindex;
+		}
+	}
+
+	dbase = dbopen(dbname, O_CREAT|O_RDWR, 0644, DB_HASH, NULL);
+	if (dbase == NULL) {
+		perror("db_open");
+		exit(-1);
+	}
+
+	if (do_load) {
+		char buf[128];
+		FILE *fp;
+		struct dbkey k;
+		DBT dbkey, dbdat;
+
+		dbkey.data = &k;
+		dbkey.size = sizeof(k);
+
+		/* "-" and "--" both mean standard input */
+		if (strcmp(do_load, "-") == 0 || strcmp(do_load, "--") == 0) {
+			fp = stdin;
+		} else if ((fp = fopen(do_load, "r")) == NULL) {
+			perror("fopen");
+			goto do_abort;
+		}
+
+		buf[sizeof(buf)-1] = 0;
+		while (fgets(buf, sizeof(buf)-1, fp)) {
+			__u8 b1[6];
+			char ipbuf[128];
+			char macbuf[128];
+
+			if (buf[0] == '#')
+				continue;
+
+			/* each record is: ifindex, IP address, MAC address */
+			if (sscanf(buf, "%u%s%s", &k.iface, ipbuf, macbuf) != 3) {
+				fprintf(stderr, "Wrong format of input file \"%s\"\n", do_load);
+				goto do_abort;
+			}
+			/* lines in the "FAILED:" form printed by -l are skipped */
+			if (strncmp(macbuf, "FAILED:", 7) == 0)
+				continue;
+			if (!inet_aton(ipbuf, (struct in_addr*)&k.addr)) {
+				fprintf(stderr, "Invalid IP address: \"%s\"\n", ipbuf);
+				goto do_abort;
+			}
+			dbdat.data = hexstring_a2n(macbuf, b1, 6);
+			if (dbdat.data == NULL)
+				goto do_abort;
+			dbdat.size = 6;
+
+			if (dbase->put(dbase, &dbkey, &dbdat, 0)) {
+				perror("hash->put");
+				goto do_abort;
+			}
+		}
+		dbase->sync(dbase, 0);
+		if (fp != stdin)
+			fclose(fp);
+	}
+
+	if (do_list) {
+		DBT dbkey, dbdat;
+		printf("%-8s %-15s %s\n", "#Ifindex", "IP", "MAC");
+		while (dbase->seq(dbase, &dbkey, &dbdat, R_NEXT) == 0) {
+			struct dbkey *key = dbkey.data;
+			if (handle_if(key->iface)) {
+				if (!IS_NEG(dbdat.data)) {
+					__u8 b1[18];
+					printf("%-8d %-15s %s\n",
+					       key->iface,
+					       inet_ntoa(*(struct in_addr*)&key->addr),
+					       hexstring_n2a(dbdat.data, 6, b1, 18));
+				} else {
+					printf("%-8d %-15s FAILED: %dsec ago\n",
+					       key->iface,
+					       inet_ntoa(*(struct in_addr*)&key->addr),
+					       NEG_AGE(dbdat.data));
+				}
+			}
+		}
+	}
+
+	/* -f and -l are one-shot operations; do not start the daemon. */
+	if (do_load || do_list)
+		goto out;
+
+	pset[0].fd = socket(PF_PACKET, SOCK_DGRAM, 0);
+	if (pset[0].fd < 0) {
+		perror("socket");
+		exit(-1);
+	}
+
+	if (1) {
+		struct sockaddr_ll sll;
+		memset(&sll, 0, sizeof(sll));
+		sll.sll_family = AF_PACKET;
+		sll.sll_protocol = htons(ETH_P_ARP);
+		/* bind to the interface only when exactly one was named */
+		sll.sll_ifindex = (ifnum == 1 ? ifvec[0] : 0);
+		if (bind(pset[0].fd, (struct sockaddr*)&sll, sizeof(sll)) < 0) {
+			perror("bind");
+			goto do_abort;
+		}
+	}
+
+	if (rtnl_open(&rth, RTMGRP_NEIGH) < 0) {
+		perror("rtnl_open");
+		goto do_abort;
+	}
+	pset[1].fd = rth.fd;
+
+	load_initial_table();
+
+	/* Detach: parent exits, child continues with stdio on /dev/null. */
+	if (1) {
+		int fd;
+		pid_t pid = fork();
+
+		if (pid > 0)
+			_exit(0);
+		if (pid < 0) {
+			perror("arpd: fork");
+			goto do_abort;
+		}
+
+		chdir("/");
+		fd = open("/dev/null", O_RDWR);
+		if (fd >= 0) {
+			dup2(fd, 0);
+			dup2(fd, 1);
+			dup2(fd, 2);
+			if (fd > 2)
+				close(fd);
+		}
+		setsid();
+	}
+
+	openlog("arpd", LOG_PID | LOG_CONS, LOG_DAEMON);
+	catch_signal(SIGINT, sig_exit);
+	catch_signal(SIGTERM, sig_exit);
+	catch_signal(SIGHUP, sig_sync);
+	catch_signal(SIGUSR1, sig_stats);
+
+#define EVENTS (POLLIN|POLLPRI|POLLERR|POLLHUP)
+	pset[0].events = EVENTS;
+	pset[0].revents = 0;
+	pset[1].events = EVENTS;
+	pset[1].revents = 0;
+
+	/* The signal handlers siglongjmp() back here when in_poll is set. */
+	sigsetjmp(env, 1);
+
+	for (;;) {
+		in_poll = 1;
+
+		if (do_exit)
+			break;
+		if (do_sync) {
+			/* keep handlers from jumping out of the database sync */
+			in_poll = 0;
+			dbase->sync(dbase, 0);
+			do_sync = 0;
+			in_poll = 1;
+		}
+		if (do_stats)
+			send_stats();
+		if (poll(pset, 2, 30000) > 0) {
+			in_poll = 0;
+			if (pset[0].revents&EVENTS)
+				get_arp_pkt();
+			if (pset[1].revents&EVENTS)
+				get_kern_msg();
+		} else {
+			/* timeout (or poll error): schedule a database sync */
+			do_sync = 1;
+		}
+	}
+
+	undo_sysctl_adjustments();
+out:
+	dbase->close(dbase);
+	exit(0);
+
+do_abort:
+	dbase->close(dbase);
+	exit(-1);
+}
diff --git a/misc/ifstat.c b/misc/ifstat.c
index e69de29b..67489b9a 100644
--- a/misc/ifstat.c
+++ b/misc/ifstat.c
@@ -0,0 +1,729 @@
+/*
+ * ifstat.c handy utility to read net interface statistics
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+#include <fnmatch.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <math.h>
+
+#include <libnetlink.h>
+#include <linux/netdevice.h>
+
+#include <SNAPSHOT.h>
+
+int dump_zeros = 0;
+int reset_history = 0;
+int ignore_history = 0;
+int no_output = 0;
+int no_update = 0;
+int scan_interval = 0;
+int time_constant = 0;
+int show_errors = 0;
+double W;
+char **patterns;
+int npatterns;
+
+char info_source[128];
+int source_mismatch;
+
+#define MAXS (sizeof(struct net_device_stats)/sizeof(unsigned long))
+
+struct ifstat_ent /* one interface's statistics; entries form singly linked lists (kern_db, hist_db) */
+{
+ struct ifstat_ent *next; /* next entry in the list, NULL at the end */
+ char *name; /* interface name, heap copy made with strdup() */
+ int ifindex; /* interface index; the key used to pair entries across lists */
+ unsigned long long val[MAXS]; /* running totals; update_db() adds the delta between ival samples */
+ double rate[MAXS]; /* rate estimate per counter, maintained by update_db() */
+ unsigned long ival[MAXS]; /* most recent raw counter sample */
+};
+
+struct ifstat_ent *kern_db;
+struct ifstat_ent *hist_db;
+
+/*
+ * Decide whether interface name ID is selected.
+ * Returns 1 when no patterns were supplied, or when ID matches at least one
+ * entry of the global patterns[] array under fnmatch(); returns 0 otherwise.
+ */
+int match(char *id)
+{
+	int idx = 0;
+
+	if (npatterns == 0)
+		return 1;
+
+	while (idx < npatterns) {
+		if (fnmatch(patterns[idx], id, 0) == 0)
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Dump-filter callback handed to rtnl_dump_filter() by load_info().
+ *
+ * For every RTM_NEWLINK message describing an interface that is IFF_UP and
+ * carries both IFLA_IFNAME and IFLA_STATS attributes, allocate an ifstat_ent,
+ * fill it from the message and push it on the front of kern_db.
+ *
+ * Returns 0 when the message was handled or ignored, -1 when it is too short
+ * to contain an ifinfomsg.  Aborts when memory cannot be allocated.
+ * The who and arg parameters are unused.
+ */
+int get_nlmsg(struct sockaddr_nl *who, struct nlmsghdr *m, void *arg)
+{
+	struct ifinfomsg *ifi = NLMSG_DATA(m);
+	struct rtattr * tb[IFLA_MAX+1];
+	int len = m->nlmsg_len;
+	struct ifstat_ent *n;
+	size_t stats_len;
+	int i;
+
+	if (m->nlmsg_type != RTM_NEWLINK)
+		return 0;
+
+	len -= NLMSG_LENGTH(sizeof(*ifi));
+	if (len < 0)
+		return -1;
+
+	if (!(ifi->ifi_flags&IFF_UP))
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
+	if (tb[IFLA_IFNAME] == NULL || tb[IFLA_STATS] == NULL)
+		return 0;
+
+	n = malloc(sizeof(*n));
+	if (!n)
+		abort();
+	n->ifindex = ifi->ifi_index;
+	n->name = strdup(RTA_DATA(tb[IFLA_IFNAME]));
+	/* a NULL name would be dereferenced later by match() */
+	if (!n->name)
+		abort();
+
+	/*
+	 * Never read more than the attribute actually carries: zero the
+	 * sample array and copy the smaller of payload and array size.
+	 * NOTE(review): the payload is interpreted as an array of unsigned
+	 * long; presumably that only matches the kernel's layout where its
+	 * counters are unsigned-long sized -- confirm for 64-bit builds.
+	 */
+	stats_len = RTA_PAYLOAD(tb[IFLA_STATS]);
+	if (stats_len > sizeof(n->ival))
+		stats_len = sizeof(n->ival);
+	memset(&n->ival, 0, sizeof(n->ival));
+	memcpy(&n->ival, RTA_DATA(tb[IFLA_STATS]), stats_len);
+
+	memset(&n->rate, 0, sizeof(n->rate));
+	for (i=0; i<MAXS; i++)
+		n->val[i] = n->ival[i];
+	n->next = kern_db;
+	kern_db = n;
+	return 0;
+}
+
+/*
+ * Take a fresh sample of all interfaces from the kernel over rtnetlink.
+ * get_nlmsg() pushes each entry on the front of kern_db, so once the dump
+ * is complete the list is reversed in place.  Any failure to talk to the
+ * kernel terminates the program with status 1.
+ */
+void load_info(void)
+{
+	struct rtnl_handle rth;
+	struct ifstat_ent *pending, *ent;
+
+	if (rtnl_open(&rth, 0) < 0)
+		exit(1);
+
+	if (rtnl_wilddump_request(&rth, AF_INET, RTM_GETLINK) < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	if (rtnl_dump_filter(&rth, get_nlmsg, NULL, NULL, NULL) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	rtnl_close(&rth);
+
+	/* reverse kern_db */
+	pending = kern_db;
+	for (kern_db = NULL; pending != NULL; ) {
+		ent = pending;
+		pending = ent->next;
+		ent->next = kern_db;
+		kern_db = ent;
+	}
+}
+
+/*
+ * Cut the space-terminated field starting at S: overwrite the space with
+ * NUL and return the address just past it.  Aborts if S holds no space.
+ */
+static char *raw_field_cut(char *s)
+{
+	char *sp = strchr(s, ' ');
+
+	if (!sp)
+		abort();
+	*sp = 0;
+	return sp + 1;
+}
+
+/*
+ * Parse the text format written by dump_raw_db() from FP and put the
+ * entries, in file order, in front of whatever kern_db already holds.
+ *
+ * A line starting with '#' names the information source: it is compared
+ * with the current info_source (setting source_mismatch when they differ)
+ * and then stored there.  Every other line is
+ *     ifindex name { value rate }*MAXS
+ * with each field followed by one space.  Malformed input or allocation
+ * failure aborts the program.
+ */
+void load_raw_table(FILE *fp)
+{
+	char line[4096];
+	struct ifstat_ent *rev = NULL;	/* entries read so far, newest first */
+	struct ifstat_ent *ent;
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		char *field;
+		char *rest;
+		int k;
+
+		if (line[0] == '#') {
+			/* drop the final character (the newline) */
+			line[strlen(line)-1] = 0;
+			if (info_source[0] && strcmp(info_source, line+1))
+				source_mismatch = 1;
+			strncpy(info_source, line+1, sizeof(info_source)-1);
+			continue;
+		}
+
+		ent = malloc(sizeof(*ent));
+		if (ent == NULL)
+			abort();
+
+		/* interface index */
+		field = raw_field_cut(line);
+		if (sscanf(line, "%d", &ent->ifindex) != 1)
+			abort();
+
+		/* interface name */
+		rest = raw_field_cut(field);
+		ent->name = strdup(field);
+		field = rest;
+
+		/* MAXS pairs of counter value and integral rate */
+		for (k = 0; k < MAXS; k++) {
+			unsigned rate;
+
+			rest = raw_field_cut(field);
+			if (sscanf(field, "%llu", ent->val+k) != 1)
+				abort();
+			ent->ival[k] = (unsigned long)ent->val[k];
+			field = rest;
+
+			rest = raw_field_cut(field);
+			if (sscanf(field, "%u", &rate) != 1)
+				abort();
+			ent->rate[k] = rate;
+			field = rest;
+		}
+
+		ent->next = rev;
+		rev = ent;
+	}
+
+	/* move the reversed list onto kern_db, restoring file order */
+	while (rev != NULL) {
+		ent = rev;
+		rev = ent->next;
+		ent->next = kern_db;
+		kern_db = ent;
+	}
+}
+
+/*
+ * Write kern_db to FP in the raw text format read by load_raw_table():
+ * a "#<info_source>" line, then one line per interface.
+ *
+ * Interfaces rejected by match() are omitted unless TO_HIST is non-zero;
+ * in that case they are written with the values and rates of the hist_db
+ * entry carrying the same ifindex, if one is found at or after the current
+ * position in hist_db.
+ */
+void dump_raw_db(FILE *fp, int to_hist)
+{
+	struct ifstat_ent *cur;
+	struct ifstat_ent *hist = hist_db;
+
+	fprintf(fp, "#%s\n", info_source);
+
+	for (cur = kern_db; cur != NULL; cur = cur->next) {
+		unsigned long long *vals = cur->val;
+		double *rates = cur->rate;
+		int k;
+
+		if (!match(cur->name)) {
+			struct ifstat_ent *old;
+
+			if (!to_hist)
+				continue;
+
+			old = hist;
+			while (old != NULL && old->ifindex != cur->ifindex)
+				old = old->next;
+			if (old != NULL) {
+				vals = old->val;
+				rates = old->rate;
+				hist = old->next;
+			}
+		}
+
+		fprintf(fp, "%d %s ", cur->ifindex, cur->name);
+		for (k = 0; k < MAXS; k++)
+			fprintf(fp, "%llu %u ", vals[k], (unsigned)rates[k]);
+		fprintf(fp, "\n");
+	}
+}
+
+
+/*
+ * Print counter VALS[I] followed by its rate RATES[I] to FP.
+ * The counter is shown in units of 2^20 with an 'M' suffix above 2^30, in
+ * units of 2^10 with a 'K' suffix above 2^20, and plain otherwise.  The rate
+ * gets 'M' above 2^20 and 'K' above 2^10.
+ */
+void format_rate(FILE *fp, unsigned long long *vals, double *rates, int i)
+{
+	char temp[64];
+	unsigned long long count = vals[i];
+	double rate = rates[i];
+
+	if (count > 1024*1024*1024)
+		fprintf(fp, "%7lluM ", count/(1024*1024));
+	else if (count > 1024*1024)
+		fprintf(fp, "%7lluK ", count/1024);
+	else
+		fprintf(fp, "%8llu ", count);
+
+	if (rate > 1024*1024) {
+		sprintf(temp, "%uM", (unsigned)(rate/(1024*1024)));
+	} else if (rate > 1024) {
+		sprintf(temp, "%uK", (unsigned)(rate/1024));
+	} else {
+		fprintf(fp, "%-6u ", (unsigned)rate);
+		return;
+	}
+	fprintf(fp, "%-6s ", temp);
+}
+
+/*
+ * Print two counters, VALS[I] and VALS[K], to FP as one "value/value" style
+ * column.  Each is shown in units of 2^20 with an 'M' suffix above 2^30, in
+ * units of 2^10 with a 'K' suffix above 2^20, and plain otherwise.
+ */
+void format_pair(FILE *fp, unsigned long long *vals, int i, int k)
+{
+	char temp[64];
+	unsigned long long first = vals[i];
+	unsigned long long second = vals[k];
+
+	if (first > 1024*1024*1024)
+		fprintf(fp, "%7lluM ", first/(1024*1024));
+	else if (first > 1024*1024)
+		fprintf(fp, "%7lluK ", first/1024);
+	else
+		fprintf(fp, "%8llu ", first);
+
+	if (second > 1024*1024*1024) {
+		sprintf(temp, "%uM", (unsigned)(second/(1024*1024)));
+	} else if (second > 1024*1024) {
+		sprintf(temp, "%uK", (unsigned)(second/1024));
+	} else {
+		fprintf(fp, "%-6u ", (unsigned)second);
+		return;
+	}
+	fprintf(fp, "%-6s ", temp);
+}
+
+/*
+ * Print one header row to FP: the left-hand label FIRST followed by four
+ * "title/subtitle" columns taken from COLS.
+ */
+static void print_head_row(FILE *fp, const char *first,
+			   const char *const cols[4][2])
+{
+	int c;
+
+	fprintf(fp, "%-15s ", first);
+	for (c = 0; c < 3; c++)
+		fprintf(fp, "%8s/%-6s ", cols[c][0], cols[c][1]);
+	fprintf(fp, "%8s/%-6s\n", cols[3][0], cols[3][1]);
+}
+
+/*
+ * Print the table header to FP: the "#<info_source>" line, the traffic
+ * columns, and then either the single compact error row or, when
+ * show_errors is set, the four detailed error rows.
+ */
+void print_head(FILE *fp)
+{
+	static const char *const traffic[4][2] = {
+		{ "RX Pkts", "Rate" }, { "TX Pkts", "Rate" },
+		{ "RX Data", "Rate" }, { "TX Data", "Rate" },
+	};
+	static const char *const brief[4][2] = {
+		{ "RX Errs", "Drop" }, { "TX Errs", "Drop" },
+		{ "RX Over", "Rate" }, { "TX Coll", "Rate" },
+	};
+	static const char *const detail[4][4][2] = {
+		{ { "RX Errs", "Rate" }, { "RX Drop", "Rate" },
+		  { "RX Over", "Rate" }, { "RX Leng", "Rate" } },
+		{ { "RX Crc", "Rate" }, { "RX Frm", "Rate" },
+		  { "RX Fifo", "Rate" }, { "RX Miss", "Rate" } },
+		{ { "TX Errs", "Rate" }, { "TX Drop", "Rate" },
+		  { "TX Coll", "Rate" }, { "TX Carr", "Rate" } },
+		{ { "TX Abrt", "Rate" }, { "TX Fifo", "Rate" },
+		  { "TX Hear", "Rate" }, { "TX Wind", "Rate" } },
+	};
+	int row;
+
+	fprintf(fp, "#%s\n", info_source);
+	print_head_row(fp, "Interface", traffic);
+
+	if (!show_errors) {
+		print_head_row(fp, "", brief);
+		return;
+	}
+
+	for (row = 0; row < 4; row++)
+		print_head_row(fp, "", detail[row]);
+}
+
+/*
+ * Print the statistics of interface N to FP, laid out to line up with
+ * print_head().  Counter values come from VALS, rates from N->rate.
+ * Without show_errors a single compact error row follows the traffic row;
+ * with it, four detailed rows are printed.
+ */
+void print_one_if(FILE *fp, struct ifstat_ent *n, unsigned long long *vals)
+{
+	/* counter indices shown in each detailed error row */
+	static const int detail[4][4] = {
+		{  4,  6, 11, 10 },
+		{ 12, 13, 14, 15 },
+		{  5,  7,  9, 17 },
+		{ 16, 18, 19, 20 },
+	};
+	int row, col;
+
+	fprintf(fp, "%-15s ", n->name);
+	for (col = 0; col < 4; col++)
+		format_rate(fp, vals, n->rate, col);
+	fprintf(fp, "\n");
+
+	if (!show_errors) {
+		fprintf(fp, "%-15s ", "");
+		format_pair(fp, vals, 4, 6);
+		format_pair(fp, vals, 5, 7);
+		format_rate(fp, vals, n->rate, 11);
+		format_rate(fp, vals, n->rate, 9);
+		fprintf(fp, "\n");
+		return;
+	}
+
+	for (row = 0; row < 4; row++) {
+		fprintf(fp, "%-15s ", "");
+		for (col = 0; col < 4; col++)
+			format_rate(fp, vals, n->rate, detail[row][col]);
+		fprintf(fp, "\n");
+	}
+}
+
+
+/*
+ * Print the header and then the absolute counters of every kern_db
+ * interface accepted by match() to FP.
+ */
+void dump_kern_db(FILE *fp)
+{
+	struct ifstat_ent *cur;
+
+	print_head(fp);
+
+	for (cur = kern_db; cur != NULL; cur = cur->next) {
+		if (match(cur->name))
+			print_one_if(fp, cur, cur->val);
+	}
+}
+
+
+/*
+ * Print the header and then, for every kern_db interface accepted by
+ * match(), its counters minus those recorded in hist_db for the same
+ * ifindex.  hist_db is searched forward from just after the previous hit;
+ * an interface with no history entry is printed with absolute values.
+ */
+void dump_incr_db(FILE *fp)
+{
+	struct ifstat_ent *cur;
+	struct ifstat_ent *hist = hist_db;
+
+	print_head(fp);
+
+	for (cur = kern_db; cur != NULL; cur = cur->next) {
+		unsigned long long delta[MAXS];
+		struct ifstat_ent *old;
+		int k;
+
+		memcpy(delta, cur->val, sizeof(delta));
+
+		old = hist;
+		while (old != NULL && old->ifindex != cur->ifindex)
+			old = old->next;
+		if (old != NULL) {
+			for (k = 0; k < MAXS; k++)
+				delta[k] -= old->val[k];
+			hist = old->next;
+		}
+
+		if (match(cur->name))
+			print_one_if(fp, cur, delta);
+	}
+}
+
+
+static int children;
+
+/*
+ * SIGCHLD handler; intentionally empty.  Exited children are collected by
+ * the waitpid() loop in server_loop().
+ */
+void sigchild(int signo)
+{
+	(void)signo;
+}
+
+/*
+ * Fold a fresh kernel sample into the accumulated database.
+ *
+ * INTERVAL is the time in milliseconds since the previous sample.  For each
+ * kern_db entry the fresh entry with the same ifindex is looked up; the raw
+ * counter delta is added to the running total and the rate estimate is
+ * updated.  Fresh entries are freed as they are consumed or passed over,
+ * and whatever is left of the fresh list is freed before returning.
+ */
+void update_db(int interval)
+{
+	struct ifstat_ent *n, *h;
+
+	/* park the accumulated list while load_info() builds a fresh one */
+	n = kern_db;
+	kern_db = NULL;
+
+	load_info();
+
+	h = kern_db;
+	kern_db = n;
+
+	for (n = kern_db; n; n = n->next) {
+		struct ifstat_ent *h1;
+		for (h1 = h; h1; h1 = h1->next) {
+			if (h1->ifindex == n->ifindex) {
+				int i;
+				/*
+				 * If any counter went backwards, forget the
+				 * stored raw samples so that the deltas below
+				 * become the fresh absolute values.
+				 */
+				for (i = 0; i < MAXS; i++) {
+					if ((long)(h1->ival[i] - n->ival[i]) < 0) {
+						memset(n->ival, 0, sizeof(n->ival));
+						break;
+					}
+				}
+				for (i = 0; i < MAXS; i++) {
+					double sample;
+					unsigned long incr = h1->ival[i] - n->ival[i];
+					n->val[i] += incr;
+					n->ival[i] = h1->ival[i];
+					/*
+					 * Scale in floating point: incr*1000
+					 * wraps where unsigned long is 32 bits.
+					 */
+					sample = (double)incr*1000/interval;
+					if (interval >= scan_interval) {
+						n->rate[i] += W*(sample-n->rate[i]);
+					} else if (interval >= 1000) {
+						if (interval >= time_constant) {
+							n->rate[i] = sample;
+						} else {
+							double w = W*(double)interval/scan_interval;
+							n->rate[i] += w*(sample-n->rate[i]);
+						}
+					}
+				}
+
+				/* release fresh entries skipped before the hit */
+				while (h != h1) {
+					struct ifstat_ent *tmp = h;
+					h = h->next;
+					free(tmp->name);
+					free(tmp);
+				}
+				h = h1->next;
+				free(h1->name);
+				free(h1);
+				break;
+			}
+		}
+	}
+
+	/*
+	 * h now heads the fresh entries nobody consumed.  Release them;
+	 * otherwise they are lost on every call.
+	 */
+	while (h) {
+		struct ifstat_ent *tmp = h;
+		h = h->next;
+		free(tmp->name);
+		free(tmp);
+	}
+}
+
+#define T_DIFF(a,b) (((a).tv_sec-(b).tv_sec)*1000 + ((a).tv_usec-(b).tv_usec)/1000)
+
+
+/*
+ * Daemon main loop.  FD is the listening unix socket.
+ *
+ * Takes an initial sample, then forever: refreshes the database whenever at
+ * least scan_interval milliseconds have passed since the last snapshot, and
+ * waits for the rest of the interval for a client.  Each client is served
+ * by a forked child (at most 5 at a time) that writes the raw database to
+ * the connection.  Never returns.
+ */
+void server_loop(int fd)
+{
+	struct timeval snaptime;
+	struct pollfd p;
+	p.fd = fd;
+	p.events = p.revents = POLLIN;
+
+	snprintf(info_source, sizeof(info_source),
+		 "%d.%lu sampling_interval=%d time_const=%d",
+		 getpid(), (unsigned long)random(), scan_interval/1000, time_constant/1000);
+
+	load_info();
+	/* the snapshot time must be defined before T_DIFF() first reads it */
+	gettimeofday(&snaptime, NULL);
+
+	for (;;) {
+		int status;
+		int tdiff;
+		struct timeval now;
+		gettimeofday(&now, NULL);
+		tdiff = T_DIFF(now, snaptime);
+		if (tdiff < 0) {
+			/* clock stepped backwards: restart the interval */
+			snaptime = now;
+			tdiff = 0;
+		}
+		if (tdiff >= scan_interval) {
+			update_db(tdiff);
+			snaptime = now;
+			tdiff = 0;
+		}
+		/* wait only for what is left of the current interval */
+		if (poll(&p, 1, scan_interval - tdiff) > 0
+		    && (p.revents&POLLIN)) {
+			int clnt = accept(fd, NULL, NULL);
+			if (clnt >= 0) {
+				pid_t pid;
+				if (children >= 5) {
+					close(clnt);
+				} else if ((pid = fork()) != 0) {
+					/* parent, or fork failure */
+					if (pid>0)
+						children++;
+					close(clnt);
+				} else {
+					/* child: bring data up to date and send it */
+					FILE *fp = fdopen(clnt, "w");
+					if (fp) {
+						if (tdiff > 0)
+							update_db(tdiff);
+						dump_raw_db(fp, 0);
+					}
+					exit(0);
+				}
+			}
+		}
+		while (children && waitpid(-1, &status, WNOHANG) > 0)
+			children--;
+	}
+}
+
+/*
+ * Check who is on the other end of connected unix socket FD.
+ * Returns 0 when the peer's uid (SO_PEERCRED) is our own uid or 0, and -1
+ * when it is anyone else or the credentials cannot be obtained.
+ */
+int verify_forging(int fd)
+{
+	struct ucred cred;
+	/* getsockopt() takes a socklen_t length, which need not be an int */
+	socklen_t olen = sizeof(cred);
+
+	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, (void*)&cred, &olen) ||
+	    olen < sizeof(cred))
+		return -1;
+	if (cred.uid == getuid() || cred.uid == 0)
+		return 0;
+	return -1;
+}
+
+static void usage(void) __attribute__((noreturn));
+
+/*
+ * Print the command synopsis to stderr and exit with status -1.
+ * The option list mirrors the getopt() string in main(), including -e.
+ */
+static void usage(void)
+{
+	fprintf(stderr,
+"Usage: ifstat [ -h?vVzrnased:t: ] [ PATTERN [ PATTERN ] ]\n"
+		);
+	exit(-1);
+}
+
+
+/*
+ * ifstat entry point.
+ *
+ * With -d the program becomes a sampling daemon listening on a unix socket
+ * named after the uid.  Otherwise it loads the per-user history file, gets
+ * current counters from a running daemon (own uid first, then uid 0) or
+ * directly from the kernel, prints absolute or incremental statistics and
+ * rewrites the history file.
+ */
+int main(int argc, char *argv[])
+{
+	char hist_name[128];
+	struct sockaddr_un sun;
+	FILE *hist_fp = NULL;
+	int ch;
+	int fd;
+
+	while ((ch = getopt(argc, argv, "h?vVzrnasd:t:e")) != EOF) {
+		switch(ch) {
+		case 'z':
+			dump_zeros = 1;
+			break;
+		case 'r':
+			reset_history = 1;
+			break;
+		case 'a':
+			ignore_history = 1;
+			break;
+		case 's':
+			no_update = 1;
+			break;
+		case 'n':
+			no_output = 1;
+			break;
+		case 'e':
+			show_errors = 1;
+			break;
+		case 'd':
+			/* seconds on the command line, milliseconds internally */
+			scan_interval = 1000*atoi(optarg);
+			break;
+		case 't':
+			if (sscanf(optarg, "%d", &time_constant) != 1 ||
+			    time_constant <= 0) {
+				fprintf(stderr, "ifstat: invalid time constant divisor\n");
+				exit(-1);
+			}
+			break;
+		case 'v':
+		case 'V':
+			printf("ifstat utility, iproute2-ss%s\n", SNAPSHOT);
+			exit(0);
+		case 'h':
+		case '?':
+		default:
+			usage();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* socket name: a leading NUL byte followed by "ifstat<uid>" */
+	sun.sun_family = AF_UNIX;
+	sun.sun_path[0] = 0;
+	sprintf(sun.sun_path+1, "ifstat%d", getuid());
+
+	if (scan_interval > 0) {
+		pid_t pid;
+
+		if (time_constant == 0)
+			time_constant = 60;
+		time_constant *= 1000;
+		/* W = 1 - 10^(-scan_interval/time_constant) */
+		W = 1 - 1/exp(log(10)*(double)scan_interval/time_constant);
+		if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+			perror("ifstat: socket");
+			exit(-1);
+		}
+		if (bind(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) < 0) {
+			perror("ifstat: bind");
+			exit(-1);
+		}
+		if (listen(fd, 5) < 0) {
+			perror("ifstat: listen");
+			exit(-1);
+		}
+		/* report a failed fork instead of exiting with status 0 */
+		pid = fork();
+		if (pid < 0) {
+			perror("ifstat: fork");
+			exit(-1);
+		}
+		if (pid > 0)
+			exit(0);
+		chdir("/");
+		close(0); close(1); close(2); setsid();
+		signal(SIGPIPE, SIG_IGN);
+		signal(SIGCHLD, sigchild);
+		server_loop(fd);
+		exit(0);
+	}
+
+	patterns = argv;
+	npatterns = argc;
+
+	/*
+	 * The environment value is data, not a format: pass it through "%s"
+	 * so that '%' characters in it are not interpreted.
+	 */
+	if (getenv("IFSTAT_HISTORY"))
+		snprintf(hist_name, sizeof(hist_name), "%s", getenv("IFSTAT_HISTORY"));
+	else
+		sprintf(hist_name, "/tmp/.ifstat.u%d", getuid());
+
+	if (reset_history)
+		unlink(hist_name);
+
+	if (!ignore_history || !no_update) {
+		struct stat stb;
+
+		fd = open(hist_name, O_RDWR|O_CREAT|O_NOFOLLOW, 0600);
+		if (fd < 0) {
+			perror("ifstat: open history file");
+			exit(-1);
+		}
+		if ((hist_fp = fdopen(fd, "r+")) == NULL) {
+			perror("ifstat: fdopen history file");
+			exit(-1);
+		}
+		if (flock(fileno(hist_fp), LOCK_EX)) {
+			perror("ifstat: flock history file");
+			exit(-1);
+		}
+		if (fstat(fileno(hist_fp), &stb) != 0) {
+			perror("ifstat: fstat history file");
+			exit(-1);
+		}
+		/* refuse a file with extra hard links or a foreign owner */
+		if (stb.st_nlink != 1 || stb.st_uid != getuid()) {
+			fprintf(stderr, "ifstat: something is so wrong with history file, that I prefer not to proceed.\n");
+			exit(-1);
+		}
+		if (!ignore_history) {
+			FILE *tfp;
+			/* stays -1 (unknown) when /proc/uptime cannot be read */
+			long uptime = -1;
+			if ((tfp = fopen("/proc/uptime", "r")) != NULL) {
+				if (fscanf(tfp, "%ld", &uptime) != 1)
+					uptime = -1;
+				fclose(tfp);
+			}
+			/* history last written before the current boot is void */
+			if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) {
+				fprintf(stderr, "ifstat: history is aged out, resetting\n");
+				ftruncate(fileno(hist_fp), 0);
+			}
+		}
+
+		load_raw_table(hist_fp);
+
+		hist_db = kern_db;
+		kern_db = NULL;
+	}
+
+	/* try our own daemon first, then the one run by uid 0 */
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) >= 0 &&
+	    (connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0
+	     || (strcpy(sun.sun_path+1, "ifstat0"),
+		 connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0))
+	    && verify_forging(fd) == 0) {
+		FILE *sfp = fdopen(fd, "r");
+		if (sfp == NULL) {
+			perror("ifstat: fdopen");
+			exit(-1);
+		}
+		load_raw_table(sfp);
+		if (hist_db && source_mismatch) {
+			fprintf(stderr, "ifstat: history is stale, ignoring it.\n");
+			hist_db = NULL;
+		}
+		fclose(sfp);
+	} else {
+		if (fd >= 0)
+			close(fd);
+		/* history recorded from a daemon cannot be compared with kernel values */
+		if (hist_db && info_source[0] && strcmp(info_source, "kernel")) {
+			fprintf(stderr, "ifstat: history is stale, ignoring it.\n");
+			hist_db = NULL;
+			info_source[0] = 0;
+		}
+		load_info();
+		if (info_source[0] == 0)
+			strcpy(info_source, "kernel");
+	}
+
+	if (!no_output) {
+		if (ignore_history || hist_db == NULL)
+			dump_kern_db(stdout);
+		else
+			dump_incr_db(stdout);
+	}
+	if (!no_update) {
+		ftruncate(fileno(hist_fp), 0);
+		rewind(hist_fp);
+		dump_raw_db(hist_fp, 1);
+		fflush(hist_fp);
+	}
+	exit(0);
+}
diff --git a/misc/netbug b/misc/netbug
index e69de29b..6d13c8ee 100644
--- a/misc/netbug
+++ b/misc/netbug
@@ -0,0 +1,53 @@
+#! /bin/bash
+
+echo -n "Send network configuration summary to [ENTER means kuznet@ms2.inr.ac.ru] "
+IFS="" read mail || exit 1
+[ -z "$mail" ] && mail=kuznet@ms2.inr.ac.ru
+
+
+netbug=""
+while [ "$netbug" = "" ]; do
+ netbug=`echo netbug.$$.$RANDOM`
+ if [ -e /tmp/$netbug ]; then
+ netbug=""
+ fi
+done
+
+tmppath=/tmp/$netbug
+
+trap "rm -rf $tmppath $tmppath.tar.gz" 0 SIGINT
+
+mkdir $tmppath
+mkdir $tmppath/net
+
+cat /proc/slabinfo > $tmppath/slabinfo
+cat /proc/net/netstat > $tmppath/net/netstat
+cat /proc/net/unix > $tmppath/net/unix
+cat /proc/net/packet > $tmppath/net/packet
+cat /proc/net/netlink > $tmppath/net/netlink
+cat /proc/net/psched > $tmppath/net/psched
+cat /proc/net/softnet_stat > $tmppath/net/softnet_stat
+cat /proc/net/sockstat > $tmppath/net/sockstat
+cat /proc/net/tcp > $tmppath/net/tcp
+cat /proc/net/udp > $tmppath/net/udp
+cat /proc/net/raw > $tmppath/net/raw
+cat /proc/net/snmp > $tmppath/net/snmp
+
+ss -aioem -D $tmppath/tcpdiag
+
+if [ -e /proc/net/tcp6 ]; then
+ cat /proc/net/sockstat6 > $tmppath/net/sockstat6
+ cat /proc/net/tcp6 > $tmppath/net/tcp6
+ cat /proc/net/udp6 > $tmppath/net/udp6
+ cat /proc/net/raw6 > $tmppath/net/raw6
+ cat /proc/net/snmp6 > $tmppath/net/snmp6
+fi
+
+cd /tmp
+tar c $netbug | gzip -9c > $netbug.tar.gz
+
+uuencode $netbug.tar.gz $netbug.tar.gz | mail -s $netbug "$mail"
+
+echo "Sending to <$mail>; subject is $netbug"
+
+exit 0
diff --git a/misc/nstat.c b/misc/nstat.c
index e69de29b..9580ccf3 100644
--- a/misc/nstat.c
+++ b/misc/nstat.c
@@ -0,0 +1,614 @@
+/*
+ * nstat.c handy utility to read counters /proc/net/netstat and snmp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+#include <fnmatch.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <math.h>
+
+#include <SNAPSHOT.h>
+
+int dump_zeros = 0;
+int reset_history = 0;
+int ignore_history = 0;
+int no_output = 0;
+int no_update = 0;
+int scan_interval = 0;
+int time_constant = 0;
+double W;
+char **patterns;
+int npatterns;
+
+char info_source[128];
+int source_mismatch;
+
+/*
+ * Open a /proc file read-only and return the descriptor (-1 on failure).
+ *
+ * If environment variable ENV is set, its value is the full path to open.
+ * Otherwise the path is "<root>/<NAME>", where <root> is the value of
+ * PROC_ROOT or "/proc" when that is unset.
+ */
+int generic_proc_open(char *env, char *name)
+{
+	char store[128];
+	char *p = getenv(env);
+
+	if (!p) {
+		char *root = getenv("PROC_ROOT");
+
+		if (!root)
+			root = "/proc";
+		snprintf(store, sizeof(store), "%s/%s", root, name);
+		p = store;
+	}
+	/* open p, not store: store is only filled in when ENV is unset */
+	return open(p, O_RDONLY);
+}
+
+/* Open net/netstat under the proc root; PROC_NET_NETSTAT overrides the path. */
+int net_netstat_open(void)
+{
+	int fd = generic_proc_open("PROC_NET_NETSTAT", "net/netstat");
+
+	return fd;
+}
+
+/* Open net/snmp under the proc root; PROC_NET_SNMP overrides the path. */
+int net_snmp_open(void)
+{
+	int fd = generic_proc_open("PROC_NET_SNMP", "net/snmp");
+
+	return fd;
+}
+
+/* Open net/snmp6 under the proc root; PROC_NET_SNMP6 overrides the path. */
+int net_snmp6_open(void)
+{
+	int fd = generic_proc_open("PROC_NET_SNMP6", "net/snmp6");
+
+	return fd;
+}
+
+struct nstat_ent /* one named counter; entries form singly linked lists (kern_db, hist_db) */
+{
+ struct nstat_ent *next; /* next entry in the list, NULL at the end */
+ char *id; /* counter name, heap copy made with strdup(); the key compared with strcmp() */
+ unsigned long long val; /* running total; update_db() adds the delta between ival samples */
+ unsigned long ival; /* most recent raw counter sample */
+ double rate; /* rate estimate, maintained by update_db() */
+};
+
+struct nstat_ent *kern_db;
+struct nstat_ent *hist_db;
+
+char *useless_numbers[] = {
+"IpForwarding", "IpDefaultTTL",
+"TcpRtoAlgorithm", "TcpRtoMin", "TcpRtoMax",
+"TcpMaxConn", "TcpCurrEstab"
+};
+
+/*
+ * Return 1 if ID is one of the names listed in useless_numbers[], which the
+ * table loaders leave out of the database; return 0 otherwise.
+ */
+int useless_number(char *id)
+{
+	int idx = 0;
+
+	while (idx < sizeof(useless_numbers)/sizeof(*useless_numbers)) {
+		if (!strcmp(id, useless_numbers[idx]))
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Decide whether counter name ID is selected.
+ * Returns 1 when no patterns were supplied, or when ID matches at least one
+ * entry of the global patterns[] array under fnmatch(); returns 0 otherwise.
+ */
+int match(char *id)
+{
+	int idx = 0;
+
+	if (npatterns == 0)
+		return 1;
+
+	while (idx < npatterns) {
+		if (fnmatch(patterns[idx], id, 0) == 0)
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Read a "name value [rate]" table from FP and put the entries, in file
+ * order, in front of whatever kern_db already holds.
+ *
+ * A line starting with '#' names the information source: it is compared
+ * with the current info_source (setting source_mismatch when they differ)
+ * and then stored there.  Names listed in useless_numbers[] are skipped.
+ * A line without at least a name and a value, or an allocation failure,
+ * aborts the program.
+ */
+void load_good_table(FILE *fp)
+{
+	char buf[4096];
+	struct nstat_ent *db = NULL;
+	struct nstat_ent *n;
+
+	while (fgets(buf, sizeof(buf), fp) != NULL) {
+		int nr;
+		unsigned long long val;
+		double rate;
+		char idbuf[256];
+		if (buf[0] == '#') {
+			/* drop the final character (the newline) */
+			buf[strlen(buf)-1] = 0;
+			if (info_source[0] && strcmp(info_source, buf+1))
+				source_mismatch = 1;
+			strncpy(info_source, buf+1, sizeof(info_source)-1);
+			continue;
+		}
+		/*
+		 * The field width keeps a long name from overrunning idbuf
+		 * (255 characters plus the terminating NUL).
+		 */
+		nr = sscanf(buf, "%255s%llu%lg", idbuf, &val, &rate);
+		if (nr < 2)
+			abort();
+		if (nr < 3)
+			rate = 0;
+		if (useless_number(idbuf))
+			continue;
+		if ((n = malloc(sizeof(*n))) == NULL)
+			abort();
+		n->id = strdup(idbuf);
+		/* a NULL id would be dereferenced later by strcmp() */
+		if (n->id == NULL)
+			abort();
+		n->ival = (unsigned long)val;
+		n->val = val;
+		n->rate = rate;
+		n->next = db;
+		db = n;
+	}
+
+	/* move the reversed list onto kern_db, restoring file order */
+	while (db) {
+		n = db;
+		db = db->next;
+		n->next = kern_db;
+		kern_db = n;
+	}
+}
+
+
+/*
+ * Read a table made of line pairs
+ *     Group: Name1 Name2 ...
+ *     Group: value1 value2 ...
+ * from FP.  Each counter is stored under the id "Group" + "NameN".  Entries
+ * whose id is listed in useless_numbers[] are discarded; the rest are put
+ * in front of whatever kern_db already holds.
+ *
+ * Names are pushed on a list head-first while the header line is scanned
+ * left to right; values are then taken from the value line right to left,
+ * so both walks meet the counters in the same order.
+ *
+ * Malformed input or allocation failure aborts the program.
+ */
+void load_ugly_table(FILE *fp)
+{
+	char buf[4096];
+	struct nstat_ent *db = NULL;
+	struct nstat_ent *n;
+
+	while (fgets(buf, sizeof(buf), fp) != NULL) {
+		char idbuf[256];
+		int off;
+		char *p;
+
+		p = strchr(buf, ':');
+		if (!p)
+			abort();
+		*p = 0;
+		/* the group prefix must fit idbuf; a line is up to 4095 bytes */
+		if (strlen(buf) >= sizeof(idbuf))
+			abort();
+		strcpy(idbuf, buf);
+		off = strlen(idbuf);
+		/*
+		 * Step over the ':' and the separator after it, without
+		 * walking past the end of the string on a bare "Group:".
+		 */
+		p++;
+		if (*p)
+			p++;
+
+		while (*p) {
+			char *next;
+			if ((next = strchr(p, ' ')) != NULL)
+				*next++ = 0;
+			else if ((next = strchr(p, '\n')) != NULL)
+				*next++ = 0;
+			/* bounded append: an over-long name is truncated */
+			snprintf(idbuf+off, sizeof(idbuf)-off, "%s", p);
+			n = malloc(sizeof(*n));
+			if (!n)
+				abort();
+			n->id = strdup(idbuf);
+			if (!n->id)
+				abort();
+			n->rate = 0;
+			n->next = db;
+			db = n;
+			/*
+			 * No space and no newline followed this name, i.e. it
+			 * ran to the end of the buffer: nothing more to scan.
+			 */
+			if (!next)
+				break;
+			p = next;
+		}
+		n = db;
+		if (fgets(buf, sizeof(buf), fp) == NULL)
+			abort();
+		do {
+			/* peel the last value off the end of the line */
+			p = strrchr(buf, ' ');
+			if (!p)
+				abort();
+			*p = 0;
+			if (sscanf(p+1, "%lu", &n->ival) != 1)
+				abort();
+			n->val = n->ival;
+			/* Trick to skip "dummy" trailing ICMP MIB in 2.4 */
+			if (strcmp(idbuf, "IcmpOutAddrMaskReps") == 0)
+				idbuf[5] = 0;
+			else
+				n = n->next;
+			/* stop once the space right after "Group:" was consumed */
+		} while (p > buf + off + 2);
+	}
+
+	/* move the list onto kern_db, dropping the uninteresting entries */
+	while (db) {
+		n = db;
+		db = db->next;
+		if (useless_number(n->id)) {
+			free(n->id);
+			free(n);
+		} else {
+			n->next = kern_db;
+			kern_db = n;
+		}
+	}
+}
+
+/*
+ * Load the counters of the net/snmp file into kern_db.  Does nothing when
+ * no stdio stream can be obtained for the file.
+ */
+void load_snmp(void)
+{
+	FILE *stream = fdopen(net_snmp_open(), "r");
+
+	if (stream == NULL)
+		return;
+	load_ugly_table(stream);
+	fclose(stream);
+}
+
+/*
+ * Load the counters of the net/snmp6 file into kern_db.  Does nothing when
+ * no stdio stream can be obtained for the file.
+ */
+void load_snmp6(void)
+{
+	FILE *stream = fdopen(net_snmp6_open(), "r");
+
+	if (stream == NULL)
+		return;
+	load_good_table(stream);
+	fclose(stream);
+}
+
+/*
+ * Load the counters of the net/netstat file into kern_db.  Does nothing
+ * when no stdio stream can be obtained for the file.
+ */
+void load_netstat(void)
+{
+	FILE *stream = fdopen(net_netstat_open(), "r");
+
+	if (stream == NULL)
+		return;
+	load_ugly_table(stream);
+	fclose(stream);
+}
+
+/*
+ * Write kern_db to FP: a "#<info_source>" line, then one line per counter
+ * with its value and rate.  Counters whose value and rate are both zero are
+ * left out unless dump_zeros is set.
+ *
+ * Counters rejected by match() are omitted unless TO_HIST is non-zero; in
+ * that case they are written with the value of the hist_db entry of the
+ * same name, if one is found at or after the current position in hist_db.
+ */
+void dump_kern_db(FILE *fp, int to_hist)
+{
+	struct nstat_ent *cur;
+	struct nstat_ent *hist = hist_db;
+
+	fprintf(fp, "#%s\n", info_source);
+	for (cur = kern_db; cur != NULL; cur = cur->next) {
+		unsigned long long val = cur->val;
+
+		if (!(dump_zeros || val || cur->rate))
+			continue;
+
+		if (!match(cur->id)) {
+			struct nstat_ent *old;
+
+			if (!to_hist)
+				continue;
+
+			old = hist;
+			while (old != NULL && strcmp(old->id, cur->id) != 0)
+				old = old->next;
+			if (old != NULL) {
+				val = old->val;
+				hist = old->next;
+			}
+		}
+		fprintf(fp, "%-32s%-16llu%6.1f\n", cur->id, val, cur->rate);
+	}
+}
+
+/*
+ * Write to FP, for every kern_db counter accepted by match(), its value
+ * minus the value recorded in hist_db under the same name.  hist_db is
+ * searched forward from just after the previous hit.  When the current
+ * value is below the recorded one the difference is shown as 0 and the
+ * line is tagged " (overflow)".  Lines whose difference and rate are both
+ * zero are left out unless dump_zeros is set.
+ */
+void dump_incr_db(FILE *fp)
+{
+	struct nstat_ent *cur;
+	struct nstat_ent *hist = hist_db;
+
+	fprintf(fp, "#%s\n", info_source);
+	for (cur = kern_db; cur != NULL; cur = cur->next) {
+		unsigned long long val = cur->val;
+		struct nstat_ent *old = hist;
+		int ovfl = 0;
+
+		while (old != NULL && strcmp(old->id, cur->id) != 0)
+			old = old->next;
+		if (old != NULL) {
+			if (val < old->val) {
+				ovfl = 1;
+				val = old->val;
+			}
+			val -= old->val;
+			hist = old->next;
+		}
+
+		if (!(dump_zeros || val || cur->rate))
+			continue;
+		if (!match(cur->id))
+			continue;
+		fprintf(fp, "%-32s%-16llu%6.1f%s\n", cur->id, val,
+			cur->rate, ovfl?" (overflow)":"");
+	}
+}
+
+static int children;
+
+/*
+ * SIGCHLD handler; intentionally empty.  Exited children are collected by
+ * the waitpid() loop in server_loop().
+ */
+void sigchild(int signo)
+{
+	(void)signo;
+}
+
+/*
+ * Fold a fresh kernel sample into the accumulated database.
+ *
+ * INTERVAL is the time in milliseconds since the previous sample.  For each
+ * kern_db entry the fresh entry of the same name is looked up; the raw
+ * counter delta is added to the running total and the rate estimate is
+ * updated.  Fresh entries are freed as they are consumed or passed over,
+ * and whatever is left of the fresh list is freed before returning.
+ */
+void update_db(int interval)
+{
+	struct nstat_ent *n, *h;
+
+	/* park the accumulated list while the loaders build a fresh one */
+	n = kern_db;
+	kern_db = NULL;
+
+	load_netstat();
+	load_snmp6();
+	load_snmp();
+
+	h = kern_db;
+	kern_db = n;
+
+	for (n = kern_db; n; n = n->next) {
+		struct nstat_ent *h1;
+		for (h1 = h; h1; h1 = h1->next) {
+			if (strcmp(h1->id, n->id) == 0) {
+				double sample;
+				unsigned long incr = h1->ival - n->ival;
+				n->val += incr;
+				n->ival = h1->ival;
+				/*
+				 * Scale in floating point: incr*1000 wraps
+				 * where unsigned long is 32 bits.
+				 */
+				sample = (double)incr*1000/interval;
+				if (interval >= scan_interval) {
+					n->rate += W*(sample-n->rate);
+				} else if (interval >= 1000) {
+					if (interval >= time_constant) {
+						n->rate = sample;
+					} else {
+						double w = W*(double)interval/scan_interval;
+						n->rate += w*(sample-n->rate);
+					}
+				}
+
+				/* release fresh entries skipped before the hit */
+				while (h != h1) {
+					struct nstat_ent *tmp = h;
+					h = h->next;
+					free(tmp->id);
+					free(tmp);
+				}
+				h = h1->next;
+				free(h1->id);
+				free(h1);
+				break;
+			}
+		}
+	}
+
+	/*
+	 * h now heads the fresh entries nobody consumed.  Release them;
+	 * otherwise they are lost on every call.
+	 */
+	while (h) {
+		struct nstat_ent *tmp = h;
+		h = h->next;
+		free(tmp->id);
+		free(tmp);
+	}
+}
+
+#define T_DIFF(a,b) (((a).tv_sec-(b).tv_sec)*1000 + ((a).tv_usec-(b).tv_usec)/1000)
+
+
+/*
+ * Daemon main loop.  FD is the listening unix socket.
+ *
+ * Takes an initial sample, then forever: refreshes the database whenever at
+ * least scan_interval milliseconds have passed since the last snapshot, and
+ * waits for the rest of the interval for a client.  Each client is served
+ * by a forked child (at most 5 at a time) that writes the database to the
+ * connection.  Never returns.
+ */
+void server_loop(int fd)
+{
+	struct timeval snaptime;
+	struct pollfd p;
+	p.fd = fd;
+	p.events = p.revents = POLLIN;
+
+	snprintf(info_source, sizeof(info_source),
+		 "%d.%lu sampling_interval=%d time_const=%d",
+		 getpid(), (unsigned long)random(), scan_interval/1000, time_constant/1000);
+
+	load_netstat();
+	load_snmp6();
+	load_snmp();
+	/* the snapshot time must be defined before T_DIFF() first reads it */
+	gettimeofday(&snaptime, NULL);
+
+	for (;;) {
+		int status;
+		int tdiff;
+		struct timeval now;
+		gettimeofday(&now, NULL);
+		tdiff = T_DIFF(now, snaptime);
+		if (tdiff < 0) {
+			/* clock stepped backwards: restart the interval */
+			snaptime = now;
+			tdiff = 0;
+		}
+		if (tdiff >= scan_interval) {
+			update_db(tdiff);
+			snaptime = now;
+			tdiff = 0;
+		}
+		/* wait only for what is left of the current interval */
+		if (poll(&p, 1, scan_interval - tdiff) > 0
+		    && (p.revents&POLLIN)) {
+			int clnt = accept(fd, NULL, NULL);
+			if (clnt >= 0) {
+				pid_t pid;
+				if (children >= 5) {
+					close(clnt);
+				} else if ((pid = fork()) != 0) {
+					/* parent, or fork failure */
+					if (pid>0)
+						children++;
+					close(clnt);
+				} else {
+					/* child: bring data up to date and send it */
+					FILE *fp = fdopen(clnt, "w");
+					if (fp) {
+						if (tdiff > 0)
+							update_db(tdiff);
+						dump_kern_db(fp, 0);
+					}
+					exit(0);
+				}
+			}
+		}
+		while (children && waitpid(-1, &status, WNOHANG) > 0)
+			children--;
+	}
+}
+
+/*
+ * Check who is on the other end of connected unix socket FD.
+ * Returns 0 when the peer's uid (SO_PEERCRED) is our own uid or 0, and -1
+ * when it is anyone else or the credentials cannot be obtained.
+ */
+int verify_forging(int fd)
+{
+	struct ucred cred;
+	/* getsockopt() takes a socklen_t length, which need not be an int */
+	socklen_t olen = sizeof(cred);
+
+	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, (void*)&cred, &olen) ||
+	    olen < sizeof(cred))
+		return -1;
+	if (cred.uid == getuid() || cred.uid == 0)
+		return 0;
+	return -1;
+}
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the command synopsis to stderr and exit with status -1. */
+static void usage(void)
+{
+	fprintf(stderr,
+		"Usage: nstat [ -h?vVzrnasd:t: ] [ PATTERN [ PATTERN ] ]\n");
+	exit(-1);
+}
+
+
+/*
+ * nstat entry point.
+ *
+ * With -d the program becomes a sampling daemon listening on a unix socket
+ * named after the uid.  Otherwise it loads the per-user history file, gets
+ * current counters from a running daemon (own uid first, then uid 0) or
+ * directly from the proc files, prints absolute or incremental values and
+ * rewrites the history file.
+ */
+int main(int argc, char *argv[])
+{
+	char hist_name[128];
+	struct sockaddr_un sun;
+	FILE *hist_fp = NULL;
+	int ch;
+	int fd;
+
+	while ((ch = getopt(argc, argv, "h?vVzrnasd:t:")) != EOF) {
+		switch(ch) {
+		case 'z':
+			dump_zeros = 1;
+			break;
+		case 'r':
+			reset_history = 1;
+			break;
+		case 'a':
+			ignore_history = 1;
+			break;
+		case 's':
+			no_update = 1;
+			break;
+		case 'n':
+			no_output = 1;
+			break;
+		case 'd':
+			/* seconds on the command line, milliseconds internally */
+			scan_interval = 1000*atoi(optarg);
+			break;
+		case 't':
+			if (sscanf(optarg, "%d", &time_constant) != 1 ||
+			    time_constant <= 0) {
+				fprintf(stderr, "nstat: invalid time constant divisor\n");
+				exit(-1);
+			}
+			break;
+		case 'v':
+		case 'V':
+			printf("nstat utility, iproute2-ss%s\n", SNAPSHOT);
+			exit(0);
+		case 'h':
+		case '?':
+		default:
+			usage();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* socket name: a leading NUL byte followed by "nstat<uid>" */
+	sun.sun_family = AF_UNIX;
+	sun.sun_path[0] = 0;
+	sprintf(sun.sun_path+1, "nstat%d", getuid());
+
+	if (scan_interval > 0) {
+		pid_t pid;
+
+		if (time_constant == 0)
+			time_constant = 60;
+		time_constant *= 1000;
+		/* W = 1 - 10^(-scan_interval/time_constant) */
+		W = 1 - 1/exp(log(10)*(double)scan_interval/time_constant);
+		if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+			perror("nstat: socket");
+			exit(-1);
+		}
+		if (bind(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) < 0) {
+			perror("nstat: bind");
+			exit(-1);
+		}
+		if (listen(fd, 5) < 0) {
+			perror("nstat: listen");
+			exit(-1);
+		}
+		/* report a failed fork instead of exiting with status 0 */
+		pid = fork();
+		if (pid < 0) {
+			perror("nstat: fork");
+			exit(-1);
+		}
+		if (pid > 0)
+			exit(0);
+		chdir("/");
+		close(0); close(1); close(2); setsid();
+		signal(SIGPIPE, SIG_IGN);
+		signal(SIGCHLD, sigchild);
+		server_loop(fd);
+		exit(0);
+	}
+
+	patterns = argv;
+	npatterns = argc;
+
+	/*
+	 * The environment value is data, not a format: pass it through "%s"
+	 * so that '%' characters in it are not interpreted.
+	 */
+	if (getenv("NSTAT_HISTORY"))
+		snprintf(hist_name, sizeof(hist_name), "%s", getenv("NSTAT_HISTORY"));
+	else
+		sprintf(hist_name, "/tmp/.nstat.u%d", getuid());
+
+	if (reset_history)
+		unlink(hist_name);
+
+	if (!ignore_history || !no_update) {
+		struct stat stb;
+
+		fd = open(hist_name, O_RDWR|O_CREAT|O_NOFOLLOW, 0600);
+		if (fd < 0) {
+			perror("nstat: open history file");
+			exit(-1);
+		}
+		if ((hist_fp = fdopen(fd, "r+")) == NULL) {
+			perror("nstat: fdopen history file");
+			exit(-1);
+		}
+		if (flock(fileno(hist_fp), LOCK_EX)) {
+			perror("nstat: flock history file");
+			exit(-1);
+		}
+		if (fstat(fileno(hist_fp), &stb) != 0) {
+			perror("nstat: fstat history file");
+			exit(-1);
+		}
+		/* refuse a file with extra hard links or a foreign owner */
+		if (stb.st_nlink != 1 || stb.st_uid != getuid()) {
+			fprintf(stderr, "nstat: something is so wrong with history file, that I prefer not to proceed.\n");
+			exit(-1);
+		}
+		if (!ignore_history) {
+			FILE *tfp;
+			/* stays -1 (unknown) when /proc/uptime cannot be read */
+			long uptime = -1;
+			if ((tfp = fopen("/proc/uptime", "r")) != NULL) {
+				if (fscanf(tfp, "%ld", &uptime) != 1)
+					uptime = -1;
+				fclose(tfp);
+			}
+			/* history last written before the current boot is void */
+			if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) {
+				fprintf(stderr, "nstat: history is aged out, resetting\n");
+				ftruncate(fileno(hist_fp), 0);
+			}
+		}
+
+		load_good_table(hist_fp);
+
+		hist_db = kern_db;
+		kern_db = NULL;
+	}
+
+	/* try our own daemon first, then the one run by uid 0 */
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) >= 0 &&
+	    (connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0
+	     || (strcpy(sun.sun_path+1, "nstat0"),
+		 connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0))
+	    && verify_forging(fd) == 0) {
+		FILE *sfp = fdopen(fd, "r");
+		if (sfp == NULL) {
+			perror("nstat: fdopen");
+			exit(-1);
+		}
+		load_good_table(sfp);
+		if (hist_db && source_mismatch) {
+			fprintf(stderr, "nstat: history is stale, ignoring it.\n");
+			hist_db = NULL;
+		}
+		fclose(sfp);
+	} else {
+		if (fd >= 0)
+			close(fd);
+		/* history recorded from a daemon cannot be compared with kernel values */
+		if (hist_db && info_source[0] && strcmp(info_source, "kernel")) {
+			fprintf(stderr, "nstat: history is stale, ignoring it.\n");
+			hist_db = NULL;
+			info_source[0] = 0;
+		}
+		load_netstat();
+		load_snmp6();
+		load_snmp();
+		if (info_source[0] == 0)
+			strcpy(info_source, "kernel");
+	}
+
+	if (!no_output) {
+		if (ignore_history || hist_db == NULL)
+			dump_kern_db(stdout, 0);
+		else
+			dump_incr_db(stdout);
+	}
+	if (!no_update) {
+		ftruncate(fileno(hist_fp), 0);
+		rewind(hist_fp);
+		dump_kern_db(hist_fp, 1);
+		fflush(hist_fp);
+	}
+	exit(0);
+}
diff --git a/misc/rtacct.c b/misc/rtacct.c
index e69de29b..5c6748b9 100644
--- a/misc/rtacct.c
+++ b/misc/rtacct.c
@@ -0,0 +1,625 @@
+/*
+ * rtacct.c Applet to display contents of /proc/net/rt_acct.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+#include <fnmatch.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <math.h>
+
+#include "rt_names.h"
+
+#include <SNAPSHOT.h>
+
+int reset_history = 0; /* -r: main() unlinks the history file before using it */
+int ignore_history = 0; /* -a: skip the aging check and always print absolute values */
+int no_output = 0; /* -n: the dump routines print nothing */
+int no_update = 0; /* -s: history is mapped MAP_PRIVATE, so updates never reach the file */
+int scan_interval = 0; /* -d seconds, stored in milliseconds; > 0 makes main() start the server */
+int time_constant = 0; /* -t value; main() defaults it to 60 and multiplies by 1000 in server mode */
+int dump_zeros = 0; /* -z: also print all-zero realms (cleared again when no realm list is given) */
+unsigned long magic_number = 0; /* -M hex offset; nonzero makes read_kern_table() mmap /dev/mem there */
+double W; /* smoothing weight for update_db(), computed in main() from scan_interval and time_constant */
+
+/*
+ * Open a proc-style file read-only.
+ *
+ * env  - name of an environment variable; when set, its value is used
+ *        verbatim as the path to open.
+ * name - path relative to the proc root, used when env is not set.  The
+ *        root is $PROC_ROOT if set, "/proc" otherwise.
+ *
+ * Returns the descriptor from open(2), or -1 on failure.
+ */
+int generic_proc_open(char *env, char *name)
+{
+	char store[1024];
+	char *p = getenv(env);
+
+	if (!p) {
+		p = getenv("PROC_ROOT") ? : "/proc";
+		snprintf(store, sizeof(store), "%s/%s", p, name);
+		p = store;
+	}
+	/*
+	 * Open p, not store: store is only filled in on the fallback path,
+	 * so opening it when the override variable is set would read an
+	 * uninitialised buffer and ignore the override.
+	 */
+	return open(p, O_RDONLY);
+}
+
+/*
+ * Open the realm accounting table: the file named by $PROC_NET_RTACCT,
+ * or net/rt_acct below the proc root.  Returns a descriptor or -1.
+ */
+int net_rtacct_open(void)
+{
+	int fd = generic_proc_open("PROC_NET_RTACCT", "net/rt_acct");
+
+	return fd;
+}
+
+__u32 rmap[256/4]; /* realm selection bitmap: realm r is shown when bit (r & 0x1f) of word (r >> 5) is set */
+
+struct rtacct_data /* one accounting snapshot; sent whole over the server socket and used as the history file layout */
+{
+ __u32 ival[256*4]; /* raw counters as last read from the kernel table, 4 per realm */
+
+ unsigned long long val[256*4]; /* accumulated counters; per realm in the printed order BytesTo, PktsTo, BytesFrom, PktsFrom */
+ double rate[256*4]; /* per-second rates maintained by update_db(), same indexing as val */
+ __u8 signature[128]; /* data source tag: "kernel", or the string server_loop() generates */
+};
+
+struct rtacct_data kern_db_static; /* storage that kern_db points at */
+
+struct rtacct_data *kern_db = &kern_db_static; /* current data: read from the server socket or built from the kernel table */
+struct rtacct_data *hist_db; /* mmap()ed history file; NULL when not opened or when main() judged it stale */
+
+/*
+ * Read exactly tot bytes from fd into buf.
+ *
+ * Reads interrupted by a signal (EINTR) are retried.  Any other read
+ * error, or end of file before tot bytes have arrived, terminates the
+ * process with exit(-1), so the function only returns on full success.
+ */
+void nread(int fd, char *buf, int tot)
+{
+	char *dst = buf;
+	int left = tot;
+
+	while (left > 0) {
+		int got = read(fd, dst, left);
+
+		if (got > 0) {
+			dst += got;
+			left -= got;
+			continue;
+		}
+		if (got < 0 && errno == EINTR)
+			continue;
+		/* Hard error or premature EOF. */
+		exit(-1);
+	}
+}
+
+
+/*
+ * Fetch the kernel realm accounting table (256*16 bytes).
+ *
+ * Normal mode (magic_number == 0): the table is read from the rt_acct
+ * proc file into tbl; if that file cannot be opened tbl is zero-filled.
+ * tbl is returned.
+ *
+ * Magic mode (magic_number != 0): on the first call 4096 bytes of
+ * /dev/mem at offset magic_number are mapped read-only; that mapping is
+ * returned on this and every later call and tbl is left untouched.
+ * Failure to open or map /dev/mem is fatal.
+ */
+__u32 *read_kern_table(__u32 *tbl)
+{
+	static __u32 *tbl_ptr;
+	int fd;
+
+	if (!magic_number) {
+		fd = net_rtacct_open();
+		if (fd < 0) {
+			memset(tbl, 0, 256*16);
+			return tbl;
+		}
+		nread(fd, (char*)tbl, 256*16);
+		close(fd);
+		return tbl;
+	}
+
+	/* Magic mode: map once, reuse afterwards. */
+	if (tbl_ptr != NULL)
+		return tbl_ptr;
+
+	fd = open("/dev/mem", O_RDONLY);
+	if (fd < 0) {
+		perror("magic open");
+		exit(-1);
+	}
+	tbl_ptr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, magic_number);
+	if ((unsigned long)tbl_ptr == ~0UL) {
+		perror("magic mmap");
+		exit(-1);
+	}
+	/* The mapping stays valid after the descriptor is closed. */
+	close(fd);
+	return tbl_ptr;
+}
+
+/*
+ * Print a rate to fp as a leading space plus a left-aligned 10-column
+ * field.  Values above 1024 are rounded and shown with a K suffix,
+ * values above 1024*1024 with an M suffix; anything else is printed as
+ * a truncated unsigned integer.
+ */
+void format_rate(FILE *fp, double rate)
+{
+	char scaled[64];
+
+	/* Written as !(>) so that a NaN takes the plain branch, as before. */
+	if (!(rate > 1024)) {
+		fprintf(fp, " %-10u", (unsigned)rate);
+		return;
+	}
+
+	if (rate > 1024*1024)
+		sprintf(scaled, "%uM", (unsigned)rint(rate/(1024*1024)));
+	else
+		sprintf(scaled, "%uK", (unsigned)rint(rate/1024));
+
+	fprintf(fp, " %-10s", scaled);
+}
+
+/*
+ * Print a counter to fp as a leading space plus a right-aligned
+ * 10-column number.  Counters above 2^30 are shown divided by 2^20 with
+ * an M suffix, counters above 2^20 divided by 2^10 with a K suffix.
+ */
+void format_count(FILE *fp, unsigned long long val)
+{
+	const unsigned long long kilo = 1024;
+	const unsigned long long mega = kilo * kilo;
+	const unsigned long long giga = mega * kilo;
+
+	if (val > giga) {
+		fprintf(fp, " %10lluM", val / mega);
+		return;
+	}
+	if (val > mega) {
+		fprintf(fp, " %10lluK", val / kilo);
+		return;
+	}
+	fprintf(fp, " %10llu", val);
+}
+
+/*
+ * Print the absolute counters and rates in kern_db for every realm
+ * selected in rmap.
+ *
+ * Unless no_output is set, a "#signature" line and two column header
+ * lines come first, then two lines per realm (counters, rates).  Realms
+ * whose counters and rates are all zero are skipped unless dump_zeros
+ * is set.  When a history mapping exists, the counters of every realm
+ * that was not skipped are copied into it, even with no_output set.
+ */
+void dump_abs_db(FILE *fp)
+{
+	char name[16];
+	int realm;
+
+	if (!no_output) {
+		fprintf(fp, "#%s\n", kern_db->signature);
+		fprintf(fp, "%-10s %-10s %-10s %-10s %-10s \n",
+			"Realm", "BytesTo", "PktsTo", "BytesFrom", "PktsFrom");
+		fprintf(fp, "%-10s %-10s %-10s %-10s %-10s \n",
+			"", "BPSTo", "PPSTo", "BPSFrom", "PPSFrom");
+	}
+
+	for (realm = 0; realm < 256; realm++) {
+		unsigned long long *val = &kern_db->val[realm*4];
+		double *rate = &kern_db->rate[realm*4];
+		int k;
+
+		if (!(rmap[realm>>5] & (1<<(realm&0x1f))))
+			continue;
+
+		if (!dump_zeros) {
+			int busy = 0;
+
+			for (k = 0; k < 4; k++)
+				if (val[k] || rate[k])
+					busy = 1;
+			if (!busy)
+				continue;
+		}
+
+		if (hist_db)
+			memcpy(&hist_db->val[realm*4], val, sizeof(*val)*4);
+
+		if (no_output)
+			continue;
+
+		fprintf(fp, "%-10s", rtnl_rtrealm_n2a(realm, name, sizeof(name)));
+		for (k = 0; k < 4; k++)
+			format_count(fp, val[k]);
+		fprintf(fp, "\n%-10s", "");
+		for (k = 0; k < 4; k++)
+			format_rate(fp, rate[k]);
+		fprintf(fp, "\n");
+	}
+}
+
+
+/*
+ * Print, for every realm selected in rmap, the counter increase since
+ * the history snapshot together with the current rates.
+ *
+ * If any of a realm's four counters is smaller than its history value,
+ * the absolute values are printed for that realm instead of differences.
+ * The history values are then replaced by the current counters.  Output
+ * is suppressed by no_output; all-zero rows are skipped unless
+ * dump_zeros is set.  main() only calls this with hist_db non-NULL.
+ */
+void dump_incr_db(FILE *fp)
+{
+	char name[16];
+	int realm;
+
+	if (!no_output) {
+		fprintf(fp, "#%s\n", kern_db->signature);
+		fprintf(fp, "%-10s %-10s %-10s %-10s %-10s \n",
+			"Realm", "BytesTo", "PktsTo", "BytesFrom", "PktsFrom");
+		fprintf(fp, "%-10s %-10s %-10s %-10s %-10s \n",
+			"", "BPSTo", "PPSTo", "BPSFrom", "PPSFrom");
+	}
+
+	for (realm = 0; realm < 256; realm++) {
+		unsigned long long *cur, *old;
+		unsigned long long delta[4];
+		double *rate;
+		int wrapped = 0;
+		int k;
+
+		if (!(rmap[realm>>5] & (1<<(realm&0x1f))))
+			continue;
+
+		cur = &kern_db->val[realm*4];
+		rate = &kern_db->rate[realm*4];
+		old = &hist_db->val[realm*4];
+
+		/* One counter going backwards invalidates the whole row. */
+		for (k = 0; k < 4; k++)
+			if (cur[k] < old[k])
+				wrapped = 1;
+		for (k = 0; k < 4; k++)
+			delta[k] = wrapped ? cur[k] : cur[k] - old[k];
+
+		if (hist_db)
+			memcpy(old, cur, sizeof(*cur)*4);
+
+		if (no_output)
+			continue;
+
+		if (!dump_zeros) {
+			int busy = 0;
+
+			for (k = 0; k < 4; k++)
+				if (delta[k] || rate[k])
+					busy = 1;
+			if (!busy)
+				continue;
+		}
+
+		fprintf(fp, "%-10s", rtnl_rtrealm_n2a(realm, name, sizeof(name)));
+		for (k = 0; k < 4; k++)
+			format_count(fp, delta[k]);
+		fprintf(fp, "\n%-10s", "");
+		for (k = 0; k < 4; k++)
+			format_rate(fp, rate[k]);
+		fprintf(fp, "\n");
+	}
+}
+
+
+static int children; /* forked client handlers that server_loop() has not reaped yet */
+
+void sigchild(int signo) /* SIGCHLD handler installed by main(); does nothing — presumably only there so the signal interrupts poll(), confirm */
+{
+}
+
+/*
+ * Server side only: read kernel data, update tables, calculate rates.
+ *
+ * interval is the time in milliseconds since the previous update; the
+ * callers in server_loop() only pass values greater than zero.
+ *
+ * For every counter the 32-bit difference to the previous raw value is
+ * added to the accumulated value, and the rate estimate is moved toward
+ * the new per-second sample:
+ *   - interval >= scan_interval: exponential smoothing with weight W;
+ *   - 1000 <= interval < scan_interval: weight scaled down by
+ *     interval/scan_interval (or the sample taken as-is if interval
+ *     reaches time_constant);
+ *   - interval < 1000: the rate is left alone.
+ */
+void update_db(int interval)
+{
+	int i;
+	__u32 *ival;
+	__u32 raw[256*4];
+
+	ival = read_kern_table(raw);
+
+	for (i = 0; i < 256*4; i++) {
+		double sample;
+		/* Unsigned subtraction handles wrap of the 32-bit kernel counter. */
+		__u32 incr = ival[i] - kern_db->ival[i];
+
+		/* Nothing ever counted here: skip the arithmetic. */
+		if (ival[i] == 0 && incr == 0 &&
+		    kern_db->val[i] == 0 && kern_db->rate[i] == 0)
+			continue;
+
+		kern_db->val[i] += incr;
+		kern_db->ival[i] = ival[i];
+		/*
+		 * Convert to double before scaling: incr*1000 evaluated in
+		 * 32-bit unsigned arithmetic wraps once incr exceeds about
+		 * 4.29 million and would yield a bogus sample.
+		 */
+		sample = (double)incr * 1000 / interval;
+		if (interval >= scan_interval) {
+			kern_db->rate[i] += W*(sample-kern_db->rate[i]);
+		} else if (interval >= 1000) {
+			if (interval >= time_constant) {
+				kern_db->rate[i] = sample;
+			} else {
+				double w = W*(double)interval/scan_interval;
+				kern_db->rate[i] += w*(sample-kern_db->rate[i]);
+			}
+		}
+	}
+}
+
+/*
+ * Write the whole of *kern_db to fd.
+ *
+ * Short writes are continued and writes interrupted by a signal are
+ * retried; any other write error silently abandons the transfer.
+ */
+void send_db(int fd)
+{
+	const char *cur = (const char *)kern_db;
+	size_t left = sizeof(*kern_db);
+
+	while (left > 0) {
+		int n = write(fd, cur, left);
+
+		if (n < 0) {
+			if (errno != EINTR)
+				return;
+			continue;
+		}
+		cur += n;
+		left -= n;
+	}
+}
+
+
+
+#define T_DIFF(a,b) (((a).tv_sec-(b).tv_sec)*1000 + ((a).tv_usec-(b).tv_usec)/1000) /* a minus b for two struct timeval values, in milliseconds */
+
+
+/*
+ * Initialise dat from a raw kernel table: the raw counters are stored in
+ * dat->ival (unless ival already is dat->ival), each accumulated value
+ * starts out equal to its raw counter, and all rates are reset to zero.
+ */
+void pad_kern_table(struct rtacct_data *dat, __u32 *ival)
+{
+	const int slots = 256*4;
+	int idx;
+
+	if (ival != dat->ival)
+		memcpy(dat->ival, ival, sizeof(dat->ival));
+
+	for (idx = 0; idx < slots; idx++)
+		dat->val[idx] = ival[idx];
+
+	memset(dat->rate, 0, sizeof(dat->rate));
+}
+
+/*
+ * Daemon main loop; never returns.
+ *
+ * fd is the listening UNIX socket.  The loop refreshes kern_db through
+ * update_db() whenever at least scan_interval milliseconds have passed
+ * since the last refresh, and in between waits in poll() for clients.
+ * Each accepted client is served by a forked child that brings its copy
+ * of the database up to date and sends it with send_db(); at most 5
+ * children are outstanding, further connections are closed at once.
+ */
+void server_loop(int fd)
+{
+	struct timeval snaptime;
+	struct pollfd p;
+
+	p.fd = fd;
+	p.events = p.revents = POLLIN;
+
+	snprintf((char *)kern_db->signature, sizeof(kern_db->signature),
+		 "%d.%lu sampling_interval=%d time_const=%d",
+		 getpid(), (unsigned long)random(), scan_interval/1000, time_constant/1000);
+
+	pad_kern_table(kern_db, read_kern_table(kern_db->ival));
+	/*
+	 * The baseline was sampled just now, so the first interval starts
+	 * here.  Without this, snaptime was read uninitialised on the first
+	 * pass and the first update used an arbitrary interval.
+	 */
+	gettimeofday(&snaptime, NULL);
+
+	for (;;) {
+		int status;
+		int tdiff;
+		struct timeval now;
+
+		gettimeofday(&now, NULL);
+		tdiff = T_DIFF(now, snaptime);
+		if (tdiff >= scan_interval) {
+			update_db(tdiff);
+			snaptime = now;
+			tdiff = 0;
+		}
+		if (poll(&p, 1, tdiff + scan_interval) > 0
+		    && (p.revents&POLLIN)) {
+			int clnt = accept(fd, NULL, NULL);
+
+			if (clnt >= 0) {
+				pid_t pid;
+
+				if (children >= 5) {
+					close(clnt);
+				} else if ((pid = fork()) != 0) {
+					/* Parent (or fork failure): only count real children. */
+					if (pid > 0)
+						children++;
+					close(clnt);
+				} else {
+					/* Child: refresh its private copy, answer, exit. */
+					if (tdiff > 0)
+						update_db(tdiff);
+					send_db(clnt);
+					exit(0);
+				}
+			}
+		}
+		while (children && waitpid(-1, &status, WNOHANG) > 0)
+			children--;
+	}
+}
+
+/*
+ * Check who is at the other end of the connected UNIX socket fd.
+ *
+ * Returns 0 when the peer credentials (SO_PEERCRED) show the uid of the
+ * calling user or uid 0, and -1 otherwise, including when the
+ * credentials cannot be obtained.
+ */
+int verify_forging(int fd)
+{
+	struct ucred cred;
+	/* getsockopt() takes a socklen_t *, not an int *. */
+	socklen_t olen = sizeof(cred);
+
+	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, (void*)&cred, &olen) ||
+	    olen < sizeof(cred))
+		return -1;
+	if (cred.uid == getuid() || cred.uid == 0)
+		return 0;
+	return -1;
+}
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the one-line synopsis on stderr and exit with status -1. */
+static void usage(void)
+{
+	static const char synopsis[] =
+		"Usage: rtacct [ -h?vVzrnasd:t: ] [ ListOfRealms ]\n";
+
+	fputs(synopsis, stderr);
+	exit(-1);
+}
+
+/*
+ * rtacct entry point.
+ *
+ * Server mode (-d seconds): bind the abstract UNIX socket
+ * "rtacct<uid>", detach from the terminal and run server_loop().
+ *
+ * Client mode (default): optionally open the per-user history file,
+ * then take the current data from the user's own server, from the
+ * "rtacct0" server, or, failing both, straight from the kernel table.
+ * History whose signature names a different data source is ignored.
+ * Finally absolute or incremental values are printed for the realms
+ * listed on the command line (all realms when none are listed).
+ */
+int main(int argc, char *argv[])
+{
+	char hist_name[128];
+	struct sockaddr_un sun;
+	int ch;
+	int fd;
+
+	while ((ch = getopt(argc, argv, "h?vVzrM:nasd:t:")) != EOF) {
+		switch(ch) {
+		case 'z':
+			dump_zeros = 1;
+			break;
+		case 'r':
+			reset_history = 1;
+			break;
+		case 'a':
+			ignore_history = 1;
+			break;
+		case 's':
+			no_update = 1;
+			break;
+		case 'n':
+			no_output = 1;
+			break;
+		case 'd':
+			scan_interval = 1000*atoi(optarg);
+			break;
+		case 't':
+			if (sscanf(optarg, "%d", &time_constant) != 1 ||
+			    time_constant <= 0) {
+				fprintf(stderr, "rtacct: invalid time constant divisor\n");
+				exit(-1);
+			}
+			break;
+		case 'v':
+		case 'V':
+			printf("rtacct utility, iproute2-ss%s\n", SNAPSHOT);
+			exit(0);
+		case 'M':
+			/* Some secret undocumented option, nobody
+			 * is expected to ask about its sense. See?
+			 */
+			sscanf(optarg, "%lx", &magic_number);
+			break;
+		case 'h':
+		case '?':
+		default:
+			usage();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc) {
+		while (argc > 0) {
+			__u32 realm;
+			if (rtnl_rtrealm_a2n(&realm, argv[0])) {
+				fprintf(stderr, "Warning: realm \"%s\" does not exist.\n", argv[0]);
+				exit(-1);
+			}
+			rmap[realm>>5] |= (1<<(realm&0x1f));
+			argc--; argv++;
+		}
+	} else {
+		memset(rmap, ~0, sizeof(rmap));
+		/* Always suppress zeros. */
+		dump_zeros = 0;
+	}
+
+	/* Abstract-namespace address: leading NUL, then "rtacct<uid>". */
+	sun.sun_family = AF_UNIX;
+	sun.sun_path[0] = 0;
+	sprintf(sun.sun_path+1, "rtacct%d", getuid());
+
+	if (scan_interval > 0) {
+		if (time_constant == 0)
+			time_constant = 60;
+		time_constant *= 1000;
+		W = 1 - 1/exp(log(10)*(double)scan_interval/time_constant);
+		if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+			perror("rtacct: socket");
+			exit(-1);
+		}
+		if (bind(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) < 0) {
+			perror("rtacct: bind");
+			exit(-1);
+		}
+		if (listen(fd, 5) < 0) {
+			perror("rtacct: listen");
+			exit(-1);
+		}
+		if (fork())
+			exit(0);
+		chdir("/");
+		close(0); close(1); close(2); setsid();
+		signal(SIGPIPE, SIG_IGN);
+		signal(SIGCHLD, sigchild);
+		server_loop(fd);
+		exit(0);
+	}
+
+	if (getenv("RTACCT_HISTORY")) {
+		/* The environment value is data, never a format string. */
+		snprintf(hist_name, sizeof(hist_name), "%s", getenv("RTACCT_HISTORY"));
+	} else {
+		sprintf(hist_name, "/tmp/.rtacct.u%d", getuid());
+	}
+
+	if (reset_history)
+		unlink(hist_name);
+
+	if (!ignore_history || !no_update) {
+		struct stat stb;
+
+		fd = open(hist_name, O_RDWR|O_CREAT|O_NOFOLLOW, 0600);
+		if (fd < 0) {
+			perror("rtacct: open history file");
+			exit(-1);
+		}
+		if (flock(fd, LOCK_EX)) {
+			perror("rtacct: flock history file");
+			exit(-1);
+		}
+		if (fstat(fd, &stb) != 0) {
+			perror("rtacct: fstat history file");
+			exit(-1);
+		}
+		if (stb.st_nlink != 1 || stb.st_uid != getuid()) {
+			fprintf(stderr, "rtacct: something is so wrong with history file, that I prefer not to proceed.\n");
+			exit(-1);
+		}
+		/* Wrong size (e.g. freshly created): write one record so the mapping is backed. */
+		if (stb.st_size != sizeof(*hist_db))
+			write(fd, kern_db, sizeof(*hist_db));
+
+		hist_db = mmap(NULL, sizeof(*hist_db),
+			       PROT_READ|PROT_WRITE,
+			       no_update ? MAP_PRIVATE : MAP_SHARED,
+			       fd, 0);
+
+		if ((unsigned long)hist_db == ~0UL) {
+			perror("mmap");
+			exit(-1);
+		}
+
+		if (!ignore_history) {
+			FILE *tfp;
+			/* Stays -1, i.e. "unknown", if /proc/uptime is unavailable. */
+			long uptime = -1;
+
+			if ((tfp = fopen("/proc/uptime", "r")) != NULL) {
+				if (fscanf(tfp, "%ld", &uptime) != 1)
+					uptime = -1;
+				fclose(tfp);
+			}
+
+			if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) {
+				fprintf(stderr, "rtacct: history is aged out, resetting\n");
+				memset(hist_db, 0, sizeof(*hist_db));
+			}
+		}
+
+		close(fd);
+	}
+
+	/* Try our own server first, then the "rtacct0" one; only accept a verified peer. */
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) >= 0 &&
+	    (connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0
+	     || (strcpy(sun.sun_path+1, "rtacct0"),
+		 connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0))
+	    && verify_forging(fd) == 0) {
+		nread(fd, (char*)kern_db, sizeof(*kern_db));
+		if (hist_db && hist_db->signature[0] &&
+		    strcmp(kern_db->signature, hist_db->signature)) {
+			fprintf(stderr, "rtacct: history is stale, ignoring it.\n");
+			hist_db = NULL;
+		}
+		close(fd);
+	} else {
+		if (fd >= 0)
+			close(fd);
+
+		if (hist_db && hist_db->signature[0] &&
+		    strcmp(hist_db->signature, "kernel")) {
+			fprintf(stderr, "rtacct: history is stale, ignoring it.\n");
+			hist_db = NULL;
+		}
+
+		pad_kern_table(kern_db, read_kern_table(kern_db->ival));
+		strcpy(kern_db->signature, "kernel");
+	}
+
+	if (ignore_history || hist_db == NULL)
+		dump_abs_db(stdout);
+	else
+		dump_incr_db(stdout);
+
+	exit(0);
+}
diff --git a/misc/rtstat.c b/misc/rtstat.c
index e69de29b..feed6cf2 100644
--- a/misc/rtstat.c
+++ b/misc/rtstat.c
@@ -0,0 +1,172 @@
+/* rtstat.c: A program for route cache monitoring
+ *
+ * Copyright 2001 by Robert Olsson <robert.olsson@its.uu.se>
+ * Uppsala University, Sweden
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Additional credits:
+ * Martin Josefsson <gandalf@wlug.westbo.se> 010828 bug fix
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#define VERSION "0.33 010829" /* printed by usage() */
+
+extern char *optarg; /* getopt() state */
+extern int optind, opterr, optopt;
+
+FILE *fp; /* /proc/net/rt_cache_stat; opened once in main() and rewound between samples */
+unsigned rt_size, in_hit[2], in_slow_tot[2], in_slow_mc[2], /* totals filled by scan_line(i); main() uses slot 0 before and slot 1 after the sleep */
+ in_no_rt[2], in_brd[2], in_martian_dst[2], in_martian_src[2],
+ out_hit[2], out_slow_tot[2], out_slow_mc[2];
+
+
+/*
+ * Read (and summarize for SMP) the different stats vars.
+ *
+ * Reads fp from its current position.  Every line is expected to hold
+ * eleven hexadecimal fields: the cache size followed by ten counters.
+ * The counters of all lines are summed into slot i of the global
+ * arrays; rt_size keeps the value from the last line read.
+ *
+ * Reading stops at end of file or at the first line that does not
+ * parse completely.
+ */
+void scan_line(int i)
+{
+	unsigned temp[10];
+
+	in_hit[i] = 0;
+	in_slow_tot[i] = 0;
+	in_slow_mc[i] = 0;
+	in_no_rt[i] = 0;
+	in_brd[i] = 0;
+	in_martian_dst[i] = 0;
+	in_martian_src[i] = 0;
+	out_hit[i] = 0;
+	out_slow_tot[i] = 0;
+	out_slow_mc[i] = 0;
+
+	/*
+	 * Test the conversion count rather than feof(): a failed or partial
+	 * fscanf() would otherwise add stale or uninitialised temp[] values
+	 * and, when it makes no progress, spin forever.
+	 */
+	while (fscanf(fp, "%x %x %x %x %x %x %x %x %x %x %x\n",
+		      &rt_size,
+		      &temp[0], /* in_hit */
+		      &temp[1], /* in_slow_tot */
+		      &temp[2], /* in_slow_mc */
+		      &temp[3], /* in_no_rt */
+		      &temp[4], /* in_brd */
+		      &temp[5], /* in_martian_dst */
+		      &temp[6], /* in_martian_src */
+		      &temp[7], /* out_hit */
+		      &temp[8], /* out_slow_tot */
+		      &temp[9]  /* out_slow_mc */
+		      ) == 11) {
+		in_hit[i] += temp[0];
+		in_slow_tot[i] += temp[1];
+		in_slow_mc[i] += temp[2];
+		in_no_rt[i] += temp[3];
+		in_brd[i] += temp[4];
+		in_martian_dst[i] += temp[5];
+		in_martian_src[i] += temp[6];
+		out_hit[i] += temp[7];
+		out_slow_tot[i] += temp[8];
+		out_slow_mc[i] += temp[9];
+	}
+}
+
+/* Write the column header line to stdout. */
+void print_hdr_line(void)
+{
+	static const char header[] =
+		" size IN: hit tot mc no_rt bcast madst masrc OUT: hit tot mc\n";
+
+	fputs(header, stdout);
+}
+
+/*
+ * Print version, option summary and column legend, then terminate the
+ * process with exit_code.  The text goes to stderr, except the column
+ * header, which print_hdr_line() writes to stdout.  Never returns.
+ */
+int usage(int exit_code)
+{
+	static const char *const options[] = {
+		" -help\n",
+		" -i interval\n",
+		" -s subject [0-2]\n",
+		"\n",
+	};
+	static const char *const legend[] = {
+		"\n",
+		"size == route cache size\n",
+		"hit == IN: total number of cache hits per sec\n",
+		"tot == IN: total number of cache misses per sec\n",
+		"mc == IN: mulicast cache misses per sec\n",
+		"no_rt == IN: route table misses per sec\n",
+		"bcast == IN: broadcast cache misses per sec\n",
+		"madst == IN: dst martians per sec\n",
+		"masrc == IN: src martians per sec\n",
+		"hit == OUT: total number of cache hits per sec\n",
+		"tot == OUT: total number of cache misses per sec\n",
+		"mc == OUT: mulicast cache misses per sec\n",
+	};
+	unsigned k;
+
+	fprintf(stderr, "rtstat Version %s\n", VERSION);
+	for (k = 0; k < sizeof(options)/sizeof(options[0]); k++)
+		fputs(options[k], stderr);
+
+	print_hdr_line();
+
+	for (k = 0; k < sizeof(legend)/sizeof(legend[0]); k++)
+		fputs(legend[k], stderr);
+
+	exit(exit_code);
+}
+
+/*
+ * rtstat entry point: print per-second route cache statistics forever.
+ *
+ * -i interval  seconds between two samples (default 2, minimum 1)
+ * -s subject   0: no header, 1: header once, 2: header once at start
+ *              and again before every 20th output line (default)
+ *
+ * Each round samples /proc/net/rt_cache_stat twice, interval seconds
+ * apart, and prints the counter differences divided by the interval.
+ */
+int main(int argc, char **argv)
+{
+	int c, i = 1, interval = 2, hdr = 2;
+
+	while ((c = getopt(argc, argv, "h?s:i:")) != EOF) {
+		switch (c) {
+		case '?':
+		case 'h':
+			usage(0);
+			break;
+		case 'i':
+			/* %d matches the int target; %u with an int * is a type mismatch. */
+			sscanf(optarg, "%d", &interval);
+			break;
+		case 's':
+			sscanf(optarg, "%d", &hdr);
+			break;
+		default:
+			usage(1);
+			break;
+		}
+	}
+
+	if (interval < 1)
+		interval = 1;
+
+	fp = fopen("/proc/net/rt_cache_stat", "r");
+	if (fp == NULL) {
+		perror("fopen");
+		exit(-1);
+	}
+
+	if (hdr > 0)
+		print_hdr_line();
+
+	for (;; i++) {
+		if (hdr > 1 && (i % 20) == 0)
+			print_hdr_line();
+
+		scan_line(0);
+		sleep(interval);
+		rewind(fp);
+		scan_line(1);
+		rewind(fp);
+
+		printf("%5u %9u %7u %5u %5u %5u %5u %5u %9u %7u %6u\n",
+		       rt_size,
+		       (in_hit[1] - in_hit[0])/interval,
+		       (in_slow_tot[1] - in_slow_tot[0])/interval,
+		       (in_slow_mc[1] - in_slow_mc[0])/interval,
+		       (in_no_rt[1] - in_no_rt[0])/interval,
+		       (in_brd[1] - in_brd[0])/interval,
+		       (in_martian_dst[1] - in_martian_dst[0])/interval,
+		       (in_martian_src[1] - in_martian_src[0])/interval,
+
+		       (out_hit[1] - out_hit[0])/interval,
+		       (out_slow_tot[1] - out_slow_tot[0])/interval,
+		       (out_slow_mc[1] - out_slow_mc[0])/interval
+		       );
+	}
+	return 1;
+}
+
+/*
+ * Compile:
+ gcc -g -O2 -Wall -o rtstat rtstat.c
+*/
+
+
+
diff --git a/misc/ss.c b/misc/ss.c
index e69de29b..3918bdef 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -0,0 +1,2672 @@
+/*
+ * ss.c "sockstat", socket statistics
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <string.h>
+#include <errno.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <resolv.h>
+#include <dirent.h>
+#include <fnmatch.h>
+
+#include "utils.h"
+#include "rt_names.h"
+#include "ll_map.h"
+#include "libnetlink.h"
+#include "tcp_diag.h"
+#include "SNAPSHOT.h"
+
+#include <linux/tcp.h>
+
+int resolve_hosts = 0;
+int resolve_services = 1; /* nonzero: resolve_service() maps port numbers to names */
+int preferred_family = AF_UNSPEC;
+int show_options = 0;
+int show_details = 0;
+int show_users = 0;
+int show_mem = 0;
+int show_tcpinfo = 0;
+
+int netid_width;
+int state_width;
+int addrp_width;
+int addr_width; /* minimum width of the address column in formatted_print() */
+int serv_width; /* width of the service column in formatted_print() */
+int screen_width;
+
+static const char *TCP_PROTO = "tcp"; /* protocol names; the code compares these by pointer, not by content */
+static const char *UDP_PROTO = "udp";
+static const char *RAW_PROTO = "raw";
+static const char *dg_proto = NULL; /* protocol used by the service resolver; compared against the *_PROTO pointers */
+
+enum /* socket table identifiers; each is a bit position in filter.dbs */
+{
+ TCP_DB,
+ UDP_DB,
+ RAW_DB,
+ UNIX_DG_DB,
+ UNIX_ST_DB,
+ PACKET_DG_DB,
+ PACKET_R_DB,
+ NETLINK_DB,
+ MAX_DB /* number of tables, not a table itself */
+};
+
+#define PACKET_DBM ((1<<PACKET_DG_DB)|(1<<PACKET_R_DB)) /* both packet socket tables */
+#define UNIX_DBM ((1<<UNIX_DG_DB)|(1<<UNIX_ST_DB)) /* both UNIX socket tables */
+#define ALL_DB ((1<<MAX_DB)-1) /* every table */
+
+enum { /* socket states; each is a bit position in filter.states, listed in the same order as sstate_name[] and sstate_namel[] */
+ SS_UNKNOWN,
+ SS_ESTABLISHED,
+ SS_SYN_SENT,
+ SS_SYN_RECV,
+ SS_FIN_WAIT1,
+ SS_FIN_WAIT2,
+ SS_TIME_WAIT,
+ SS_CLOSE,
+ SS_CLOSE_WAIT,
+ SS_LAST_ACK,
+ SS_LISTEN,
+ SS_CLOSING,
+ SS_MAX /* number of states, not a state itself */
+};
+
+#define SS_ALL ((1<<SS_MAX)-1) /* mask with every state bit set */
+
+#include "ssfilter.h"
+
+struct filter /* selection of what to show */
+{
+ int dbs; /* bitmask of (1 << *_DB) */
+ int states; /* bitmask of (1 << SS_*) */
+ int families; /* bitmask of (1 << AF_*) */
+ struct ssfilter *f; /* filter expression tree — presumably NULL when none was given; confirm against the parser */
+};
+
+/*
+ * Default selection: the TCP table only, every state except LISTEN,
+ * CLOSE, TIME-WAIT and SYN-RECV, IPv4 and IPv6.  The expression
+ * pointer f is left NULL.
+ *
+ * Uses C99 designated initialisers; the "field: value" form is an
+ * obsolete GNU extension.
+ */
+struct filter default_filter = {
+	.dbs = (1<<TCP_DB),
+	.states = SS_ALL & ~((1<<SS_LISTEN)|(1<<SS_CLOSE)|(1<<SS_TIME_WAIT)|(1<<SS_SYN_RECV)),
+	.families = (1<<AF_INET)|(1<<AF_INET6),
+};
+
+struct filter current_filter;
+
+/*
+ * Open a proc-style file read-only.
+ *
+ * env  - name of an environment variable; when set, its value is used
+ *        verbatim as the path to open.
+ * name - path relative to the proc root, used when env is not set.  The
+ *        root is $PROC_ROOT if set, "/proc" otherwise.
+ *
+ * Returns the descriptor from open(2), or -1 on failure.
+ */
+int generic_proc_open(char *env, char *name)
+{
+	char store[128];
+	char *p = getenv(env);
+
+	if (!p) {
+		p = getenv("PROC_ROOT") ? : "/proc";
+		snprintf(store, sizeof(store), "%s/%s", p, name);
+		p = store;
+	}
+	/*
+	 * Open p, not store: store is only filled in on the fallback path,
+	 * so opening it when the override variable is set would read an
+	 * uninitialised buffer and ignore the override.
+	 */
+	return open(p, O_RDONLY);
+}
+
+int net_tcp_open(void)
+{
+ return generic_proc_open("PROC_NET_TCP", "net/tcp");
+}
+
+int net_tcp6_open(void)
+{
+ return generic_proc_open("PROC_NET_TCP6", "net/tcp6");
+}
+
+int net_udp_open(void)
+{
+ return generic_proc_open("PROC_NET_UDP", "net/udp");
+}
+
+int net_udp6_open(void)
+{
+ return generic_proc_open("PROC_NET_UDP6", "net/udp6");
+}
+
+int net_raw_open(void)
+{
+ return generic_proc_open("PROC_NET_RAW", "net/raw");
+}
+
+int net_raw6_open(void)
+{
+ return generic_proc_open("PROC_NET_RAW6", "net/raw6");
+}
+
+int net_unix_open(void)
+{
+ return generic_proc_open("PROC_NET_UNIX", "net/unix");
+}
+
+int net_packet_open(void)
+{
+ return generic_proc_open("PROC_NET_PACKET", "net/packet");
+}
+
+int net_netlink_open(void)
+{
+ return generic_proc_open("PROC_NET_NETLINK", "net/netlink");
+}
+
+int slabinfo_open(void)
+{
+ return generic_proc_open("PROC_SLABINFO", "slabinfo");
+}
+
+int net_sockstat_open(void)
+{
+ return generic_proc_open("PROC_NET_SOCKSTAT", "net/sockstat");
+}
+
+int net_sockstat6_open(void)
+{
+ return generic_proc_open("PROC_NET_SOCKSTAT6", "net/sockstat6");
+}
+
+int net_snmp_open(void)
+{
+ return generic_proc_open("PROC_NET_SNMP", "net/snmp");
+}
+
+int net_netstat_open(void)
+{
+ return generic_proc_open("PROC_NET_NETSTAT", "net/netstat");
+}
+
+int ephemeral_ports_open(void)
+{
+ return generic_proc_open("PROC_IP_LOCAL_PORT_RANGE", "sys/net/ipv4/ip_local_port_range");
+}
+
+/*
+ * Find the processes that hold socket inode ino open.
+ *
+ * Walks <proc root>/<pid>/fd/ for every numeric directory below the
+ * proc root ($PROC_ROOT or /proc) and looks for descriptors whose link
+ * target is "socket:[ino]".  For every hit the text
+ *	("<process name>",<pid>,<fd>),
+ * is appended to buf (capacity buflen); the trailing comma of the last
+ * entry is removed at the end.  buf is not written when nothing is
+ * found.
+ *
+ * Returns the number of entries written; 0 for ino == 0 or when the
+ * proc root cannot be opened.
+ */
+int find_users(int ino, char *buf, int buflen)
+{
+	char pattern[64];
+	int pattern_len;
+	char *ptr = buf;
+	char name[1024];
+	DIR *dir;
+	struct dirent *d;
+	int cnt = 0;
+	int nameoff;
+
+	if (!ino)
+		return 0;
+
+	sprintf(pattern, "socket:[%d]", ino);
+	pattern_len = strlen(pattern);
+
+	/* Only half of name is used for the root; the rest is room for "<pid>/fd/<fd>". */
+	strncpy(name, getenv("PROC_ROOT") ? : "/proc/", sizeof(name)/2);
+	name[sizeof(name)/2] = 0;
+	if (strlen(name) == 0 ||
+	    name[strlen(name)-1] != '/')
+		strcat(name, "/");
+	nameoff = strlen(name);
+	if ((dir = opendir(name)) == NULL)
+		return 0;
+
+	while ((d = readdir(dir)) != NULL) {
+		DIR *dir1;
+		struct dirent *d1;
+		int pid;
+		int pos;
+		char crap;
+		char process[16];
+
+		/* Exactly one conversion means the name is purely numeric. */
+		if (sscanf(d->d_name, "%d%c", &pid, &crap) != 1)
+			continue;
+
+		sprintf(name+nameoff, "%d/fd/", pid);
+		pos = strlen(name);
+		if ((dir1 = opendir(name)) == NULL)
+			continue;
+
+		process[0] = 0;
+
+		while ((d1 = readdir(dir1)) != NULL) {
+			int fd, n;
+			char lnk[64];
+
+			if (sscanf(d1->d_name, "%d%c", &fd, &crap) != 1)
+				continue;
+
+			sprintf(name+pos, "%d", fd);
+			n = readlink(name, lnk, sizeof(lnk)-1);
+			if (n != pattern_len ||
+			    memcmp(lnk, pattern, n))
+				continue;
+
+			/* Output buffer exhausted: give up on this process. */
+			if (ptr-buf >= buflen-1)
+				break;
+
+			/* Look the process name up once per process, on first use. */
+			if (process[0] == 0) {
+				char tmp[1024];
+				FILE *fp;
+
+				snprintf(tmp, sizeof(tmp), "%s/%d/stat",
+					 getenv("PROC_ROOT") ? : "/proc", pid);
+				if ((fp = fopen(tmp, "r")) != NULL) {
+					/* Width 15 keeps the name inside process[16]. */
+					fscanf(fp, "%*d (%15[^)])", process);
+					fclose(fp);
+				}
+			}
+
+			snprintf(ptr, buflen-(ptr-buf), "(\"%s\",%d,%d),", process, pid, fd);
+			ptr += strlen(ptr);
+			cnt++;
+		}
+		closedir(dir1);
+	}
+	closedir(dir);
+	if (ptr != buf)
+		ptr[-1] = 0;
+	return cnt;
+}
+
+
+/* Get stats from slab */
+
+struct slabstat /* counts parsed from slabinfo; get_slabstat() treats this as an int array, so field order must match slabstat_ids[] */
+{
+ int socks;
+ int tcp_ports;
+ int tcp_tws;
+ int tcp_syns;
+ int skbs;
+};
+
+struct slabstat slabstat;
+
+const char *slabstat_ids[] = /* slab cache names; entry i fills the i-th int of struct slabstat */
+{
+ "sock",
+ "tcp_bind_bucket",
+ "tcp_tw_bucket",
+ "tcp_open_request",
+ "skbuff_head_cache",
+};
+
+/*
+ * Fill *s from the slabinfo file.
+ *
+ * *s is zeroed first.  The first line of the file is skipped; every
+ * following line is compared against the names in slabstat_ids[] and,
+ * on a prefix match, the number after the first word is stored in the
+ * corresponding field.  Reading stops once as many matches have been
+ * seen as *s has fields.
+ *
+ * Returns 0, or -1 if the file cannot be opened.
+ */
+int get_slabstat(struct slabstat *s)
+{
+	const int nids = sizeof(slabstat_ids)/sizeof(slabstat_ids[0]);
+	int *field = (int *)s;
+	int remaining = sizeof(*s)/sizeof(int);
+	char line[256];
+	FILE *fp;
+
+	memset(s, 0, sizeof(*s));
+
+	fp = fdopen(slabinfo_open(), "r");
+	if (fp == NULL)
+		return -1;
+
+	/* Skip the first line. */
+	fgets(line, sizeof(line), fp);
+
+	while (remaining > 0 && fgets(line, sizeof(line), fp) != NULL) {
+		int k;
+
+		for (k = 0; k < nids; k++) {
+			const char *id = slabstat_ids[k];
+
+			if (memcmp(line, id, strlen(id)) != 0)
+				continue;
+			sscanf(line, "%*s%d", field + k);
+			remaining--;
+			break;
+		}
+	}
+
+	fclose(fp);
+	return 0;
+}
+
+
+
+
+char *sstate_name[] = { /* short upper-case state names, listed in SS_* enum order */
+ "UNKNOWN",
+ "ESTAB",
+ "SYN-SENT",
+ "SYN-RECV",
+ "FIN-WAIT-1",
+ "FIN-WAIT-2",
+ "TIME-WAIT",
+ "UNCONN", /* position of SS_CLOSE */
+ "CLOSE-WAIT",
+ "LAST-ACK",
+ "LISTEN",
+ "CLOSING",
+};
+
+char *sstate_namel[] = { /* long lower-case state names, same order */
+ "UNKNOWN",
+ "established",
+ "syn-sent",
+ "syn-recv",
+ "fin-wait-1",
+ "fin-wait-2",
+ "time-wait",
+ "unconnected", /* position of SS_CLOSE */
+ "close-wait",
+ "last-ack",
+ "listening",
+ "closing",
+};
+
+struct tcpstat /* one socket record as examined by run_ssfilter(); despite the name it also carries AF_UNIX, AF_PACKET and AF_NETLINK sockets */
+{
+ inet_prefix local; /* local address; for AF_UNIX the data area holds a char * to the name */
+ inet_prefix remote; /* peer address, same conventions */
+ int lport; /* local port, compared against filter ports */
+ int rport; /* remote port, compared against filter ports */
+ int state;
+ int rq, wq;
+ int timer;
+ int timeout;
+ int retrs;
+ int ino;
+ int probes;
+ int uid;
+ int refcnt;
+ unsigned long long sk;
+ int rto, ato, qack, cwnd, ssthresh;
+};
+
+char *tmr_name[] = { /* timer kind names — presumably indexed by tcpstat.timer; confirm at the point of use */
+ "off",
+ "on",
+ "keepalive",
+ "timewait",
+ "persist",
+ "unknown"
+};
+
+/*
+ * Format a duration given in milliseconds; negative values count as 0.
+ *
+ * Only the most significant parts are shown: minutes hide the
+ * milliseconds, more than 9 minutes also hide the seconds, and more
+ * than 9 seconds hide the milliseconds.  Examples: "2min5sec",
+ * "1.500ms", "12sec", "250ms"; a zero duration gives "".
+ *
+ * Returns a pointer to a static buffer that the next call overwrites.
+ */
+char *print_ms_timer(int timeout)
+{
+	static char buf[64];
+	int total = timeout < 0 ? 0 : timeout;
+	int whole = total / 1000;
+	int minutes = whole / 60;
+	int secs = whole % 60;
+	int msecs = total % 1000;
+	char *out = buf;
+
+	*out = 0;
+
+	if (minutes) {
+		snprintf(out, sizeof(buf)-16, "%dmin", minutes);
+		out += strlen(out);
+		msecs = 0;
+		if (minutes > 9)
+			secs = 0;
+	}
+	if (secs) {
+		if (secs > 9)
+			msecs = 0;
+		/* "N." when milliseconds follow, "Nsec" otherwise. */
+		sprintf(out, "%d%s", secs, msecs ? "." : "sec");
+		out += strlen(out);
+	}
+	if (msecs)
+		sprintf(out, "%03dms", msecs);
+
+	return buf;
+}
+
+/*
+ * Format a duration given in clock ticks: converts to milliseconds using
+ * get_hz(), rounding up, and hands the result to print_ms_timer().
+ */
+char *print_hz_timer(int timeout)
+{
+	int hz = get_hz();
+	int ms = ((timeout*1000) + hz-1)/hz;
+
+	return print_ms_timer(ms);
+}
+
+struct scache /* one port-to-name mapping; used for the rpcinfo list and for the resolve_service() cache */
+{
+ struct scache *next; /* singly linked list */
+ int port;
+ char *name; /* heap-allocated name; NULL in the resolve_service() cache means "known to have no name" */
+ const char *proto; /* TCP_PROTO, UDP_PROTO or NULL; compared by pointer */
+};
+
+struct scache *rlist; /* RPC services collected by init_service_resolver() */
+
+/*
+ * Collect the locally registered RPC services.
+ *
+ * Runs "/usr/sbin/rpcinfo -p", skips its first output line and turns
+ * every following line that parses as "program version protocol port
+ * name" into an scache entry named "rpc.<name>", pushed onto rlist.
+ * Entries whose protocol is neither "tcp" nor "udp" get a NULL proto.
+ * Failure to start the command leaves rlist unchanged.
+ */
+void init_service_resolver(void)
+{
+	char buf[128];
+	FILE *fp = popen("/usr/sbin/rpcinfo -p 2>/dev/null", "r");
+
+	if (!fp)
+		return;
+
+	/* Skip the first line. */
+	fgets(buf, sizeof(buf), fp);
+	while (fgets(buf, sizeof(buf), fp) != NULL) {
+		unsigned int progn, port;
+		char proto[128], prog[128];
+		struct scache *c;
+
+		/* Field widths keep both strings inside their buffers (prog is filled from offset 4). */
+		if (sscanf(buf, "%u %*d %127s %u %123s", &progn, proto,
+			   &port, prog+4) != 4)
+			continue;
+
+		c = malloc(sizeof(*c));
+		if (!c)
+			continue;
+
+		c->port = port;
+		memcpy(prog, "rpc.", 4);
+		c->name = strdup(prog);
+		if (strcmp(proto, TCP_PROTO) == 0)
+			c->proto = TCP_PROTO;
+		else if (strcmp(proto, UDP_PROTO) == 0)
+			c->proto = UDP_PROTO;
+		else
+			c->proto = NULL;
+		c->next = rlist;
+		rlist = c;
+	}
+	/* Close the pipe and reap the rpcinfo child; this was missing before. */
+	pclose(fp);
+}
+
+/*
+ * Uncached lookup of a service name for port under the current dg_proto.
+ *
+ * The RPC list built by init_service_resolver() is searched first.  The
+ * services database is consulted only for ports below 1024 or in the
+ * range 5000..32767.
+ *
+ * Returns the name, or NULL when there is none.
+ */
+const char *__resolve_service(int port)
+{
+	static int notfirst;
+	struct scache *c = rlist;
+	struct servent *se;
+
+	while (c) {
+		if (c->port == port && c->proto == dg_proto)
+			return c->name;
+		c = c->next;
+	}
+
+	/*
+	 * Ports 1024..4999 and 32768 upwards are deliberately not looked
+	 * up: these are ephemeral ranges, and the names /etc/services
+	 * assigns there ("socks", "cfinger", ...) only mislead in a dump.
+	 */
+	if (port >= 32768 || (port >= 1024 && port <= 4999))
+		return NULL;
+
+	if (!notfirst) {
+		/* Keep the services database open across lookups. */
+		setservent(1);
+		notfirst = 1;
+	}
+	se = getservbyport(htons(port), dg_proto);
+
+	return se ? se->s_name : NULL;
+}
+
+
+const char *resolve_service(int port) /* printable form of a port (or raw protocol number): a name when resolvable, digits otherwise; the result may point into a static buffer */
+{
+ static char buf[128];
+ static struct scache cache[256]; /* hash buckets; the head entry of each bucket doubles as a one-slot negative cache */
+
+ if (port == 0) { /* port 0 is printed as "*" */
+ buf[0] = '*';
+ buf[1] = 0;
+ return buf;
+ }
+
+ if (resolve_services) {
+ if (dg_proto == RAW_PROTO) { /* for raw sockets the number is an IP protocol, not a port */
+ return inet_proto_n2a(port, buf, sizeof(buf));
+ } else {
+ struct scache *c;
+ const char *res;
+ int hash = (port^(((unsigned long)dg_proto)>>2))&255; /* mixes the port with the protocol pointer value */
+
+ for (c = &cache[hash]; c; c = c->next) {
+ if (c->port == port &&
+ c->proto == dg_proto) {
+ if (c->name)
+ return c->name;
+ goto do_numeric; /* cached entry without a name: known to be unresolvable */
+ }
+ }
+
+ if ((res = __resolve_service(port)) != NULL) {
+ if ((c = malloc(sizeof(*c))) == NULL) /* name found: it gets a heap node of its own */
+ goto do_numeric;
+ } else {
+ c = &cache[hash]; /* no name: record the miss in the bucket head, replacing any earlier miss */
+ if (c->name)
+ free(c->name);
+ }
+ c->port = port;
+ c->name = NULL;
+ c->proto = dg_proto;
+ if (res) {
+ c->name = strdup(res);
+ c->next = cache[hash].next; /* link the new node right behind the bucket head */
+ cache[hash].next = c;
+ }
+ if (c->name)
+ return c->name;
+ }
+ }
+
+ do_numeric:
+ sprintf(buf, "%u", port);
+ return buf;
+}
+
+/*
+ * Print "address:service " to stdout.
+ *
+ * An AF_INET address whose first data word is zero is printed as "*".
+ * The address is right-aligned in addr_width columns; an address of
+ * another family that is longer than that gets a field widened in
+ * steps of 4 columns.  The service, from resolve_service(), is
+ * left-aligned in serv_width columns.
+ */
+void formatted_print(inet_prefix *a, int port)
+{
+	char buf[1024];
+	const char *ap = buf;
+	int width = addr_width;
+
+	if (a->family != AF_INET) {
+		int len;
+
+		ap = format_host(a->family, 16, a->data, buf, sizeof(buf));
+		len = strlen(ap);
+		if (len > addr_width)
+			width = addr_width + ((len-addr_width+3)/4)*4;
+	} else if (a->data[0] != 0) {
+		ap = format_host(AF_INET, 4, a->data, buf, sizeof(buf));
+	} else {
+		buf[0] = '*';
+		buf[1] = 0;
+	}
+
+	printf("%*s:%-*s ", width, ap, serv_width, resolve_service(port));
+}
+
+struct aafilter /* address/port operand of a filter condition; run_ssfilter() reads it from ssfilter.pred */
+{
+ inet_prefix addr; /* address or prefix to match; bitlen 0 means no address check */
+ int port; /* port to match, or -1 for any port */
+ struct aafilter *next; /* further alternatives tried by the address match */
+};
+
+/*
+ * Compare address a with prefix p over plen bits, with special handling
+ * of IPv4-mapped IPv6 addresses.
+ *
+ * Follows the return convention of inet_addr_match(): the callers in
+ * run_ssfilter() treat 0 as "matches" and non-zero as "does not match".
+ *
+ * An IPv4-mapped socket address (::ffff:a.b.c.d) is compared a second
+ * time against a pure IPv4 rule using its embedded IPv4 part, whereas
+ * an IPv4-mapped rule selects only IPv4-mapped sockets.
+ */
+int inet2_addr_match(inet_prefix *a, inet_prefix *p, int plen)
+{
+	int v4_rule_v6_sock;
+	int v4_mapped;
+
+	if (!inet_addr_match(a, p, plen))
+		return 0;
+
+	v4_rule_v6_sock = (p->family == AF_INET && a->family == AF_INET6);
+	if (!v4_rule_v6_sock)
+		return 1;
+
+	v4_mapped = (a->data[0] == 0 && a->data[1] == 0 &&
+		     a->data[2] == htonl(0xffff));
+	if (v4_mapped) {
+		inet_prefix embedded = *a;
+
+		/* Move the embedded IPv4 address to the front and retry. */
+		embedded.data[0] = a->data[3];
+		return inet_addr_match(&embedded, p, plen);
+	}
+	return 1;
+}
+
+/*
+ * Match a UNIX socket name against a glob pattern.
+ *
+ * For AF_UNIX entries the data area of an inet_prefix holds a char *
+ * rather than address bytes: the socket name in a, the pattern in p.
+ * A NULL pattern matches everything; a NULL name is treated as "".
+ *
+ * Returns 1 on a match and 0 otherwise.
+ */
+int unix_match(inet_prefix *a, inet_prefix *p)
+{
+	char *sockname;
+	char *glob;
+
+	memcpy(&glob, p->data, sizeof(glob));
+	memcpy(&sockname, a->data, sizeof(sockname));
+
+	if (glob == NULL)
+		return 1;
+
+	return fnmatch(glob, sockname ? sockname : "", 0) == 0;
+}
+
+/*
+ * Evaluate filter expression f against socket record s.
+ *
+ * Returns non-zero when the socket is selected, 0 when it is not.  An
+ * unknown node type aborts the program.
+ *
+ * Node types:
+ *   SSF_S_AUTO            the local address/port looks auto-assigned
+ *   SSF_DCOND, SSF_SCOND  remote / local address and port condition
+ *   SSF_D_GE .. SSF_S_LE  remote / local port comparisons
+ *   SSF_AND, SSF_OR, SSF_NOT  combinations, evaluated recursively
+ */
+int run_ssfilter(struct ssfilter *f, struct tcpstat *s)
+{
+	switch (f->type) {
+	case SSF_S_AUTO:
+	{
+		/* Ephemeral port range, read once from ip_local_port_range. */
+		static int low, high=65535;
+
+		if (s->local.family == AF_UNIX) {
+			char *p;
+
+			/* Unnamed, or a name of the form '@' plus five hex digits. */
+			memcpy(&p, s->local.data, sizeof(p));
+			return p == NULL || (p[0] == '@' && strlen(p) == 6 &&
+					     strspn(p+1, "0123456789abcdef") == 5);
+		}
+		if (s->local.family == AF_PACKET)
+			/*
+			 * Test the first data word.  The array itself never
+			 * compares equal to 0, so "data == 0" made this
+			 * condition always false.
+			 */
+			return s->lport == 0 && s->local.data[0] == 0;
+		if (s->local.family == AF_NETLINK)
+			return s->lport < 0;
+
+		if (!low) {
+			FILE *fp = fdopen(ephemeral_ports_open(), "r");
+
+			if (fp) {
+				fscanf(fp, "%d%d", &low, &high);
+				fclose(fp);
+			}
+		}
+		return s->lport >= low && s->lport <= high;
+	}
+	case SSF_DCOND:
+	{
+		struct aafilter *a = (void*)f->pred;
+
+		if (a->addr.family == AF_UNIX)
+			return unix_match(&s->remote, &a->addr);
+		if (a->port != -1 && a->port != s->rport)
+			return 0;
+		if (a->addr.bitlen) {
+			/* Any alternative in the list may match (0 means match). */
+			do {
+				if (!inet2_addr_match(&s->remote, &a->addr, a->addr.bitlen))
+					return 1;
+			} while ((a = a->next) != NULL);
+			return 0;
+		}
+		return 1;
+	}
+	case SSF_SCOND:
+	{
+		struct aafilter *a = (void*)f->pred;
+
+		if (a->addr.family == AF_UNIX)
+			return unix_match(&s->local, &a->addr);
+		if (a->port != -1 && a->port != s->lport)
+			return 0;
+		if (a->addr.bitlen) {
+			do {
+				if (!inet2_addr_match(&s->local, &a->addr, a->addr.bitlen))
+					return 1;
+			} while ((a = a->next) != NULL);
+			return 0;
+		}
+		return 1;
+	}
+	case SSF_D_GE:
+	{
+		struct aafilter *a = (void*)f->pred;
+		return s->rport >= a->port;
+	}
+	case SSF_D_LE:
+	{
+		struct aafilter *a = (void*)f->pred;
+		return s->rport <= a->port;
+	}
+	case SSF_S_GE:
+	{
+		struct aafilter *a = (void*)f->pred;
+		return s->lport >= a->port;
+	}
+	case SSF_S_LE:
+	{
+		struct aafilter *a = (void*)f->pred;
+		return s->lport <= a->port;
+	}
+
+	/* Yup. It is recursion. Sorry. */
+	case SSF_AND:
+		return run_ssfilter(f->pred, s) && run_ssfilter(f->post, s);
+	case SSF_OR:
+		return run_ssfilter(f->pred, s) || run_ssfilter(f->post, s);
+	case SSF_NOT:
+		return !run_ssfilter(f->pred, s);
+	default:
+		abort();
+	}
+}
+
+/*
+ * Relocate external jumps by reloc.
+ *
+ * a points to len bytes of filter bytecode.  The code is walked one
+ * instruction at a time, advancing by each instruction's "yes" field;
+ * an instruction whose "no" field equals the remaining length plus 4
+ * has reloc added to that field.  Aborts if the instruction lengths do
+ * not add up to exactly len.
+ */
+void ssfilter_patch(char *a, int len, int reloc)
+{
+	char *cur = a;
+	int left = len;
+
+	while (left > 0) {
+		struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)cur;
+		int step = op->yes;
+
+		if (op->no == left+4)
+			op->no += reloc;
+		cur += step;
+		left -= step;
+	}
+	if (left < 0)
+		abort();
+}
+
+/*
+ * Emit a port comparison: one op initialised as { code, 8, 12 } followed by
+ * a second slot whose last member carries the port.  Stores the malloc'ed
+ * buffer in *bytecode and returns its length (8).  Aborts on allocation
+ * failure.
+ */
+static int ssfilter_port_op(int code, int port, char **bytecode)
+{
+	struct tcpdiag_bc_op *ops;
+
+	if (!(ops = malloc(8))) abort();
+	ops[0] = (struct tcpdiag_bc_op){ code, 8, 12 };
+	ops[1] = (struct tcpdiag_bc_op){ 0, 0, port };
+	*bytecode = (char *)ops;
+	return 8;
+}
+
+/*
+ * Compile filter tree f into TCPDIAG bytecode.
+ *
+ * On return *bytecode points to a malloc'ed buffer owned by the caller and
+ * the return value is the number of bytes written to it.  Sub-expressions
+ * are compiled recursively and their buffers are freed after being copied
+ * into the combined buffer.  Any allocation failure or unknown node type
+ * calls abort().
+ */
+int ssfilter_bytecompile(struct ssfilter *f, char **bytecode)
+{
+	switch (f->type) {
+	case SSF_S_AUTO:
+	{
+		if (!(*bytecode=malloc(4))) abort();
+		((struct tcpdiag_bc_op*)*bytecode)[0] = (struct tcpdiag_bc_op){ TCPDIAG_BC_AUTO, 4, 8 };
+		/* Only 4 bytes were allocated and written; reporting 8 made
+		 * the AND/OR/NOT cases copy past the end of the buffer. */
+		return 4;
+	}
+	case SSF_DCOND:
+	case SSF_SCOND:
+	{
+		struct aafilter *a = (void*)f->pred;
+		struct aafilter *b;
+		char *ptr;
+		int code = (f->type == SSF_DCOND ? TCPDIAG_BC_D_COND : TCPDIAG_BC_S_COND);
+		int len = 0;
+
+		/* First pass: size the buffer.  Every list entry is sized by
+		 * its own family; entries are separated by a 4-byte jump. */
+		for (b=a; b; b=b->next) {
+			len += 4 + sizeof(struct tcpdiag_hostcond);
+			if (b->addr.family == AF_INET6)
+				len += 16;
+			else
+				len += 4;
+			if (b->next)
+				len += 4;
+		}
+		if (!(ptr = malloc(len))) abort();
+		*bytecode = ptr;
+		/* Second pass: emit one host condition per list entry, taking
+		 * family, port, prefix and address from that entry (b) rather
+		 * than repeating the list head. */
+		for (b=a; b; b=b->next) {
+			struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op *)ptr;
+			int alen = (b->addr.family == AF_INET6 ? 16 : 4);
+			int oplen = alen + 4 + sizeof(struct tcpdiag_hostcond);
+			struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(ptr+4);
+
+			*op = (struct tcpdiag_bc_op){ code, oplen, oplen+4 };
+			cond->family = b->addr.family;
+			cond->port = b->port;
+			cond->prefix_len = b->addr.bitlen;
+			memcpy(cond->addr, b->addr.data, alen);
+			ptr += oplen;
+			if (b->next) {
+				/* Jump whose offset reaches the end of this
+				 * whole condition block. */
+				op = (struct tcpdiag_bc_op *)ptr;
+				*op = (struct tcpdiag_bc_op){ TCPDIAG_BC_JMP, 4, len - (ptr-*bytecode)};
+				ptr += 4;
+			}
+		}
+		return ptr - *bytecode;
+	}
+	case SSF_D_GE:
+	{
+		struct aafilter *x = (void*)f->pred;
+		return ssfilter_port_op(TCPDIAG_BC_D_GE, x->port, bytecode);
+	}
+	case SSF_D_LE:
+	{
+		struct aafilter *x = (void*)f->pred;
+		return ssfilter_port_op(TCPDIAG_BC_D_LE, x->port, bytecode);
+	}
+	case SSF_S_GE:
+	{
+		struct aafilter *x = (void*)f->pred;
+		return ssfilter_port_op(TCPDIAG_BC_S_GE, x->port, bytecode);
+	}
+	case SSF_S_LE:
+	{
+		struct aafilter *x = (void*)f->pred;
+		return ssfilter_port_op(TCPDIAG_BC_S_LE, x->port, bytecode);
+	}
+
+	case SSF_AND:
+	{
+		char *a1, *a2, *a;
+		int l1, l2;	/* int, not char: lengths can exceed 127 */
+
+		l1 = ssfilter_bytecompile(f->pred, &a1);
+		l2 = ssfilter_bytecompile(f->post, &a2);
+		if (!(a = malloc(l1+l2))) abort();
+		memcpy(a, a1, l1);
+		memcpy(a+l1, a2, l2);
+		free(a1); free(a2);
+		/* Jumps leaving the first operand must skip the second one. */
+		ssfilter_patch(a, l1, l2);
+		*bytecode = a;
+		return l1+l2;
+	}
+	case SSF_OR:
+	{
+		char *a1, *a2, *a;
+		int l1, l2;
+
+		l1 = ssfilter_bytecompile(f->pred, &a1);
+		l2 = ssfilter_bytecompile(f->post, &a2);
+		if (!(a = malloc(l1+l2+4))) abort();
+		memcpy(a, a1, l1);
+		memcpy(a+l1+4, a2, l2);
+		free(a1); free(a2);
+		/* A jump op sits between the operands, with offset l2+4. */
+		*(struct tcpdiag_bc_op*)(a+l1) = (struct tcpdiag_bc_op){ TCPDIAG_BC_JMP, 4, l2+4 };
+		*bytecode = a;
+		return l1+l2+4;
+	}
+	case SSF_NOT:
+	{
+		char *a1, *a;
+		int l1;
+
+		l1 = ssfilter_bytecompile(f->pred, &a1);
+		if (!(a = malloc(l1+4))) abort();
+		memcpy(a, a1, l1);
+		free(a1);
+		*(struct tcpdiag_bc_op*)(a+l1) = (struct tcpdiag_bc_op){ TCPDIAG_BC_JMP, 4, 8 };
+		*bytecode = a;
+		return l1+4;
+	}
+	default:
+		abort();
+	}
+}
+
+/*
+ * Record every address listed in he on the aafilter list headed by a.
+ *
+ * While a->addr.bitlen is zero the address is written into a itself;
+ * otherwise a copy of a is allocated, linked in right after a, and the
+ * address is written into the copy.  Only AF_INET and AF_INET6 entries are
+ * accepted.  Returns the number of addresses stored; this is less than the
+ * full list if an allocation fails part-way.
+ */
+int remember_he(struct aafilter *a, struct hostent *he)
+{
+	char **addrp;
+	int stored = 0;
+	int alen;
+
+	switch (he->h_addrtype) {
+	case AF_INET:
+		alen = 4;
+		break;
+	case AF_INET6:
+		alen = 16;
+		break;
+	default:
+		return 0;
+	}
+
+	for (addrp = he->h_addr_list; *addrp; addrp++, stored++) {
+		struct aafilter *slot = a;
+
+		if (a->addr.bitlen) {
+			slot = malloc(sizeof(*slot));
+			if (slot == NULL)
+				return stored;
+			*slot = *a;
+			slot->next = a->next;
+			a->next = slot;
+		}
+		memcpy(slot->addr.data, *addrp, alen);
+		slot->addr.bytelen = alen;
+		slot->addr.bitlen = alen*8;
+		slot->addr.family = he->h_addrtype;
+	}
+	return stored;
+}
+
+/*
+ * Resolve host name addr and store the resulting addresses in a through
+ * remember_he().  With fam == AF_UNSPEC both an AF_INET and an AF_INET6
+ * lookup are made; otherwise only fam is queried.  sethostent(1) is called
+ * once, on the first invocation.
+ *
+ * Returns 0 when at least one address was stored, 1 otherwise.
+ */
+int get_dns_host(struct aafilter *a, char *addr, int fam)
+{
+	static int hostent_opened;
+	struct hostent *he;
+	int found = 0;
+	int first_fam = fam;
+
+	a->addr.bitlen = 0;
+	if (hostent_opened == 0) {
+		sethostent(1);
+		hostent_opened = 1;
+	}
+
+	if (fam == AF_UNSPEC)
+		first_fam = AF_INET;
+	he = gethostbyname2(addr, first_fam);
+	if (he != NULL)
+		found = remember_he(a, he);
+
+	if (fam == AF_UNSPEC) {
+		he = gethostbyname2(addr, AF_INET6);
+		if (he != NULL)
+			found += remember_he(a, he);
+	}
+	return found == 0;
+}
+
+/* Non-zero once xll_init() has loaded the link map. */
+int xll_initted = 0;
+
+/*
+ * Load the interface index/name map through a temporary rtnetlink handle.
+ *
+ * The handle is only used if rtnl_open() succeeds; previously a failed open
+ * was ignored and ll_init_map()/rtnl_close() ran on an unopened handle.
+ * On failure the program exits with status 1 (presumably rtnl_open() has
+ * already reported the reason -- confirm against libnetlink).
+ */
+void xll_init(void)
+{
+	struct rtnl_handle rth;
+
+	if (rtnl_open(&rth, 0) < 0)
+		exit(1);
+	ll_init_map(&rth);
+	rtnl_close(&rth);
+	xll_initted = 1;
+}
+
+/* ll_index_to_name() preceded by a one-time xll_init(). */
+const char *xll_index_to_name(int index)
+{
+	if (xll_initted == 0)
+		xll_init();
+
+	return ll_index_to_name(index);
+}
+
+/* ll_name_to_index() preceded by a one-time xll_init(). */
+int xll_name_to_index(char *dev)
+{
+	if (xll_initted == 0)
+		xll_init();
+
+	return ll_name_to_index(dev);
+}
+/*
+ * Parse a textual host condition into a struct aafilter.
+ *
+ * addr is modified in place (separators are overwritten with NUL).  The
+ * address family comes from preferred_family or from a leading "unix:",
+ * "link:", "netlink:", "inet:" or "inet6:" prefix.  For inet families the
+ * accepted shapes are "[literal]:port", "*:port" and "host-or-prefix:port";
+ * the port may be a number, a service name or "*".
+ *
+ * Returns a malloc'ed copy of the filter (its ->next list may hold further
+ * addresses added by get_dns_host()), or NULL on a parse error or when the
+ * final allocation fails.  The port field stays -1 when no port was parsed.
+ */
+void *parse_hostcond(char *addr)
+{
+ char *port = NULL;
+ struct aafilter a;
+ struct aafilter *res;
+ int fam = preferred_family;
+
+ memset(&a, 0, sizeof(a));
+ a.port = -1;
+
+ if (fam == AF_UNIX || strncmp(addr, "unix:", 5) == 0) {
+ char *p;
+ a.addr.family = AF_UNIX;
+ if (strncmp(addr, "unix:", 5) == 0)
+ addr+=5;
+ p = strdup(addr); /* NOTE(review): result is not checked for NULL before strlen() */
+ a.addr.bitlen = 8*strlen(p);
+ memcpy(a.addr.data, &p, sizeof(p)); /* stores the pointer itself, not the path bytes */
+ goto out;
+ }
+
+ if (fam == AF_PACKET || strncmp(addr, "link:", 5) == 0) {
+ a.addr.family = AF_PACKET;
+ a.addr.bitlen = 0;
+ if (strncmp(addr, "link:", 5) == 0)
+ addr+=5;
+ port = strchr(addr, ':');
+ if (port) {
+ *port = 0;
+ if (port[1] && strcmp(port+1, "*")) {
+ if (get_integer(&a.port, port+1, 0)) { /* not a number: try it as an interface name */
+ if ((a.port = xll_name_to_index(port+1)) <= 0)
+ return NULL;
+ }
+ }
+ }
+ if (addr[0] && strcmp(addr, "*")) {
+ unsigned short tmp;
+ a.addr.bitlen = 32;
+ if (ll_proto_a2n(&tmp, addr))
+ return NULL;
+ a.addr.data[0] = ntohs(tmp);
+ }
+ goto out;
+ }
+
+ if (fam == AF_NETLINK || strncmp(addr, "netlink:", 8) == 0) {
+ a.addr.family = AF_NETLINK;
+ a.addr.bitlen = 0;
+ if (strncmp(addr, "netlink:", 8) == 0)
+ addr+=8;
+ port = strchr(addr, ':');
+ if (port) {
+ *port = 0;
+ if (port[1] && strcmp(port+1, "*")) {
+ if (get_integer(&a.port, port+1, 0)) {
+ if (strcmp(port+1, "kernel") == 0)
+ a.port = 0;
+ else
+ return NULL;
+ }
+ }
+ }
+ if (addr[0] && strcmp(addr, "*")) {
+ a.addr.bitlen = 32;
+ if (get_u32(a.addr.data, addr, 0)) { /* not a number: accept the known channel names */
+ if (strcmp(addr, "rtnl") == 0)
+ a.addr.data[0] = 0;
+ else if (strcmp(addr, "fw") == 0)
+ a.addr.data[0] = 3;
+ else if (strcmp(addr, "tcpdiag") == 0)
+ a.addr.data[0] = 4;
+ else
+ return NULL;
+ }
+ }
+ goto out;
+ }
+
+ if (strncmp(addr, "inet:", 5) == 0) {
+ addr += 5;
+ fam = AF_INET;
+ } else if (strncmp(addr, "inet6:", 6) == 0) {
+ addr += 6;
+ fam = AF_INET6;
+ }
+
+ /* URL-like literal [] */
+ if (addr[0] == '[') {
+ addr++;
+ if ((port = strchr(addr, ']')) == NULL)
+ return NULL;
+ *port++ = 0;
+ } else if (addr[0] == '*') {
+ port = addr+1;
+ } else {
+ port = strrchr(strchr(addr, '/') ? : addr, ':'); /* last ':' at or after the first '/', if any */
+ }
+ if (port && *port) {
+ if (*port != ':')
+ return NULL;
+ *port++ = 0;
+ if (*port && *port != '*') {
+ if (get_integer(&a.port, port, 0)) { /* not numeric: look the name up as a service */
+ struct servent *se1 = NULL;
+ struct servent *se2 = NULL;
+ if (current_filter.dbs&(1<<UDP_DB))
+ se1 = getservbyname(port, UDP_PROTO);
+ if (current_filter.dbs&(1<<TCP_DB))
+ se2 = getservbyname(port, TCP_PROTO);
+ if (se1 && se2 && se1->s_port != se2->s_port) {
+ fprintf(stderr, "Error: ambiguous port \"%s\".\n", port);
+ return NULL;
+ }
+ if (!se1)
+ se1 = se2;
+ if (se1) {
+ a.port = ntohs(se1->s_port);
+ } else {
+ struct scache *s;
+ for (s = rlist; s; s = s->next) { /* fall back to the rlist name cache */
+ if ((s->proto == UDP_PROTO &&
+ (current_filter.dbs&(1<<UDP_DB))) ||
+ (s->proto == TCP_PROTO &&
+ (current_filter.dbs&(1<<TCP_DB)))) {
+ if (s->name && strcmp(s->name, port) == 0) {
+ if (a.port > 0 && a.port != s->port) {
+ fprintf(stderr, "Error: ambiguous port \"%s\".\n", port);
+ return NULL;
+ }
+ a.port = s->port;
+ }
+ }
+ }
+ if (a.port <= 0) {
+ fprintf(stderr, "Error: \"%s\" does not look like a port.\n", port);
+ return NULL;
+ }
+ }
+ }
+ }
+ }
+ if (addr && *addr && *addr != '*') {
+ if (get_prefix_1(&a.addr, addr, fam)) { /* not a literal prefix: try DNS */
+ if (get_dns_host(&a, addr, fam)) {
+ fprintf(stderr, "Error: an inet prefix is expected rather than \"%s\".\n", addr);
+ return NULL;
+ }
+ }
+ }
+
+ out:
+ res = malloc(sizeof(*res));
+ if (res)
+ memcpy(res, &a, sizeof(a));
+ return res;
+}
+
+/*
+ * Parse one record handed over by generic_record_read() and print it.
+ *
+ * line is expected to hold "<n>: <local>:<port> <remote>:<port> <data...>"
+ * with hexadecimal addresses and 4-digit hex ports; it is modified in place
+ * (a NUL is written after each port).  The second character of the data
+ * part is decoded as a hex digit giving the socket state; records whose
+ * state bit is not set in f->states, or which f->f rejects, print nothing.
+ *
+ * Returns -1 if one of the three ':' separators is missing, 0 otherwise.
+ */
+int tcp_show_line(char *line, struct filter *f, int family)
+{
+	struct tcpstat s;
+	char *loc, *rem, *data;
+	char opt[256];
+	int n;
+	char *p;
+
+	if ((p = strchr(line, ':')) == NULL)
+		return -1;
+	loc = p+2;
+
+	if ((p = strchr(loc, ':')) == NULL)
+		return -1;
+	p[5] = 0;
+	rem = p+6;
+
+	if ((p = strchr(rem, ':')) == NULL)
+		return -1;
+	p[5] = 0;
+	data = p+6;
+
+	do {
+		int state = (data[1] >= 'A') ? (data[1] - 'A' + 10) : (data[1] - '0');
+
+		if (!(f->states & (1<<state)))
+			return 0;
+	} while (0);
+
+	s.local.family = s.remote.family = family;
+	if (family == AF_INET) {
+		sscanf(loc, "%x:%x", s.local.data, (unsigned*)&s.lport);
+		sscanf(rem, "%x:%x", s.remote.data, (unsigned*)&s.rport);
+		s.local.bytelen = s.remote.bytelen = 4;
+	} else {
+		sscanf(loc, "%08x%08x%08x%08x:%x",
+		       s.local.data,
+		       s.local.data+1,
+		       s.local.data+2,
+		       s.local.data+3,
+		       &s.lport);
+		sscanf(rem, "%08x%08x%08x%08x:%x",
+		       s.remote.data,
+		       s.remote.data+1,
+		       s.remote.data+2,
+		       s.remote.data+3,
+		       &s.rport);
+		s.local.bytelen = s.remote.bytelen = 16;
+	}
+
+	if (f->f && run_ssfilter(f->f, &s) == 0)
+		return 0;
+
+	opt[0] = 0;
+	/* The trailing free-text field is capped at sizeof(opt)-1 characters
+	 * so a long record tail cannot overrun opt[]. */
+	n = sscanf(data, "%x %x:%x %x:%x %x %d %d %d %d %llx %d %d %d %d %d %255[^\n]\n",
+		   &s.state, &s.wq, &s.rq,
+		   &s.timer, &s.timeout, &s.retrs, &s.uid, &s.probes, &s.ino,
+		   &s.refcnt, &s.sk, &s.rto, &s.ato, &s.qack,
+		   &s.cwnd, &s.ssthresh, opt);
+
+	if (n < 17)
+		opt[0] = 0;
+
+	/* Fewer than 12 fields means rto and later were not present; fill in
+	 * the values that the show_tcpinfo code below treats as "nothing to
+	 * print". */
+	if (n < 12) {
+		s.rto = 0;
+		s.cwnd = 2;
+		s.ssthresh = -1;
+		s.ato = s.qack = 0;
+	}
+
+	if (netid_width)
+		printf("%-*s ", netid_width, "tcp");
+	if (state_width)
+		printf("%-*s ", state_width, sstate_name[s.state]);
+
+	printf("%-6d %-6d ", s.rq, s.wq);
+
+	formatted_print(&s.local, s.lport);
+	formatted_print(&s.remote, s.rport);
+
+	if (show_options) {
+		if (s.timer) {
+			if (s.timer > 4)
+				s.timer = 5;
+			printf(" timer:(%s,%s,%d)",
+			       tmr_name[s.timer],
+			       print_hz_timer(s.timeout),
+			       s.timer != 1 ? s.probes : s.retrs);
+		}
+	}
+	if (show_tcpinfo) {
+		if (s.rto && s.rto != 3*get_hz())
+			printf(" rto:%g", (double)s.rto/get_hz());
+		if (s.ato)
+			printf(" ato:%g", (double)s.ato/get_hz());
+		if (s.cwnd != 2)
+			printf(" cwnd:%d", s.cwnd);
+		if (s.ssthresh != -1)
+			printf(" ssthresh:%d", s.ssthresh);
+		if (s.qack/2)
+			printf(" qack:%d", s.qack/2);
+		if (s.qack&1)
+			printf(" bidir");
+	}
+	if (show_users) {
+		char ubuf[4096];
+		if (find_users(s.ino, ubuf, sizeof(ubuf)) > 0)
+			printf(" users:(%s)", ubuf);
+	}
+	if (show_details) {
+		if (s.uid)
+			printf(" uid:%u", (unsigned)s.uid);
+		printf(" ino:%u", (unsigned)s.ino);
+		printf(" sk:%llx", s.sk);
+		if (opt[0])
+			printf(" opt:\"%s\"", opt);
+	}
+	printf("\n");
+
+	return 0;
+}
+/*
+ * Read fixed-length, newline-terminated records from fd and pass each one
+ * to worker(record, f, fam).
+ *
+ * The record length is taken from the first line of the first read; that
+ * first line itself is not passed to worker.  Each later record must end in
+ * '\n' at exactly that length; the newline is replaced by NUL before the
+ * call.  buf/bufsize is the caller's scratch buffer; a partial record at
+ * the end of the buffer is moved to the front before the next read.
+ *
+ * Returns 0 at end of input or as soon as worker returns a negative value.
+ * Returns -1 on a read() error (errno as left by read) or when the input
+ * does not match the record layout (errno set to EINVAL).
+ */
+int generic_record_read(int fd, char *buf, int bufsize,
+ int (*worker)(char*, struct filter *, int),
+ struct filter *f, int fam)
+{
+ int n;
+ int recsize;
+ int eof = 0;
+ char *p;
+
+ /* Load the first chunk and calculate record length from it. */
+ n = read(fd, buf, bufsize);
+ if (n < 0)
+ goto outerr;
+ /* I _know_ that this is wrong, do not remind. :-)
+ * But this works nowadays. */
+ if (n < bufsize)
+ eof = 1;
+ p = memchr(buf, '\n', n);
+ if (p == NULL || (p-buf) >= n)
+ goto outwrongformat;
+ recsize = (p-buf)+1;
+ p = buf+recsize; /* skip the first line */
+
+ for (;;) {
+ while ((p+recsize) - buf <= n) { /* a whole record is available */
+ if (p[recsize-1] != '\n')
+ goto outwrongformat;
+ p[recsize-1] = 0;
+ if (worker(p, f, fam) < 0)
+ goto done;
+ p += recsize;
+ }
+ if (!eof) {
+ int remains = (buf+bufsize) - p; /* bytes of a partial record at the buffer end */
+ memcpy(buf, p, remains);
+ p = buf+remains;
+ n = read(fd, p, (buf+bufsize) - p);
+ if (n < 0)
+ goto outerr;
+ if (n < (buf+bufsize) - p) { /* short read is taken as end of input */
+ eof = 1;
+ if (n == 0) {
+ if (remains)
+ goto outwrongformat;
+ goto done;
+ }
+ }
+ n += remains;
+ p = buf;
+ } else {
+ if (p != buf+n) /* leftover bytes that do not form a record */
+ goto outwrongformat;
+ goto done;
+ }
+ }
+done:
+ return 0;
+
+outwrongformat:
+ errno = EINVAL;
+outerr:
+ return -1;
+}
+
+/*
+ * Print one socket described by a tcpdiag netlink message.
+ *
+ * The tcpdiagmsg payload of nlh is converted to a struct tcpstat for
+ * filtering and address printing.  f may be NULL; when f and f->f are both
+ * set, messages rejected by run_ssfilter() print nothing.  Optional
+ * sections depend on the show_* globals; memory and tcp_info data are taken
+ * from the attributes that follow the tcpdiagmsg.
+ *
+ * Always returns 0.
+ */
+int tcp_show_sock(struct nlmsghdr *nlh, struct filter *f)
+{
+ struct tcpdiagmsg *r = NLMSG_DATA(nlh);
+ struct tcpstat s;
+
+ s.state = r->tcpdiag_state;
+ s.local.family = s.remote.family = r->tcpdiag_family;
+ s.lport = ntohs(r->id.tcpdiag_sport);
+ s.rport = ntohs(r->id.tcpdiag_dport);
+ if (s.local.family == AF_INET) {
+ s.local.bytelen = s.remote.bytelen = 4;
+ } else {
+ s.local.bytelen = s.remote.bytelen = 16;
+ }
+ memcpy(s.local.data, r->id.tcpdiag_src, s.local.bytelen);
+ memcpy(s.remote.data, r->id.tcpdiag_dst, s.local.bytelen);
+
+ if (f && f->f && run_ssfilter(f->f, &s) == 0)
+ return 0;
+
+ if (netid_width)
+ printf("%-*s ", netid_width, "tcp");
+ if (state_width)
+ printf("%-*s ", state_width, sstate_name[s.state]);
+
+ printf("%-6d %-6d ", r->tcpdiag_rqueue, r->tcpdiag_wqueue);
+
+ formatted_print(&s.local, s.lport);
+ formatted_print(&s.remote, s.rport);
+
+ if (show_options) {
+ if (r->tcpdiag_timer) {
+ if (r->tcpdiag_timer > 4)
+ r->tcpdiag_timer = 5; /* clamps the value inside the received message */
+ printf(" timer:(%s,%s,%d)",
+ tmr_name[r->tcpdiag_timer],
+ print_ms_timer(r->tcpdiag_expires),
+ r->tcpdiag_retrans);
+ }
+ }
+ if (show_users) {
+ char ubuf[4096];
+ if (find_users(r->tcpdiag_inode, ubuf, sizeof(ubuf)) > 0)
+ printf(" users:(%s)", ubuf);
+ }
+ if (show_details) {
+ if (r->tcpdiag_uid)
+ printf(" uid:%u", (unsigned)r->tcpdiag_uid);
+ printf(" ino:%u", (unsigned)r->tcpdiag_inode);
+ printf(" sk:%08x", r->id.tcpdiag_cookie[0]);
+ if (r->id.tcpdiag_cookie[1] != 0)
+ printf("%08x", r->id.tcpdiag_cookie[1]);
+ }
+ if (show_mem || show_tcpinfo) {
+ struct rtattr * tb[TCPDIAG_MAX+1];
+ struct tcpdiag_meminfo *minfo = NULL;
+ struct tcp_info *info = NULL;
+
+ memset(tb, 0, sizeof(tb));
+ parse_rtattr(tb, TCPDIAG_MAX, (struct rtattr*)(r+1),
+ nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r))); /* attributes start right after the tcpdiagmsg */
+ if (tb[TCPDIAG_MEMINFO])
+ minfo = RTA_DATA(tb[TCPDIAG_MEMINFO]);
+ if (tb[TCPDIAG_INFO])
+ info = RTA_DATA(tb[TCPDIAG_INFO]);
+ if (minfo) {
+ printf(" mem:(r%u,w%u,f%u,t%u)",
+ minfo->tcpdiag_rmem,
+ minfo->tcpdiag_wmem,
+ minfo->tcpdiag_fmem,
+ minfo->tcpdiag_tmem);
+ }
+ if (info) {
+#ifdef TCP_INFO
+ if (info->tcpi_rto && info->tcpi_rto != 3000000)
+ printf(" rto:%g", (double)info->tcpi_rto/1000);
+ if (info->tcpi_rtt)
+ printf(" rtt:%g/%g", (double)info->tcpi_rtt/1000,
+ (double)info->tcpi_rttvar/1000);
+ if (info->tcpi_ato)
+ printf(" ato:%g", (double)info->tcpi_ato/1000);
+ if (info->tcpi_snd_cwnd != 2)
+ printf(" cwnd:%d", info->tcpi_snd_cwnd);
+ if (info->tcpi_snd_ssthresh < 0xFFFF)
+ printf(" ssthresh:%d", info->tcpi_snd_ssthresh);
+#else
+#warning No TCP_INFO. Please, do not repeat this experiment, use right kernel.
+ printf(" MORE_INFO_PROVIDED_YOU_COMPILED_SS_RIGHT");
+#endif
+ }
+ }
+ printf("\n");
+
+ return 0;
+
+}
+
+/*
+ * Ask the kernel for TCP sockets over a NETLINK_TCPDIAG socket.
+ *
+ * The request carries f->states and, when f->f is set, the filter compiled
+ * by ssfilter_bytecompile() as a TCPDIAG_REQ_BYTECODE attribute.  Replies
+ * are written raw to dump_fp when it is non-NULL; otherwise each message is
+ * printed with tcp_show_sock().
+ *
+ * Returns -1 when the socket cannot be created or the request cannot be
+ * sent, a negative tcp_show_sock() result if one occurs, and 0 otherwise
+ * (including after NLMSG_DONE, NLMSG_ERROR or EOF).  The socket and the
+ * compiled bytecode are released on every return path; errno is preserved
+ * across that cleanup.
+ */
+int tcp_show_netlink(struct filter *f, FILE *dump_fp)
+{
+	int fd;
+	int ret = 0;
+	int saved_errno;
+	struct sockaddr_nl nladdr;
+	struct {
+		struct nlmsghdr nlh;
+		struct tcpdiagreq r;
+	} req;
+	char *bc = NULL;
+	int bclen;
+	struct msghdr msg;
+	struct rtattr rta;
+	char buf[8192];
+	struct iovec iov[3];
+
+	if ((fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_TCPDIAG)) < 0)
+		return -1;
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+
+	req.nlh.nlmsg_len = sizeof(req);
+	req.nlh.nlmsg_type = TCPDIAG_GETSOCK;
+	req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
+	req.nlh.nlmsg_pid = 0;
+	req.nlh.nlmsg_seq = 123456;
+	memset(&req.r, 0, sizeof(req.r));
+	req.r.tcpdiag_family = AF_INET;
+	req.r.tcpdiag_states = f->states;
+	if (show_mem)
+		req.r.tcpdiag_ext |= (1<<(TCPDIAG_MEMINFO-1));
+	if (show_tcpinfo)
+		req.r.tcpdiag_ext |= (1<<(TCPDIAG_INFO-1));
+
+	iov[0] = (struct iovec){ &req, sizeof(req) };
+	if (f->f) {
+		bclen = ssfilter_bytecompile(f->f, &bc);
+		rta.rta_type = TCPDIAG_REQ_BYTECODE;
+		rta.rta_len = RTA_LENGTH(bclen);
+		iov[1] = (struct iovec){ &rta, sizeof(rta) };
+		iov[2] = (struct iovec){ bc, bclen };
+		req.nlh.nlmsg_len += RTA_LENGTH(bclen);
+	}
+
+	msg = (struct msghdr) {
+		(void*)&nladdr, sizeof(nladdr),
+		iov, f->f ? 3 : 1,
+		NULL, 0,
+		0
+	};
+
+	if (sendmsg(fd, &msg, 0) < 0) {
+		ret = -1;
+		goto out;
+	}
+
+
+	iov[0] = (struct iovec){ buf, sizeof(buf) };
+
+	while (1) {
+		int status;
+		struct nlmsghdr *h;
+
+		msg = (struct msghdr) {
+			(void*)&nladdr, sizeof(nladdr),
+			iov, 1,
+			NULL, 0,
+			0
+		};
+
+		status = recvmsg(fd, &msg, 0);
+
+		if (status < 0) {
+			if (errno == EINTR)
+				continue;
+			/* NOTE(review): a persistent receive error retries
+			 * forever here; behaviour kept as in the original. */
+			perror("OVERRUN");
+			continue;
+		}
+		if (status == 0) {
+			fprintf(stderr, "EOF on netlink\n");
+			goto out;
+		}
+
+		if (dump_fp)
+			fwrite(buf, 1, NLMSG_ALIGN(status), dump_fp);
+
+		h = (struct nlmsghdr*)buf;
+		while (NLMSG_OK(h, status)) {
+			int err;
+
+			/* Only answers to our own request are of interest. */
+			if (/*h->nlmsg_pid != rth->local.nl_pid ||*/
+			    h->nlmsg_seq != 123456)
+				goto skip_it;
+
+			if (h->nlmsg_type == NLMSG_DONE)
+				goto out;
+			if (h->nlmsg_type == NLMSG_ERROR) {
+				struct nlmsgerr *nlerr = (struct nlmsgerr*)NLMSG_DATA(h);
+				if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
+					fprintf(stderr, "ERROR truncated\n");
+				} else {
+					errno = -nlerr->error;
+					perror("TCPDIAG answers");
+				}
+				goto out;
+			}
+			if (!dump_fp) {
+				/* No filter is passed to tcp_show_sock():
+				 * f->f already went to the kernel as bytecode. */
+				err = tcp_show_sock(h, NULL);
+				if (err < 0) {
+					ret = err;
+					goto out;
+				}
+			}
+
+skip_it:
+			h = NLMSG_NEXT(h, status);
+		}
+		if (msg.msg_flags & MSG_TRUNC) {
+			fprintf(stderr, "Message truncated\n");
+			continue;
+		}
+		if (status) {
+			fprintf(stderr, "!!!Remnant of size %d\n", status);
+			exit(1);
+		}
+	}
+
+out:
+	saved_errno = errno;
+	free(bc);
+	close(fd);
+	errno = saved_errno;
+	return ret;
+}
+
+/*
+ * Replay tcpdiag netlink messages stored in the file named by the
+ * TCPDIAG_FILE environment variable, printing each with tcp_show_sock().
+ *
+ * Returns 0 only when an NLMSG_DONE message is reached.  Returns -1 when
+ * the file cannot be opened, on a read error or premature EOF, on a message
+ * whose length field is smaller than a header or does not fit the local
+ * buffer, or on an NLMSG_ERROR message.  A negative result from
+ * tcp_show_sock() is passed through.  The file is closed on every path
+ * after a successful open.
+ */
+int tcp_show_netlink_file(struct filter *f)
+{
+	FILE *fp = NULL;
+	char buf[8192];
+	const char *path = getenv("TCPDIAG_FILE");
+	int ret = -1;
+
+	if (path == NULL)
+		errno = ENOENT;
+	else
+		fp = fopen(path, "r");
+	if (fp == NULL) {
+		perror("fopen($TCPDIAG_FILE)");
+		return -1;
+	}
+
+	while (1) {
+		size_t got, want;
+		int err;
+		struct nlmsghdr *h = (struct nlmsghdr*)buf;
+
+		/* fread() never returns a negative value; errors are reported
+		 * through ferror(). */
+		got = fread(buf, 1, sizeof(*h), fp);
+		if (ferror(fp)) {
+			perror("Reading header from $TCPDIAG_FILE");
+			break;
+		}
+		if (got != sizeof(*h)) {
+			fprintf(stderr, "Unexpected EOF reading $TCPDIAG_FILE\n");
+			break;
+		}
+
+		/* The length comes from the file: make sure the payload
+		 * exists and fits behind the header in buf[]. */
+		if (h->nlmsg_len < sizeof(*h)) {
+			fprintf(stderr, "Malformed message length in $TCPDIAG_FILE\n");
+			break;
+		}
+		want = NLMSG_ALIGN(h->nlmsg_len-sizeof(*h));
+		if (want > sizeof(buf) - sizeof(*h)) {
+			fprintf(stderr, "Malformed message length in $TCPDIAG_FILE\n");
+			break;
+		}
+
+		got = fread(h+1, 1, want, fp);
+		if (ferror(fp)) {
+			perror("Reading $TCPDIAG_FILE");
+			break;
+		}
+		if (got + sizeof(*h) < h->nlmsg_len) {
+			fprintf(stderr, "Unexpected EOF reading $TCPDIAG_FILE\n");
+			break;
+		}
+
+		/* The only legal exit point */
+		if (h->nlmsg_type == NLMSG_DONE) {
+			ret = 0;
+			break;
+		}
+
+		if (h->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *nlerr = (struct nlmsgerr*)NLMSG_DATA(h);
+			if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
+				fprintf(stderr, "ERROR truncated\n");
+			} else {
+				errno = -nlerr->error;
+				perror("TCPDIAG answered");
+			}
+			break;
+		}
+
+		err = tcp_show_sock(h, f);
+		if (err < 0) {
+			ret = err;
+			break;
+		}
+	}
+
+	fclose(fp);
+	return ret;
+}
+/*
+ * Print the TCP sockets selected by f.
+ *
+ * Sources are tried in this order: the dump file named by TCPDIAG_FILE; the
+ * tcpdiag netlink interface (skipped when PROC_NET_TCP or PROC_ROOT is
+ * set); and finally the text tables opened by net_tcp_open() and
+ * net_tcp6_open(), parsed line by line with tcp_show_line().
+ *
+ * Returns 0 on success.  Returns -1 with errno preserved when the buffer
+ * cannot be allocated, the IPv4 table cannot be opened, or a table fails to
+ * parse.  A failure to open the IPv6 table is not treated as an error.
+ */
+int tcp_show(struct filter *f)
+{
+ int fd = -1;
+ char *buf = NULL;
+ int bufsize = 64*1024;
+
+ dg_proto = TCP_PROTO;
+
+ if (getenv("TCPDIAG_FILE"))
+ return tcp_show_netlink_file(f);
+
+ if (!getenv("PROC_NET_TCP") && !getenv("PROC_ROOT")
+ && tcp_show_netlink(f, NULL) == 0)
+ return 0;
+
+ /* Sigh... We have to parse /proc/net/tcp... */
+
+ /* Estimate amount of sockets and try to allocate
+ * huge buffer to read all the table at one read.
+ * Limit it by 16MB though. The assumption is: as soon as
+ * kernel was able to hold information about N connections,
+ * it is able to give us some memory for snapshot.
+ */
+ if (1) {
+ int guess = slabstat.socks+slabstat.tcp_syns;
+ if (f->states&(1<<SS_TIME_WAIT))
+ guess += slabstat.tcp_tws;
+ if (guess > (16*1024*1024)/128)
+ guess = (16*1024*1024)/128;
+ guess *= 128; /* 128 bytes budgeted per socket */
+ if (guess > bufsize)
+ bufsize = guess;
+ }
+ while (bufsize >= 64*1024) { /* halve the request until malloc succeeds, never below 64KB */
+ if ((buf = malloc(bufsize)) != NULL)
+ break;
+ bufsize /= 2;
+ }
+ if (buf == NULL) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ if (f->families & (1<<AF_INET)) {
+ if ((fd = net_tcp_open()) < 0)
+ goto outerr;
+ if (generic_record_read(fd, buf, bufsize, tcp_show_line, f, AF_INET))
+ goto outerr;
+ close(fd);
+ }
+
+ if ((f->families & (1<<AF_INET6)) &&
+ (fd = net_tcp6_open()) >= 0) {
+ if (generic_record_read(fd, buf, bufsize, tcp_show_line, f, AF_INET6))
+ goto outerr;
+ close(fd);
+ }
+
+ free(buf);
+ return 0;
+
+outerr:
+ do {
+ int saved_errno = errno; /* free()/close() must not clobber the reported error */
+ if (buf)
+ free(buf);
+ if (fd >= 0)
+ close(fd);
+ errno = saved_errno;
+ return -1;
+ } while (0);
+}
+
+
+/*
+ * Parse one datagram-socket record handed over by generic_record_read()
+ * and print it, labelled with the current dg_proto.
+ *
+ * The record layout and in-place modification of line are the same as in
+ * tcp_show_line(); only state, queues, uid, inode, refcnt, sk and an
+ * optional free-text tail are taken from the data part.
+ *
+ * Returns -1 if one of the three ':' separators is missing, 0 otherwise.
+ */
+int dgram_show_line(char *line, struct filter *f, int family)
+{
+	struct tcpstat s;
+	char *loc, *rem, *data;
+	char opt[256];
+	int n;
+	char *p;
+
+	if ((p = strchr(line, ':')) == NULL)
+		return -1;
+	loc = p+2;
+
+	if ((p = strchr(loc, ':')) == NULL)
+		return -1;
+	p[5] = 0;
+	rem = p+6;
+
+	if ((p = strchr(rem, ':')) == NULL)
+		return -1;
+	p[5] = 0;
+	data = p+6;
+
+	do {
+		int state = (data[1] >= 'A') ? (data[1] - 'A' + 10) : (data[1] - '0');
+
+		if (!(f->states & (1<<state)))
+			return 0;
+	} while (0);
+
+	s.local.family = s.remote.family = family;
+	if (family == AF_INET) {
+		sscanf(loc, "%x:%x", s.local.data, (unsigned*)&s.lport);
+		sscanf(rem, "%x:%x", s.remote.data, (unsigned*)&s.rport);
+		s.local.bytelen = s.remote.bytelen = 4;
+	} else {
+		sscanf(loc, "%08x%08x%08x%08x:%x",
+		       s.local.data,
+		       s.local.data+1,
+		       s.local.data+2,
+		       s.local.data+3,
+		       &s.lport);
+		sscanf(rem, "%08x%08x%08x%08x:%x",
+		       s.remote.data,
+		       s.remote.data+1,
+		       s.remote.data+2,
+		       s.remote.data+3,
+		       &s.rport);
+		s.local.bytelen = s.remote.bytelen = 16;
+	}
+
+	if (f->f && run_ssfilter(f->f, &s) == 0)
+		return 0;
+
+	opt[0] = 0;
+	/* Eight conversions are stored (the %* ones are skipped); the tail
+	 * is capped at sizeof(opt)-1 characters. */
+	n = sscanf(data, "%x %x:%x %*x:%*x %*x %d %*d %d %d %llx %255[^\n]\n",
+		   &s.state, &s.wq, &s.rq,
+		   &s.uid, &s.ino,
+		   &s.refcnt, &s.sk, opt);
+
+	/* opt is valid only when all eight conversions succeeded.  The old
+	 * test "n < 9" was always true, so opt was never printed. */
+	if (n < 8)
+		opt[0] = 0;
+
+	if (netid_width)
+		printf("%-*s ", netid_width, dg_proto);
+	if (state_width)
+		printf("%-*s ", state_width, sstate_name[s.state]);
+
+	printf("%-6d %-6d ", s.rq, s.wq);
+
+	formatted_print(&s.local, s.lport);
+	formatted_print(&s.remote, s.rport);
+
+	if (show_users) {
+		char ubuf[4096];
+		if (find_users(s.ino, ubuf, sizeof(ubuf)) > 0)
+			printf(" users:(%s)", ubuf);
+	}
+
+	if (show_details) {
+		if (s.uid)
+			printf(" uid=%u", (unsigned)s.uid);
+		printf(" ino=%u", (unsigned)s.ino);
+		printf(" sk=%llx", s.sk);
+		if (opt[0])
+			printf(" opt:\"%s\"", opt);
+	}
+	printf("\n");
+
+	return 0;
+}
+
+
+/*
+ * Print UDP sockets selected by f: the IPv4 table when AF_INET is among
+ * f->families, then the IPv6 table when AF_INET6 is requested and that
+ * table can be opened.  Returns 0 on success, -1 with errno preserved when
+ * the IPv4 table cannot be opened or either table fails to parse.
+ */
+int udp_show(struct filter *f)
+{
+	char buf[8192];
+	int bufsize = sizeof(buf);
+	int fd = -1;
+	int saved_errno;
+
+	dg_proto = UDP_PROTO;
+
+	if (f->families & (1<<AF_INET)) {
+		fd = net_udp_open();
+		if (fd < 0)
+			goto fail;
+		if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET) != 0)
+			goto fail;
+		close(fd);
+	}
+
+	if (f->families & (1<<AF_INET6)) {
+		fd = net_udp6_open();
+		if (fd >= 0) {
+			if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET6) != 0)
+				goto fail;
+			close(fd);
+		}
+	}
+	return 0;
+
+fail:
+	saved_errno = errno;
+	if (fd >= 0)
+		close(fd);
+	errno = saved_errno;
+	return -1;
+}
+
+/*
+ * Print raw sockets selected by f: the IPv4 table when AF_INET is among
+ * f->families, then the IPv6 table when AF_INET6 is requested and that
+ * table can be opened.  Returns 0 on success, -1 with errno preserved when
+ * the IPv4 table cannot be opened or either table fails to parse.
+ */
+int raw_show(struct filter *f)
+{
+	char buf[8192];
+	int bufsize = sizeof(buf);
+	int fd = -1;
+	int saved_errno;
+
+	dg_proto = RAW_PROTO;
+
+	if (f->families & (1<<AF_INET)) {
+		fd = net_raw_open();
+		if (fd < 0)
+			goto fail;
+		if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET) != 0)
+			goto fail;
+		close(fd);
+	}
+
+	if (f->families & (1<<AF_INET6)) {
+		fd = net_raw6_open();
+		if (fd >= 0) {
+			if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET6) != 0)
+				goto fail;
+			close(fd);
+		}
+	}
+	return 0;
+
+fail:
+	saved_errno = errno;
+	if (fd >= 0)
+		close(fd);
+	errno = saved_errno;
+	return -1;
+}
+
+/* One unix socket as parsed by unix_show(); kept on a singly linked list. */
+struct unixstat
+{
+ struct unixstat *next; /* next entry; unix_show() keeps the list sorted by type, then ino */
+ int ino; /* inode; unix_list_print() matches peer against this */
+ int peer; /* inode of the peer socket, 0 when none/unknown */
+ int rq; /* receive queue, printed first */
+ int wq; /* send queue, printed second */
+ int state; /* SS_* state after mapping in unix_show() */
+ int type; /* SOCK_STREAM, SOCK_DGRAM, ... */
+ char *name; /* malloc'ed bound name, or NULL */
+};
+
+
+/* SS_* state for each unix socket state; unix_show() indexes it with (state - 1). */
+int unix_state_map[] = { SS_CLOSE, SS_SYN_SENT,
+ SS_ESTABLISHED, SS_CLOSING };
+
+/* Number of entries unix_show() collects before printing and freeing the list (about 1MB of structs). */
+#define MAX_UNIX_REMEMBER (1024*1024/sizeof(struct unixstat))
+
+/* Release every entry of list together with its name string. */
+void unix_list_free(struct unixstat *list)
+{
+	struct unixstat *node, *following;
+
+	for (node = list; node != NULL; node = following) {
+		following = node->next;
+		free(node->name);	/* free(NULL) is a no-op */
+		free(node);
+	}
+}
+/*
+ * Print the entries of list that pass filter f.
+ *
+ * An entry is skipped when its state bit is not in f->states, when its
+ * socket type's database bit is not in f->dbs, or when f->f rejects it.
+ * The peer column shows the name of the list entry whose ino equals
+ * s->peer, "*" when there is no peer or the peer has no name, and "?" when
+ * the peer is not on the list.
+ */
+void unix_list_print(struct unixstat *list, struct filter *f)
+{
+ struct unixstat *s;
+ char *peer;
+
+ for (s = list; s; s = s->next) {
+ if (!(f->states & (1<<s->state)))
+ continue;
+ if (s->type == SOCK_STREAM && !(f->dbs&(1<<UNIX_ST_DB)))
+ continue;
+ if (s->type == SOCK_DGRAM && !(f->dbs&(1<<UNIX_DG_DB)))
+ continue;
+
+ peer = "*";
+ if (s->peer) {
+ struct unixstat *p;
+ for (p = list; p; p = p->next) { /* linear search for the peer's entry */
+ if (s->peer == p->ino)
+ break;
+ }
+ if (!p) {
+ peer = "?";
+ } else {
+ peer = p->name ? : "*";
+ }
+ }
+
+ if (f->f) {
+ struct tcpstat tst;
+ tst.local.family = AF_UNIX;
+ tst.remote.family = AF_UNIX;
+ memcpy(tst.local.data, &s->name, sizeof(s->name)); /* the name pointer itself is stored, as parse_hostcond() does */
+ if (strcmp(peer, "*") == 0)
+ memset(tst.remote.data, 0, sizeof(peer));
+ else
+ memcpy(tst.remote.data, &peer, sizeof(peer));
+ if (run_ssfilter(f->f, &tst) == 0)
+ continue;
+ }
+
+ if (netid_width)
+ printf("%-*s ", netid_width,
+ s->type == SOCK_STREAM ? "u_str" : "u_dgr");
+ if (state_width)
+ printf("%-*s ", state_width, sstate_name[s->state]);
+ printf("%-6d %-6d ", s->rq, s->wq);
+ printf("%*s %-*d %*s %-*d",
+ addr_width, s->name ? : "*", serv_width, s->ino,
+ addr_width, peer, serv_width, s->peer);
+ if (show_users) {
+ char ubuf[4096];
+ if (find_users(s->ino, ubuf, sizeof(ubuf)) > 0)
+ printf(" users:(%s)", ubuf);
+ }
+ printf("\n");
+ }
+}
+
+/*
+ * Read the unix socket table opened by net_unix_open() and print the
+ * sockets selected by f.
+ *
+ * Entries are collected on a list sorted by (type, ino) so that peers can
+ * be resolved by unix_list_print(); the list is printed and released every
+ * MAX_UNIX_REMEMBER entries and once more at the end.  A header line that
+ * starts with "Peer" selects the format in which peer and queue columns
+ * are meaningful; otherwise they are cleared.
+ *
+ * Lines that do not yield the seven numeric fields, or whose state is
+ * outside unix_state_map[], are skipped.  Returns -1 if the table cannot be
+ * opened, 0 otherwise.  The stream is closed before returning.
+ */
+int unix_show(struct filter *f)
+{
+	FILE *fp;
+	char buf[256];
+	char name[128];
+	int newformat = 0;
+	int cnt;
+	struct unixstat *list = NULL;
+
+	if ((fp = fdopen(net_unix_open(), "r")) == NULL)
+		return -1;
+	/* An empty table has nothing to show; do not inspect an unread
+	 * header buffer. */
+	if (fgets(buf, sizeof(buf)-1, fp) == NULL) {
+		fclose(fp);
+		return 0;
+	}
+
+	if (memcmp(buf, "Peer", 4) == 0)
+		newformat = 1;
+	cnt = 0;
+
+	while (fgets(buf, sizeof(buf)-1, fp)) {
+		struct unixstat *u, **insp;
+		int flags;
+		int nfields;
+
+		if (!(u = malloc(sizeof(*u))))
+			break;
+		u->name = NULL;
+
+		/* The name is capped at sizeof(name)-1 characters. */
+		nfields = sscanf(buf, "%x: %x %x %x %x %x %d %127s",
+				 &u->peer, &u->rq, &u->wq, &flags, &u->type,
+				 &u->state, &u->ino, name);
+		if (nfields < 7) {
+			/* Malformed line: the numeric fields are unusable. */
+			free(u);
+			continue;
+		}
+		if (nfields < 8)
+			name[0] = 0;
+
+		if (flags&(1<<16)) {
+			u->state = SS_LISTEN;
+		} else {
+			if (u->state < 1 ||
+			    u->state > (int)(sizeof(unix_state_map)/sizeof(unix_state_map[0]))) {
+				/* State has no entry in unix_state_map[]. */
+				free(u);
+				continue;
+			}
+			u->state = unix_state_map[u->state-1];
+			if (u->type == SOCK_DGRAM &&
+			    u->state == SS_CLOSE &&
+			    u->peer)
+				u->state = SS_ESTABLISHED;
+		}
+
+		if (!newformat) {
+			u->peer = 0;
+			u->rq = 0;
+			u->wq = 0;
+		}
+
+		/* Sorted insert by type, then inode. */
+		insp = &list;
+		while (*insp) {
+			if (u->type < (*insp)->type ||
+			    (u->type == (*insp)->type &&
+			     u->ino < (*insp)->ino))
+				break;
+			insp = &(*insp)->next;
+		}
+		u->next = *insp;
+		*insp = u;
+
+		if (name[0]) {
+			if ((u->name = malloc(strlen(name)+1)) == NULL)
+				break;
+			strcpy(u->name, name);
+		}
+		if (++cnt > MAX_UNIX_REMEMBER) {
+			unix_list_print(list, f);
+			unix_list_free(list);
+			list = NULL;
+			cnt = 0;
+		}
+	}
+
+	if (list) {
+		unix_list_print(list, f);
+		unix_list_free(list);
+		list = NULL;
+		cnt = 0;
+	}
+
+	fclose(fp);
+	return 0;
+}
+
+
+/*
+ * Print the packet sockets listed in the table opened by net_packet_open().
+ *
+ * Nothing is shown unless SS_CLOSE is among f->states (every packet socket
+ * is printed as "UNCONN").  Entries are dropped when their type's database
+ * bit is not in f->dbs or when f->f rejects them.  Lines that do not yield
+ * all eight fields are skipped.
+ *
+ * Returns -1 if the table cannot be opened, 0 otherwise.  The stream is
+ * closed before returning.
+ */
+int packet_show(struct filter *f)
+{
+	FILE *fp;
+	char buf[256];
+	int type;
+	int prot;
+	int iface;
+	int state;
+	int rq;
+	int uid;
+	int ino;
+	unsigned long long sk;
+
+	if (!(f->states & (1<<SS_CLOSE)))
+		return 0;
+
+	if ((fp = fdopen(net_packet_open(), "r")) == NULL)
+		return -1;
+	/* Skip the header line. */
+	fgets(buf, sizeof(buf)-1, fp);
+
+	while (fgets(buf, sizeof(buf)-1, fp)) {
+		/* Without all eight values the locals below would be used
+		 * uninitialised. */
+		if (sscanf(buf, "%llx %*d %d %x %d %d %u %u %u",
+			   &sk,
+			   &type, &prot, &iface, &state,
+			   &rq, &uid, &ino) != 8)
+			continue;
+
+		if (type == SOCK_RAW && !(f->dbs&(1<<PACKET_R_DB)))
+			continue;
+		if (type == SOCK_DGRAM && !(f->dbs&(1<<PACKET_DG_DB)))
+			continue;
+		if (f->f) {
+			struct tcpstat tst;
+			tst.local.family = AF_PACKET;
+			tst.remote.family = AF_PACKET;
+			tst.rport = 0;
+			tst.lport = iface;
+			tst.local.data[0] = prot;
+			tst.remote.data[0] = 0;
+			if (run_ssfilter(f->f, &tst) == 0)
+				continue;
+		}
+
+		if (netid_width)
+			printf("%-*s ", netid_width,
+			       type == SOCK_RAW ? "p_raw" : "p_dgr");
+		if (state_width)
+			printf("%-*s ", state_width, "UNCONN");
+		printf("%-6d %-6d ", rq, 0);
+		if (prot == 3) {
+			printf("%*s:", addr_width, "*");
+		} else {
+			char tb[16];
+			printf("%*s:", addr_width,
+			       ll_proto_n2a(htons(prot), tb, sizeof(tb)));
+		}
+		if (iface == 0) {
+			printf("%-*s ", serv_width, "*");
+		} else {
+			printf("%-*s ", serv_width, xll_index_to_name(iface));
+		}
+		printf("%*s*%-*s",
+		       addr_width, "", serv_width, "");
+
+		if (show_users) {
+			char ubuf[4096];
+			if (find_users(ino, ubuf, sizeof(ubuf)) > 0)
+				printf(" users:(%s)", ubuf);
+		}
+		if (show_details) {
+			printf(" ino=%u uid=%u sk=%llx", ino, uid, sk);
+		}
+		printf("\n");
+	}
+
+	fclose(fp);
+	return 0;
+}
+
+/*
+ * Print the netlink sockets listed in the table opened by
+ * net_netlink_open().
+ *
+ * Nothing is shown unless SS_CLOSE is among f->states (every netlink
+ * socket is printed as "UNCONN").  With resolve_services set, protocols 0,
+ * 3 and 4 are printed as "rtnl", "fw" and "tcpdiag", pid 0 as "kernel", and
+ * a positive pid as "<comm>/<pid>" using the name read from
+ * $PROC_ROOT/<pid>/stat (default root "/proc").  Lines that do not yield
+ * all eight fields are skipped.
+ *
+ * Returns -1 if the table cannot be opened, 0 otherwise.  The stream is
+ * closed before returning.
+ */
+int netlink_show(struct filter *f)
+{
+	FILE *fp;
+	char buf[256];
+	int prot, pid;
+	unsigned groups;
+	int rq, wq, rc;
+	unsigned long long sk, cb;
+
+	if (!(f->states & (1<<SS_CLOSE)))
+		return 0;
+
+	if ((fp = fdopen(net_netlink_open(), "r")) == NULL)
+		return -1;
+	/* Skip the header line. */
+	fgets(buf, sizeof(buf)-1, fp);
+
+	while (fgets(buf, sizeof(buf)-1, fp)) {
+		/* Without all eight values the locals below would be used
+		 * uninitialised. */
+		if (sscanf(buf, "%llx %d %d %x %d %d %llx %d",
+			   &sk,
+			   &prot, &pid, &groups, &rq, &wq, &cb, &rc) != 8)
+			continue;
+
+		if (f->f) {
+			struct tcpstat tst;
+			tst.local.family = AF_NETLINK;
+			tst.remote.family = AF_NETLINK;
+			tst.rport = -1;
+			tst.lport = pid;
+			tst.local.data[0] = prot;
+			tst.remote.data[0] = 0;
+			if (run_ssfilter(f->f, &tst) == 0)
+				continue;
+		}
+
+		if (netid_width)
+			printf("%-*s ", netid_width, "nl");
+		if (state_width)
+			printf("%-*s ", state_width, "UNCONN");
+		printf("%-6d %-6d ", rq, wq);
+		if (resolve_services && prot == 0)
+			printf("%*s:", addr_width, "rtnl");
+		else if (resolve_services && prot == 3)
+			printf("%*s:", addr_width, "fw");
+		else if (resolve_services && prot == 4)
+			printf("%*s:", addr_width, "tcpdiag");
+		else
+			printf("%*d:", addr_width, prot);
+		if (pid == -1) {
+			printf("%-*s ", serv_width, "*");
+		} else if (resolve_services) {
+			int done = 0;
+			if (!pid) {
+				done = 1;
+				printf("%-*s ", serv_width, "kernel");
+			} else if (pid > 0) {
+				/* Separate, bounded buffers: PROC_ROOT is
+				 * user-controlled and the name in the stat
+				 * file is not limited by this program. */
+				char statpath[256];
+				char procname[64];
+				FILE *sfp;
+
+				snprintf(statpath, sizeof(statpath), "%s/%d/stat",
+					 getenv("PROC_ROOT") ? : "/proc", pid);
+				if ((sfp = fopen(statpath, "r")) != NULL) {
+					/* 47 name chars + '/' + pid digits
+					 * fit in procname[64]. */
+					if (fscanf(sfp, "%*d (%47[^)])", procname) == 1) {
+						size_t used = strlen(procname);
+						snprintf(procname+used, sizeof(procname)-used,
+							 "/%d", pid);
+						printf("%-*s ", serv_width, procname);
+						done = 1;
+					}
+					fclose(sfp);
+				}
+			}
+			if (!done)
+				printf("%-*d ", serv_width, pid);
+		} else {
+			printf("%-*d ", serv_width, pid);
+		}
+		printf("%*s*%-*s",
+		       addr_width, "", serv_width, "");
+
+		if (show_details) {
+			printf(" sk=%llx cb=%llx groups=0x%08x", sk, cb, groups);
+		}
+		printf("\n");
+	}
+
+	fclose(fp);
+	return 0;
+}
+/* SNMP counters used by print_summary(). */
+struct snmpstat
+{
+ int tcp_estab; /* filled from the "Tcp:" / "CurrEstab" counter via get_snmp_int() */
+};
+
+/*
+ * Look up one integer counter in the table opened by net_snmp_open().
+ *
+ * The table is read as pairs of lines that both start with proto: a header
+ * line with space-separated column names followed by a line of values.
+ * key selects the column by name; the value in the same position of the
+ * following line is stored in *result.
+ *
+ * Returns 0 on success.  Returns -1 if the table cannot be opened, or -1
+ * with errno = ESRCH when proto or key is not found; *result is 0 in both
+ * failure cases.
+ */
+int get_snmp_int(char *proto, char *key, int *result)
+{
+	char buf[1024];
+	FILE *fp;
+	int protolen = strlen(proto);
+	int keylen = strlen(key);
+
+	*result = 0;
+
+	if ((fp = fdopen(net_snmp_open(), "r")) == NULL)
+		return -1;
+
+	while (fgets(buf, sizeof(buf), fp) != NULL) {
+		char *p = buf;
+		int pos = 0;
+
+		/* strncmp() stops at the terminating NUL, so short lines are
+		 * never compared past their end. */
+		if (strncmp(buf, proto, protolen))
+			continue;
+		/* Find the 1-based column whose name is exactly key. */
+		while ((p = strchr(p, ' ')) != NULL) {
+			pos++;
+			p++;
+			if (strncmp(p, key, keylen) == 0 &&
+			    (p[keylen] == ' ' || p[keylen] == '\n'))
+				break;
+		}
+		/* Key absent from this header: report "not found" instead of
+		 * silently returning the last column of the values line. */
+		if (p == NULL)
+			break;
+		if (fgets(buf, sizeof(buf), fp) == NULL)
+			break;
+		if (strncmp(buf, proto, protolen))
+			break;
+		/* Step to the same column on the values line. */
+		p = buf;
+		while ((p = strchr(p, ' ')) != NULL) {
+			p++;
+			if (--pos == 0) {
+				sscanf(p, "%d", result);
+				fclose(fp);
+				return 0;
+			}
+		}
+	}
+
+	fclose(fp);
+	errno = ESRCH;
+	return -1;
+}
+
+
+/* Get stats from sockstat
+ *
+ * Each field is filled by get_sockstat_line() from the line whose first
+ * word is the label noted beside it; get_sockstat() zeroes the structure
+ * first.
+ */
+
+struct sockstat
+{
+ int socks; /* "sockets:" */
+ int tcp_mem; /* "TCP:", fifth value */
+ int tcp_total; /* "TCP:", fourth value */
+ int tcp_orphans; /* "TCP:", second value */
+ int tcp_tws; /* "TCP:", third value */
+ int tcp4_hashed; /* "TCP:", first value */
+ int udp4; /* "UDP:" */
+ int raw4; /* "RAW:" */
+ int frag4; /* "FRAG:", first value */
+ int frag4_mem; /* "FRAG:", second value */
+ int tcp6_hashed; /* "TCP6:" */
+ int udp6; /* "UDP6:" */
+ int raw6; /* "RAW6:" */
+ int frag6; /* "FRAG6:", first value */
+ int frag6_mem; /* "FRAG6:", second value */
+};
+
+/*
+ * Parse one "<label> <word value ...>" line and store its counters in s.
+ * Lines that do not split into a label and a remainder, and lines with an
+ * unknown label, leave s untouched.
+ */
+static void get_sockstat_line(char *line, struct sockstat *s)
+{
+	char id[256], rem[256];
+	/* Labels whose remainder carries a single "<word> <value>" pair. */
+	const struct {
+		const char *label;
+		int *dst;
+	} single[] = {
+		{ "sockets:", &s->socks },
+		{ "UDP:", &s->udp4 },
+		{ "UDP6:", &s->udp6 },
+		{ "RAW:", &s->raw4 },
+		{ "RAW6:", &s->raw6 },
+		{ "TCP6:", &s->tcp6_hashed },
+	};
+	unsigned int i;
+
+	if (sscanf(line, "%[^ ] %[^\n]\n", id, rem) != 2)
+		return;
+
+	for (i = 0; i < sizeof(single)/sizeof(single[0]); i++) {
+		if (strcmp(id, single[i].label) == 0) {
+			sscanf(rem, "%*s%d", single[i].dst);
+			return;
+		}
+	}
+
+	if (strcmp(id, "FRAG:") == 0) {
+		sscanf(rem, "%*s%d%*s%d", &s->frag4, &s->frag4_mem);
+		return;
+	}
+	if (strcmp(id, "FRAG6:") == 0) {
+		sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem);
+		return;
+	}
+	if (strcmp(id, "TCP:") == 0) {
+		sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%d",
+		       &s->tcp4_hashed,
+		       &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem);
+	}
+}
+
+/*
+ * Zero *s and fill it from the tables opened by net_sockstat_open() and
+ * net_sockstat6_open().  Returns -1 if the first table cannot be opened;
+ * a missing second table is not an error.  Returns 0 otherwise.
+ */
+int get_sockstat(struct sockstat *s)
+{
+	char line[256];
+	FILE *fp;
+
+	memset(s, 0, sizeof(*s));
+
+	fp = fdopen(net_sockstat_open(), "r");
+	if (fp == NULL)
+		return -1;
+	for (;;) {
+		if (fgets(line, sizeof(line), fp) == NULL)
+			break;
+		get_sockstat_line(line, s);
+	}
+	fclose(fp);
+
+	fp = fdopen(net_sockstat6_open(), "r");
+	if (fp != NULL) {
+		for (;;) {
+			if (fgets(line, sizeof(line), fp) == NULL)
+				break;
+			get_sockstat_line(line, s);
+		}
+		fclose(fp);
+	}
+
+	return 0;
+}
+
+/*
+ * Print the socket summary: overall totals, a TCP breakdown line, and a
+ * per-transport table split into IP and IPv6 columns.  Failures to read
+ * the sockstat or snmp tables are reported with perror() and the summary
+ * is still printed from whatever was obtained.  Always returns 0.
+ */
+int print_summary(void)
+{
+	struct sockstat s;
+	struct snmpstat sn;
+	int tcp_hashed, inet4, inet6;
+
+	if (get_sockstat(&s) < 0)
+		perror("ss: get_sockstat");
+	if (get_snmp_int("Tcp:", "CurrEstab", &sn.tcp_estab) < 0)
+		perror("ss: get_snmpstat");
+
+	tcp_hashed = s.tcp4_hashed+s.tcp6_hashed;
+	inet4 = s.raw4+s.udp4+s.tcp4_hashed;
+	inet6 = s.raw6+s.udp6+s.tcp6_hashed;
+
+	printf("Total: %d (kernel %d)\n", s.socks, slabstat.socks);
+
+	printf("TCP: %d (estab %d, closed %d, orphaned %d, synrecv %d, timewait %d/%d), ports %d\n",
+	       s.tcp_total + slabstat.tcp_syns + s.tcp_tws,
+	       sn.tcp_estab,
+	       s.tcp_total - (tcp_hashed-s.tcp_tws),
+	       s.tcp_orphans,
+	       slabstat.tcp_syns,
+	       s.tcp_tws, slabstat.tcp_tws,
+	       slabstat.tcp_ports
+	       );
+
+	printf("\n");
+	printf("Transport Total IP IPv6\n");
+	printf("* %-9d %-9s %-9s\n", slabstat.socks, "-", "-");
+	printf("RAW %-9d %-9d %-9d\n", s.raw4+s.raw6, s.raw4, s.raw6);
+	printf("UDP %-9d %-9d %-9d\n", s.udp4+s.udp6, s.udp4, s.udp6);
+	printf("TCP %-9d %-9d %-9d\n", tcp_hashed, s.tcp4_hashed, s.tcp6_hashed);
+	printf("INET %-9d %-9d %-9d\n", inet4+inet6, inet4, inet6);
+	printf("FRAG %-9d %-9d %-9d\n", s.frag4+s.frag6, s.frag4, s.frag6);
+
+	printf("\n");
+
+	return 0;
+}
+
+
+static void usage(void) __attribute__((noreturn));
+
+/* Write the command synopsis to stderr and terminate with status -1. */
+static void usage(void)
+{
+	static const char synopsis[] =
+"Usage: ss [ OPTIONS ]\n"
+" ss [ OPTIONS ] [ FILTER ]\n"
+"where OPTIONS := { -h[elp] | -V[ersion] | -n[umeric] | -r[esolve] |\n"
+" -a[ll] -l[istening] -o[ptions] -e[xtended] -p[rocesses]\n"
+" -A QUERY } -s[ummary]\n"
+" -f[amily] { inet | inet6 | link | unix } }\n"
+" QUERY := {all|inet|tcp|udp|raw|unix|packet|netlink}[,QUERY]\n"
+" FILTER := [ state TCP-STATE ] [ EXPRESSION ]\n";
+
+	/* The text holds no conversion specifiers, so fputs emits it verbatim. */
+	fputs(synopsis, stderr);
+	exit(-1);
+}
+
+
+/*
+ * Translate a state keyword into a bitmask of SS_* states.
+ * Aliases ("close", "all", "connected", ...) are tried first, then every
+ * name in sstate_namel[] via matches(). Returns 0 for an unknown keyword.
+ */
+int scan_state(char *state)
+{
+	const int closed = 1 << SS_CLOSE;
+	const int listening = 1 << SS_LISTEN;
+	const int syn_recv = 1 << SS_SYN_RECV;
+	const int time_wait = 1 << SS_TIME_WAIT;
+	int idx = 0;
+
+	if (!strcasecmp(state, "close") || !strcasecmp(state, "closed"))
+		return closed;
+	if (!strcasecmp(state, "syn-rcv"))
+		return syn_recv;
+	if (!matches(state, "established"))
+		return 1 << SS_ESTABLISHED;
+	if (!strcasecmp(state, "all"))
+		return SS_ALL;
+	if (!strcasecmp(state, "connected"))
+		return SS_ALL & ~(closed | listening);
+	if (!matches(state, "synchronized"))
+		return SS_ALL & ~(closed | listening | (1 << SS_SYN_SENT));
+	if (!strcasecmp(state, "bucket"))
+		return syn_recv | time_wait;
+	if (!strcasecmp(state, "big"))
+		return SS_ALL & ~(syn_recv | time_wait);
+
+	/* Fall back to the table of individual state names. */
+	while (idx < SS_MAX) {
+		if (!matches(state, sstate_namel[idx]))
+			return 1 << idx;
+		idx++;
+	}
+	return 0;
+}
+
+
+/*
+ * ss entry point.
+ *
+ * Parses the options into current_filter, optionally prints the summary,
+ * narrows the socket tables and families to show, parses the trailing
+ * state/expression filter, then either dumps raw tcpdiag data (-D) or
+ * prints the header line and every selected socket table.
+ */
+int main(int argc, char *argv[])
+{
+	int do_default = 1;
+	int saw_states = 0;
+	int saw_query = 0;
+	int do_summary = 0;
+	char *dump_tcpdiag = NULL;
+	FILE *filter_fp = NULL;
+	int ch;
+
+	memset(&current_filter, 0, sizeof(current_filter));
+
+	current_filter.states = default_filter.states;
+
+	/* 'f' takes an argument: case 'f' below reads optarg. */
+	while ((ch = getopt(argc, argv, "h?aletuwxnro460spf:miA:D:F:vV")) != EOF) {
+		switch(ch) {
+		case 'n':
+			resolve_services = 0;
+			break;
+		case 'r':
+			resolve_hosts = 1;
+			break;
+		case 'o':
+			show_options = 1;
+			break;
+		case 'e':
+			show_options = 1;
+			show_details++;
+			break;
+		case 'm':
+			show_mem = 1;
+			break;
+		case 'i':
+			show_tcpinfo = 1;
+			break;
+		case 'p':
+			show_users++;
+			break;
+		case 't':
+			current_filter.dbs |= (1<<TCP_DB);
+			do_default = 0;
+			break;
+		case 'u':
+			current_filter.dbs |= (1<<UDP_DB);
+			do_default = 0;
+			break;
+		case 'w':
+			current_filter.dbs |= (1<<RAW_DB);
+			do_default = 0;
+			break;
+		case 'x':
+			current_filter.dbs |= UNIX_DBM;
+			do_default = 0;
+			break;
+		case 'a':
+			current_filter.states = SS_ALL;
+			break;
+		case 'l':
+			current_filter.states = (1<<SS_LISTEN);
+			break;
+		case '4':
+			preferred_family = AF_INET;
+			break;
+		case '6':
+			preferred_family = AF_INET6;
+			break;
+		case '0':
+			preferred_family = AF_PACKET;
+			break;
+		case 'f':
+			if (strcmp(optarg, "inet") == 0)
+				preferred_family = AF_INET;
+			else if (strcmp(optarg, "inet6") == 0)
+				preferred_family = AF_INET6;
+			else if (strcmp(optarg, "link") == 0)
+				preferred_family = AF_PACKET;
+			else if (strcmp(optarg, "unix") == 0)
+				preferred_family = AF_UNIX;
+			else if (strcmp(optarg, "netlink") == 0)
+				preferred_family = AF_NETLINK;
+			else if (strcmp(optarg, "help") == 0)
+				usage();
+			else {
+				fprintf(stderr, "ss: \"%s\" is invalid family\n", optarg);
+				usage();
+			}
+			break;
+		case 'A':
+		{
+			char *p, *p1;
+			if (!saw_query) {
+				current_filter.dbs = 0;
+				saw_query = 1;
+				do_default = 0;
+			}
+			/* Walk the comma-separated list, splitting it in place. */
+			p = p1 = optarg;
+			do {
+				if ((p1 = strchr(p, ',')) != NULL)
+					*p1 = 0;
+				if (strcmp(p, "all") == 0) {
+					current_filter.dbs = ALL_DB;
+				} else if (strcmp(p, "inet") == 0) {
+					current_filter.dbs |= (1<<TCP_DB)|(1<<UDP_DB)|(1<<RAW_DB);
+				} else if (strcmp(p, "udp") == 0) {
+					current_filter.dbs |= (1<<UDP_DB);
+				} else if (strcmp(p, "tcp") == 0) {
+					current_filter.dbs |= (1<<TCP_DB);
+				} else if (strcmp(p, "raw") == 0) {
+					current_filter.dbs |= (1<<RAW_DB);
+				} else if (strcmp(p, "unix") == 0) {
+					current_filter.dbs |= UNIX_DBM;
+				} else if (matches(p, "unix_stream") == 0 ||
+					   strcmp(p, "u_str") == 0) {
+					current_filter.dbs |= (1<<UNIX_ST_DB);
+				} else if (matches(p, "unix_dgram") == 0 ||
+					   strcmp(p, "u_dgr") == 0) {
+					current_filter.dbs |= (1<<UNIX_DG_DB);
+				} else if (strcmp(p, "packet") == 0) {
+					current_filter.dbs |= PACKET_DBM;
+				} else if (strcmp(p, "packet_raw") == 0 ||
+					   strcmp(p, "p_raw") == 0) {
+					current_filter.dbs |= (1<<PACKET_R_DB);
+				} else if (strcmp(p, "packet_dgram") == 0 ||
+					   strcmp(p, "p_dgr") == 0) {
+					current_filter.dbs |= (1<<PACKET_DG_DB);
+				} else if (strcmp(p, "netlink") == 0) {
+					current_filter.dbs |= (1<<NETLINK_DB);
+				} else {
+					fprintf(stderr, "ss: \"%s\" is illegal socket table id\n", p);
+					usage();
+				}
+				/* Only step past a comma that was actually found. */
+				if (p1)
+					p = p1 + 1;
+			} while (p1);
+			break;
+		}
+		case 's':
+			do_summary = 1;
+			break;
+		case 'D':
+			dump_tcpdiag = optarg;
+			break;
+		case 'F':
+			if (filter_fp) {
+				fprintf(stderr, "More than one filter file\n");
+				exit(-1);
+			}
+			if (optarg[0] == '-')
+				filter_fp = stdin;
+			else
+				filter_fp = fopen(optarg, "r");
+			if (!filter_fp) {
+				perror("fopen filter file");
+				exit(-1);
+			}
+			break;
+		case 'v':
+		case 'V':
+			printf("ss utility, iproute2-ss%s\n", SNAPSHOT);
+			exit(0);
+		case 'h':
+		case '?':
+		default:
+			usage();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	get_slabstat(&slabstat);
+
+	if (do_summary) {
+		print_summary();
+		if (do_default && argc == 0)
+			exit(0);
+	}
+
+	if (do_default)
+		current_filter.dbs = default_filter.dbs;
+
+	/* Infer the family when only tables of a single family were selected. */
+	if (preferred_family == AF_UNSPEC) {
+		if (!(current_filter.dbs&~UNIX_DBM))
+			preferred_family = AF_UNIX;
+		else if (!(current_filter.dbs&~PACKET_DBM))
+			preferred_family = AF_PACKET;
+		else if (!(current_filter.dbs&~(1<<NETLINK_DB)))
+			preferred_family = AF_NETLINK;
+	}
+
+	if (preferred_family != AF_UNSPEC) {
+		int mask2;
+		if (preferred_family == AF_INET ||
+		    preferred_family == AF_INET6) {
+			mask2= (1<<TCP_DB);
+			/*
+			 * With explicit table selection UDP and RAW are allowed
+			 * in addition to TCP; overwriting the mask here would
+			 * make "-t -4" select nothing.
+			 */
+			if (!do_default)
+				mask2 |= (1<<UDP_DB)|(1<<RAW_DB);
+		} else if (preferred_family == AF_PACKET) {
+			mask2 = PACKET_DBM;
+		} else if (preferred_family == AF_UNIX) {
+			mask2 = UNIX_DBM;
+		} else if (preferred_family == AF_NETLINK) {
+			mask2 = (1<<NETLINK_DB);
+		} else {
+			mask2 = 0;
+		}
+
+		if (do_default)
+			current_filter.dbs = mask2;
+		else
+			current_filter.dbs &= mask2;
+		current_filter.families = (1<<preferred_family);
+	} else {
+		if (!do_default)
+			current_filter.families = ~0;
+		else
+			current_filter.families = default_filter.families;
+	}
+	if (current_filter.dbs == 0) {
+		fprintf(stderr, "ss: no socket tables to show with such filter.\n");
+		exit(0);
+	}
+	if (current_filter.families == 0) {
+		fprintf(stderr, "ss: no families to show with such filter.\n");
+		exit(0);
+	}
+
+	if (resolve_services && resolve_hosts &&
+	    (current_filter.dbs&(UNIX_DBM|(1<<TCP_DB)|(1<<UDP_DB))))
+		init_service_resolver();
+
+	/* Now parse filter... */
+	if (argc == 0 && filter_fp) {
+		if (ssfilter_parse(&current_filter.f, 0, NULL, filter_fp))
+			usage();
+	}
+
+	while (argc > 0) {
+		if (strcmp(*argv, "state") == 0) {
+			NEXT_ARG();
+			if (!saw_states)
+				current_filter.states = 0;
+			current_filter.states |= scan_state(*argv);
+			saw_states = 1;
+		} else if (strcmp(*argv, "exclude") == 0 ||
+			   strcmp(*argv, "excl") == 0) {
+			NEXT_ARG();
+			if (!saw_states)
+				current_filter.states = SS_ALL;
+			current_filter.states &= ~scan_state(*argv);
+			saw_states = 1;
+		} else {
+			/* Everything that remains is the filter expression. */
+			if (ssfilter_parse(&current_filter.f, argc, argv, filter_fp))
+				usage();
+			break;
+		}
+		argc--; argv++;
+	}
+
+	if (current_filter.states == 0) {
+		fprintf(stderr, "ss: no socket states to show with such filter.\n");
+		exit(0);
+	}
+
+	if (dump_tcpdiag) {
+		FILE *dump_fp = stdout;
+		if (!(current_filter.dbs & (1<<TCP_DB))) {
+			fprintf(stderr, "ss: tcpdiag dump requested and no tcp in filter.\n");
+			exit(0);
+		}
+		if (dump_tcpdiag[0] != '-') {
+			dump_fp = fopen(dump_tcpdiag, "w");
+			/* Check the stream just opened, not the file name. */
+			if (!dump_fp) {
+				perror("fopen dump file");
+				exit(-1);
+			}
+		}
+		tcp_show_netlink(&current_filter, dump_fp);
+		fflush(dump_fp);
+		exit(0);
+	}
+
+	/* x&(x-1) is non-zero when more than one bit is set: the column is
+	 * only needed when several tables or states are shown. */
+	netid_width = 0;
+	if (current_filter.dbs&(current_filter.dbs-1))
+		netid_width = 5;
+
+	state_width = 0;
+	if (current_filter.states&(current_filter.states-1))
+		state_width = 10;
+
+	screen_width = 80;
+	if (isatty(STDOUT_FILENO)) {
+		struct winsize w;
+
+		if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) != -1) {
+			if (w.ws_col > 0)
+				screen_width = w.ws_col;
+		}
+	}
+
+	addrp_width = screen_width;
+	addrp_width -= netid_width+1;
+	addrp_width -= state_width+1;
+	addrp_width -= 14;
+
+	/* Give an odd leftover column to the first fixed-width field shown. */
+	if (addrp_width&1) {
+		if (netid_width)
+			netid_width++;
+		else if (state_width)
+			state_width++;
+	}
+
+	addrp_width /= 2;
+	addrp_width--;
+
+	serv_width = resolve_services ? 7 : 5;
+
+	if (addrp_width < 15+serv_width+1)
+		addrp_width = 15+serv_width+1;
+
+	addr_width = addrp_width - serv_width - 1;
+
+	if (netid_width)
+		printf("%-*s ", netid_width, "Netid");
+	if (state_width)
+		printf("%-*s ", state_width, "State");
+	printf("%-6s %-6s ", "Recv-Q", "Send-Q");
+
+	printf("%*s:%-*s %*s:%-*s\n",
+	       addr_width, "Local Address", serv_width, "Port",
+	       addr_width, "Peer Address", serv_width, "Port");
+
+//printf("%08x %08x %08x\n", current_filter.dbs, current_filter.states, current_filter.families);
+	fflush(stdout);
+
+	if (current_filter.dbs & (1<<NETLINK_DB))
+		netlink_show(&current_filter);
+	if (current_filter.dbs & PACKET_DBM)
+		packet_show(&current_filter);
+	if (current_filter.dbs & UNIX_DBM)
+		unix_show(&current_filter);
+	if (current_filter.dbs & (1<<RAW_DB))
+		raw_show(&current_filter);
+	if (current_filter.dbs & (1<<UDP_DB))
+		udp_show(&current_filter);
+	if (current_filter.dbs & (1<<TCP_DB))
+		tcp_show(&current_filter);
+	return 0;
+}
diff --git a/misc/ssfilter.h b/misc/ssfilter.h
index e69de29b..00b92e3d 100644
--- a/misc/ssfilter.h
+++ b/misc/ssfilter.h
@@ -0,0 +1,21 @@
+/* Node types of the filter expression tree built by the ssfilter.y grammar. */
+#define SSF_DCOND 0 /* "dst HOSTCOND" or "dport = HOSTCOND" */
+#define SSF_SCOND 1 /* "src HOSTCOND" or "sport = HOSTCOND" */
+#define SSF_OR 2 /* expr | expr */
+#define SSF_AND 3 /* expr & expr, or two adjacent exprs */
+#define SSF_NOT 4 /* ! expr; also wraps the >, < and != comparisons */
+#define SSF_D_GE 5 /* dport >= HOSTCOND */
+#define SSF_D_LE 6 /* dport <= HOSTCOND */
+#define SSF_S_GE 7 /* sport >= HOSTCOND */
+#define SSF_S_LE 8 /* sport <= HOSTCOND */
+#define SSF_S_AUTO 9 /* "autobound" */
+
+/*
+ * One node of the filter tree.
+ * pred: first operand of OR/AND, the operand of NOT, or for the
+ * condition nodes the value returned by parse_hostcond().
+ * post: second operand of OR/AND; NULL for every other node.
+ */
+struct ssfilter
+{
+ int type;
+ struct ssfilter *post;
+ struct ssfilter *pred;
+};
+
+/* Parses argv and/or fp into *f; returns 0 on success, -1 on a parse error. */
+int ssfilter_parse(struct ssfilter **f, int argc, char **argv, FILE *fp);
+/* Called by the lexer on each operand token; a NULL result is treated as failure. */
+void *parse_hostcond(char*);
+
+
diff --git a/misc/ssfilter.y b/misc/ssfilter.y
index e69de29b..f47ab2fd 100644
--- a/misc/ssfilter.y
+++ b/misc/ssfilter.y
@@ -0,0 +1,274 @@
+%{
+
+#include <stdio.h>
+#include <malloc.h>
+#include <string.h>
+#include "ssfilter.h"
+
+typedef struct ssfilter * ssfilter_t;
+
+#define YYSTYPE ssfilter_t
+
+/*
+ * Allocate a filter tree node of the given type whose pred link is set to
+ * pred and whose post link is NULL. Aborts the process if memory runs out.
+ */
+static struct ssfilter * alloc_node(int type, void *pred)
+{
+	struct ssfilter *node;
+
+	node = malloc(sizeof(struct ssfilter));
+	if (!node)
+		abort();
+
+	node->post = NULL;
+	node->pred = pred;
+	node->type = type;
+	return node;
+}
+
+static char **yy_argv;
+static int yy_argc;
+static FILE *yy_fp;
+static ssfilter_t *yy_ret;
+
+static int yylex(void);
+
+/*
+ * Parser error hook: print the message s on stderr. No newline is written
+ * here; ssfilter_parse() finishes the line when yyparse() fails.
+ */
+static void yyerror(char *s)
+{
+ fprintf(stderr, "ss: bison bellows (while parsing filter): \"%s!\"", s);
+}
+
+%}
+
+%token HOSTCOND DCOND SCOND DPORT SPORT LEQ GEQ NEQ AUTOBOUND
+%left '|'
+%left '&'
+%nonassoc '!'
+
+%%
+/* Top level: an optional expression; its tree is stored through yy_ret. */
+applet: null expr
+ {
+ *yy_ret = $2;
+ $$ = $2;
+ }
+ | null
+ ;
+null: /* NOTHING */ { $$ = NULL; }
+ ;
+/* Address conditions: the HOSTCOND value becomes the node's pred link. */
+expr: DCOND HOSTCOND
+ {
+ $$ = alloc_node(SSF_DCOND, $2);
+ }
+ | SCOND HOSTCOND
+ {
+ $$ = alloc_node(SSF_SCOND, $2);
+ }
+ /* Destination port comparisons; >, < and != are built as NOT of <=, >= and =. */
+ | DPORT GEQ HOSTCOND
+ {
+ $$ = alloc_node(SSF_D_GE, $3);
+ }
+ | DPORT LEQ HOSTCOND
+ {
+ $$ = alloc_node(SSF_D_LE, $3);
+ }
+ | DPORT '>' HOSTCOND
+ {
+ $$ = alloc_node(SSF_NOT, alloc_node(SSF_D_LE, $3));
+ }
+ | DPORT '<' HOSTCOND
+ {
+ $$ = alloc_node(SSF_NOT, alloc_node(SSF_D_GE, $3));
+ }
+ | DPORT '=' HOSTCOND
+ {
+ $$ = alloc_node(SSF_DCOND, $3);
+ }
+ | DPORT NEQ HOSTCOND
+ {
+ $$ = alloc_node(SSF_NOT, alloc_node(SSF_DCOND, $3));
+ }
+
+ /* Source port comparisons, same scheme as for DPORT. */
+ | SPORT GEQ HOSTCOND
+ {
+ $$ = alloc_node(SSF_S_GE, $3);
+ }
+ | SPORT LEQ HOSTCOND
+ {
+ $$ = alloc_node(SSF_S_LE, $3);
+ }
+ | SPORT '>' HOSTCOND
+ {
+ $$ = alloc_node(SSF_NOT, alloc_node(SSF_S_LE, $3));
+ }
+ | SPORT '<' HOSTCOND
+ {
+ $$ = alloc_node(SSF_NOT, alloc_node(SSF_S_GE, $3));
+ }
+ | SPORT '=' HOSTCOND
+ {
+ $$ = alloc_node(SSF_SCOND, $3);
+ }
+ | SPORT NEQ HOSTCOND
+ {
+ $$ = alloc_node(SSF_NOT, alloc_node(SSF_SCOND, $3));
+ }
+
+ | AUTOBOUND
+ {
+ $$ = alloc_node(SSF_S_AUTO, NULL);
+ }
+ /* Boolean combinations: left operand in pred, right operand in post. */
+ | expr '|' expr
+ {
+ $$ = alloc_node(SSF_OR, $1);
+ $$->post = $3;
+ }
+ /* Two adjacent expressions are an implicit AND. */
+ | expr expr
+ {
+ $$ = alloc_node(SSF_AND, $1);
+ $$->post = $2;
+ }
+ | expr '&' expr
+
+ {
+ $$ = alloc_node(SSF_AND, $1);
+ $$->post = $3;
+ }
+ | '!' expr
+ {
+ $$ = alloc_node(SSF_NOT, $2);
+ }
+ | '(' expr ')'
+ {
+ $$ = $2;
+ }
+;
+%%
+
+/*
+ * Cut the next blank- or tab-delimited token out of the string at *ptr.
+ *
+ * The buffer is modified in place: the token is NUL-terminated and
+ * backslashes are removed. Returns a pointer to the token and advances
+ * *ptr past it, or returns NULL (with *ptr at the terminating NUL) when
+ * only blanks remain.
+ */
+static char *get_token_from_line(char **ptr)
+{
+ char *tok, *cp = *ptr;
+
+ /* Skip leading separators. */
+ while (*cp == ' ' || *cp == '\t') cp++;
+
+ if (*cp == 0) {
+ *ptr = cp;
+ return NULL;
+ }
+
+ tok = cp;
+
+ while (*cp != 0 && *cp != ' ' && *cp != '\t') {
+ /* Backslash escapes everything. */
+ if (*cp == '\\') {
+ char *tp;
+ /* Shift the token text collected so far one byte to the right,
+ * overwriting the backslash; the token start moves with it. */
+ for (tp = cp; tp != tok; tp--)
+ *tp = *(tp-1);
+ cp++;
+ tok++;
+ /* A trailing backslash has nothing left to escape. */
+ if (*cp == 0)
+ break;
+ }
+ /* Step over the current character; after a backslash this is the
+ * escaped one, so an escaped blank does not end the token. */
+ cp++;
+ }
+ if (*cp)
+ *cp++ = 0;
+ *ptr = cp;
+ return tok;
+}
+
+/*
+ * Lexer for the filter grammar.
+ *
+ * Tokens are taken first from yy_argv[0..yy_argc-1] and then, if yy_fp is
+ * set, from its lines; lines starting with '#' and empty lines are skipped.
+ * Keywords and operators map to their grammar tokens; any other word is
+ * handed to parse_hostcond() and returned as HOSTCOND with the result in
+ * yylval. Returns 0 at end of input and exits on an over-long line or an
+ * unparsable operand.
+ */
+int yylex(void)
+{
+	static char argbuf[1024];
+	static char *tokptr = argbuf;
+	static int argc;
+	char *curtok;
+
+	do {
+		while (*tokptr == 0) {
+			tokptr = NULL;
+			if (argc < yy_argc) {
+				tokptr = yy_argv[argc];
+				argc++;
+			} else if (yy_fp) {
+				while (tokptr == NULL) {
+					size_t len;
+
+					if (fgets(argbuf, sizeof(argbuf), yy_fp) == NULL)
+						return 0;
+					len = strlen(argbuf);
+					if (len > 0 && argbuf[len-1] == '\n') {
+						argbuf[--len] = 0;
+					} else if (len == sizeof(argbuf) - 1) {
+						/* Buffer full and no newline seen: the
+						 * line did not fit. */
+						fprintf(stderr, "Too long line in filter");
+						exit(-1);
+					}
+					/* Skip comments and empty lines. */
+					if (argbuf[0] == '#' || argbuf[0] == 0)
+						continue;
+					tokptr = argbuf;
+				}
+			} else {
+				return 0;
+			}
+		}
+	} while ((curtok = get_token_from_line(&tokptr)) == NULL);
+
+	if (strcmp(curtok, "!") == 0 ||
+	    strcmp(curtok, "not") == 0)
+		return '!';
+	if (strcmp(curtok, "&") == 0 ||
+	    strcmp(curtok, "&&") == 0 ||
+	    strcmp(curtok, "and") == 0)
+		return '&';
+	if (strcmp(curtok, "|") == 0 ||
+	    strcmp(curtok, "||") == 0 ||
+	    strcmp(curtok, "or") == 0)
+		return '|';
+	if (strcmp(curtok, "(") == 0)
+		return '(';
+	if (strcmp(curtok, ")") == 0)
+		return ')';
+	if (strcmp(curtok, "dst") == 0)
+		return DCOND;
+	if (strcmp(curtok, "src") == 0)
+		return SCOND;
+	if (strcmp(curtok, "dport") == 0)
+		return DPORT;
+	if (strcmp(curtok, "sport") == 0)
+		return SPORT;
+	if (strcmp(curtok, ">=") == 0 ||
+	    strcmp(curtok, "ge") == 0 ||
+	    strcmp(curtok, "geq") == 0)
+		return GEQ;
+	if (strcmp(curtok, "<=") == 0 ||
+	    strcmp(curtok, "le") == 0 ||
+	    strcmp(curtok, "leq") == 0)
+		return LEQ;
+	if (strcmp(curtok, "!=") == 0 ||
+	    strcmp(curtok, "ne") == 0 ||
+	    strcmp(curtok, "neq") == 0)
+		return NEQ;
+	if (strcmp(curtok, "=") == 0 ||
+	    strcmp(curtok, "==") == 0 ||
+	    strcmp(curtok, "eq") == 0)
+		return '=';
+	if (strcmp(curtok, ">") == 0 ||
+	    strcmp(curtok, "gt") == 0)
+		return '>';
+	if (strcmp(curtok, "<") == 0 ||
+	    strcmp(curtok, "lt") == 0)
+		return '<';
+	if (strcmp(curtok, "autobound") == 0)
+		return AUTOBOUND;
+	yylval = (void*)parse_hostcond(curtok);
+	if (yylval == NULL) {
+		fprintf(stderr, "Cannot parse dst/src address.\n");
+		exit(1);
+	}
+	return HOSTCOND;
+}
+
+/*
+ * Parse a filter expression taken from argv[0..argc-1] and, if fp is not
+ * NULL, from the lines of fp. A parsed expression tree is stored in *f.
+ * Returns 0 on success and -1 on a parse error.
+ */
+int ssfilter_parse(struct ssfilter **f, int argc, char **argv, FILE *fp)
+{
+	/* Hand the inputs to yylex() and the result slot to the grammar. */
+	yy_ret = f;
+	yy_fp = fp;
+	yy_argv = argv;
+	yy_argc = argc;
+
+	if (yyparse() == 0)
+		return 0;
+
+	/* Completes the line started by yyerror(). */
+	fprintf(stderr, " Sorry.\n");
+	return -1;
+}
diff --git a/tc/Makefile b/tc/Makefile
index e69de29b..ec1d3399 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -0,0 +1,54 @@
+# Objects that are always part of the tc binary.
+TCOBJ=tc.o tc_qdisc.o tc_class.o tc_filter.o tc_util.o m_police.o m_estimator.o
+
+# Presumably provides the TC_CONFIG_* switches tested below -- confirm.
+include ../Config
+
+# Modules that are built unconditionally.
+TCMODULES :=
+TCMODULES += q_fifo.o
+TCMODULES += q_sfq.o
+TCMODULES += q_red.o
+TCMODULES += q_prio.o
+TCMODULES += q_tbf.o
+TCMODULES += q_cbq.o
+TCMODULES += f_rsvp.o
+TCMODULES += f_u32.o
+TCMODULES += f_route.o
+TCMODULES += f_fw.o
+# Modules added only when TC_CONFIG_DIFFSERV is "y".
+ifeq ($(TC_CONFIG_DIFFSERV),y)
+ TCMODULES += q_dsmark.o
+ TCMODULES += q_gred.o
+ TCMODULES += f_tcindex.o
+ TCMODULES += q_ingress.o
+endif
+# The ATM module also needs libatm at link time.
+ifeq ($(TC_CONFIG_ATM),y)
+ TCMODULES += q_atm.o
+ LDLIBS += -latm
+endif
+
+# Disabled modules.
+#TCMODULES += q_csz.o
+#TCMODULES += q_hpfq.o
+#TCMODULES += q_hfsc.o
+
+TCOBJ += $(TCMODULES)
+
+# Objects archived into libtc.a.
+TCLIB := tc_core.o
+TCLIB += tc_red.o
+TCLIB += tc_cbq.o
+TCLIB += tc_estimator.o
+
+LDLIBS += -L. -ltc -lm -ldl
+LDFLAGS += -Wl,-export-dynamic
+
+all: libtc.a tc
+
+# No recipe here: tc is linked by make's built-in rule from these prerequisites.
+tc: $(TCOBJ) $(LIBNETLINK) $(LIBUTIL) $(TCLIB)
+
+libtc.a: $(TCLIB)
+ $(AR) rcs $@ $(TCLIB)
+
+# Installs the stripped tc binary (-s) into $(DESTDIR)$(SBINDIR).
+install: all
+ install -m 0755 -s tc $(DESTDIR)$(SBINDIR)
+
+
+clean:
+ rm -f $(TCOBJ) $(TCLIB) libtc.a tc
diff --git a/tc/README.last b/tc/README.last
index e69de29b..9400438a 100644
--- a/tc/README.last
+++ b/tc/README.last
@@ -0,0 +1,47 @@
+Kernel code and interface.
+--------------------------
+
+* Compile time switches
+
+There is only one, but very important, compile time switch.
+It is not settable by "make config", but should be selected
+manually and after a bit of thinking in <include/net/pkt_sched.h>
+
+PSCHED_CLOCK_SOURCE can take three values:
+
+ PSCHED_GETTIMEOFDAY
+ PSCHED_JIFFIES
+ PSCHED_CPU
+
+
+ PSCHED_GETTIMEOFDAY
+
+Default setting is the most conservative PSCHED_GETTIMEOFDAY.
+It is very slow both because of weird slowness of do_gettimeofday()
+and because it forces code to use unnatural "timeval" format,
+where microseconds and seconds fields are separate.
+Besides that, it will misbehave when delays exceed 2 seconds
+(e.g. very slow links or classes bounded to a small slice of bandwidth).
+To summarize: as soon as you get it working, select the correct clock
+source and forget about PSCHED_GETTIMEOFDAY forever.
+
+
+ PSCHED_JIFFIES
+
+Clock is derived from jiffies. On architectures with HZ=100
+granularity of this clock is not enough to make reasonable
+bindings to real time. However, taking into account Linux
+architecture problems, which force us to use artificial
+integrated clock in any case, this switch is not so bad
+for scheduling even on high speed networks, though policing
+is not reliable.
+
+
+ PSCHED_CPU
+
+It is available only for alpha and pentiums with correct
+CPU timestamp. It is the fastest way, use it when it is available,
+but remember: not all pentiums have this facility, and
+a lot of them have clock, broken by APM etc. etc.
+
+
diff --git a/tc/f_fw.c b/tc/f_fw.c
index e69de29b..3c5e3e2f 100644
--- a/tc/f_fw.c
+++ b/tc/f_fw.c
@@ -0,0 +1,116 @@
+/*
+ * f_fw.c FW filter.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+/* Print the option summary of the fw filter on stderr. */
+static void explain(void)
+{
+	static const char help[] =
+		"Usage: ... fw [ classid CLASSID ] [ police POLICE_SPEC ]\n"
+		" POLICE_SPEC := ... look at TBF\n"
+		" CLASSID := X:Y\n";
+
+	/* No conversion specifiers in the text, so fputs emits it unchanged. */
+	fputs(help, stderr);
+}
+
+#define usage() return(-1)
+
+/*
+ * Parse the command-line options of the fw filter into netlink message n.
+ *
+ * handle, if given, is parsed into the message's tcm_handle. The remaining
+ * arguments (classid/flowid, police) are appended as attributes nested in
+ * a TCA_OPTIONS attribute. Returns 0 on success, -1 on any parse error.
+ */
+static int fw_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tcmsg *t = NLMSG_DATA(n);
+	struct rtattr *tail;
+
+	if (handle) {
+		if (get_u32(&t->tcm_handle, handle, 0)) {
+			fprintf(stderr, "Illegal \"handle\"\n");
+			return -1;
+		}
+	}
+
+	if (argc == 0)
+		return 0;
+
+	/* Remember where TCA_OPTIONS starts so its length can be fixed up below. */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, 4096, TCA_OPTIONS, NULL, 0);
+
+	while (argc > 0) {
+		if (matches(*argv, "classid") == 0 ||
+		    matches(*argv, "flowid") == 0) {
+			unsigned classid;
+			NEXT_ARG();
+			if (get_tc_classid(&classid, *argv)) {
+				fprintf(stderr, "Illegal \"classid\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_FW_CLASSID, &classid, 4);
+		} else if (matches(*argv, "police") == 0) {
+			NEXT_ARG();
+			if (parse_police(&argc, &argv, TCA_FW_POLICE, n)) {
+				fprintf(stderr, "Illegal \"police\"\n");
+				return -1;
+			}
+			/* parse_police() has already advanced argc/argv. */
+			continue;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+	/* TCA_OPTIONS spans everything appended since tail was recorded. */
+	tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail;
+	return 0;
+}
+
+/*
+ * Print the attributes nested in opt for an fw filter on f: the handle
+ * when non-zero, then the class id and the policer if present.
+ * Always returns 0; nothing is printed when opt is NULL.
+ */
+static int fw_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle)
+{
+	struct rtattr *tb[TCA_FW_MAX+1];
+
+	if (!opt)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_FW_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (handle != 0)
+		fprintf(f, "handle 0x%x ", handle);
+
+	if (tb[TCA_FW_CLASSID] != NULL) {
+		SPRINT_BUF(b1);
+		__u32 classid = *(__u32*)RTA_DATA(tb[TCA_FW_CLASSID]);
+
+		fprintf(f, "classid %s ", sprint_tc_classid(classid, b1));
+	}
+
+	if (tb[TCA_FW_POLICE] != NULL)
+		tc_print_police(f, tb[TCA_FW_POLICE]);
+
+	return 0;
+}
+
+/*
+ * Descriptor of the "fw" filter: its name plus the option parser and
+ * printer defined above. The initializer is positional, so the meaning of
+ * the leading NULL depends on struct filter_util's layout (declared
+ * elsewhere) -- presumably a list link; confirm.
+ */
+struct filter_util fw_util = {
+ NULL,
+ "fw",
+ fw_parse_opt,
+ fw_print_opt,
+};
diff --git a/tc/f_route.c b/tc/f_route.c
index e69de29b..f13c28b5 100644
--- a/tc/f_route.c
+++ b/tc/f_route.c
@@ -0,0 +1,175 @@
+/*
+ * f_route.c ROUTE filter.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "rt_names.h"
+#include "tc_util.h"
+
+/* Print the option summary of the route filter on stderr. */
+static void explain(void)
+{
+	static const char help[] =
+		"Usage: ... route [ from REALM | fromif TAG ] [ to REALM ]\n"
+		" [ flowid CLASSID ] [ police POLICE_SPEC ]\n"
+		" POLICE_SPEC := ... look at TBF\n"
+		" CLASSID := X:Y\n";
+
+	/* No conversion specifiers in the text, so fputs emits it unchanged. */
+	fputs(help, stderr);
+}
+
+#define usage() return(-1)
+
+/*
+ * Parse the command-line options of the route filter into netlink message n.
+ *
+ * handle, if given, is parsed into tcm_handle. The options (to, from,
+ * fromif, classid/flowid, police, order) are appended as attributes nested
+ * in TCA_OPTIONS. When no handle was supplied, one is derived from the
+ * to/from/fromif/order values. Returns 0 on success, -1 on a parse error.
+ */
+static int route_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tcmsg *t = NLMSG_DATA(n);
+	struct rtattr *tail;
+	__u32 fh = 0xFFFF8000;
+	__u32 order = 0;
+
+	if (handle) {
+		if (get_u32(&t->tcm_handle, handle, 0)) {
+			fprintf(stderr, "Illegal \"handle\"\n");
+			return -1;
+		}
+	}
+
+	if (argc == 0)
+		return 0;
+
+	/* Remember where TCA_OPTIONS starts so its length can be fixed up below. */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, 4096, TCA_OPTIONS, NULL, 0);
+
+	while (argc > 0) {
+		if (matches(*argv, "to") == 0) {
+			__u32 id;
+			NEXT_ARG();
+			if (rtnl_rtrealm_a2n(&id, *argv)) {
+				fprintf(stderr, "Illegal \"to\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_ROUTE4_TO, &id, 4);
+			/* Low byte of the handle carries the "to" realm. */
+			fh &= ~0x80FF;
+			fh |= id&0xFF;
+		} else if (matches(*argv, "from") == 0) {
+			__u32 id;
+			NEXT_ARG();
+			if (rtnl_rtrealm_a2n(&id, *argv)) {
+				fprintf(stderr, "Illegal \"from\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_ROUTE4_FROM, &id, 4);
+			/* Upper 16 bits of the handle carry the "from" realm. */
+			fh &= 0xFFFF;
+			fh |= id<<16;
+		} else if (matches(*argv, "fromif") == 0) {
+			struct rtnl_handle rth;
+			__u32 id;
+			NEXT_ARG();
+			if (rtnl_open(&rth, 0) == 0) {
+				ll_init_map(&rth);
+				rtnl_close(&rth);
+			}
+			/* id is unsigned, so zero is the only failure value
+			 * that can be detected here. */
+			id = ll_name_to_index(*argv);
+			if (id == 0) {
+				fprintf(stderr, "Illegal \"fromif\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_ROUTE4_IIF, &id, 4);
+			/* Same bits as "from", with bit 15 of that half set. */
+			fh &= 0xFFFF;
+			fh |= (0x8000|id)<<16;
+		} else if (matches(*argv, "classid") == 0 ||
+			   strcmp(*argv, "flowid") == 0) {
+			unsigned classid;
+			NEXT_ARG();
+			if (get_tc_classid(&classid, *argv)) {
+				fprintf(stderr, "Illegal \"classid\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_ROUTE4_CLASSID, &classid, 4);
+		} else if (matches(*argv, "police") == 0) {
+			NEXT_ARG();
+			if (parse_police(&argc, &argv, TCA_ROUTE4_POLICE, n)) {
+				fprintf(stderr, "Illegal \"police\"\n");
+				return -1;
+			}
+			/* parse_police() has already advanced argc/argv. */
+			continue;
+		} else if (matches(*argv, "order") == 0) {
+			NEXT_ARG();
+			if (get_u32(&order, *argv, 0)) {
+				fprintf(stderr, "Illegal \"order\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+	/* TCA_OPTIONS spans everything appended since tail was recorded. */
+	tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail;
+	if (order) {
+		/* Bits 8..14 of the handle carry the order. */
+		fh &= ~0x7F00;
+		fh |= (order<<8)&0x7F00;
+	}
+	/* An explicit handle wins over the derived one. */
+	if (!t->tcm_handle)
+		t->tcm_handle = fh;
+	return 0;
+}
+
+/*
+ * Print the attributes nested in opt for a route filter on f: the handle
+ * and the order encoded in it, then flowid, to, from, fromif and the
+ * policer if present. Always returns 0; nothing is printed when opt is NULL.
+ */
+static int route_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle)
+{
+	struct rtattr *tb[TCA_ROUTE4_MAX+1];
+	SPRINT_BUF(b1);
+
+	if (opt == NULL)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_ROUTE4_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (handle)
+		fprintf(f, "fh 0x%08x ", handle);
+	if (handle&0x7F00)
+		fprintf(f, "order %d ", (handle>>8)&0x7F);
+
+	/* b1 is reused: each string is printed before the next conversion. */
+	if (tb[TCA_ROUTE4_CLASSID])
+		fprintf(f, "flowid %s ", sprint_tc_classid(*(__u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID]), b1));
+	if (tb[TCA_ROUTE4_TO])
+		fprintf(f, "to %s ", rtnl_rtrealm_n2a(*(__u32*)RTA_DATA(tb[TCA_ROUTE4_TO]), b1, sizeof(b1)));
+	if (tb[TCA_ROUTE4_FROM])
+		fprintf(f, "from %s ", rtnl_rtrealm_n2a(*(__u32*)RTA_DATA(tb[TCA_ROUTE4_FROM]), b1, sizeof(b1)));
+	/* Trailing blank, as for every other field, so that whatever is
+	 * printed next does not run into the interface name. */
+	if (tb[TCA_ROUTE4_IIF])
+		fprintf(f, "fromif %s ", ll_index_to_name(*(int*)RTA_DATA(tb[TCA_ROUTE4_IIF])));
+	if (tb[TCA_ROUTE4_POLICE])
+		tc_print_police(f, tb[TCA_ROUTE4_POLICE]);
+	return 0;
+}
+
+/*
+ * Descriptor of the "route" filter: its name plus the option parser and
+ * printer defined above. The initializer is positional, so the meaning of
+ * the leading NULL depends on struct filter_util's layout (declared
+ * elsewhere) -- presumably a list link; confirm.
+ */
+struct filter_util route_util = {
+ NULL,
+ "route",
+ route_parse_opt,
+ route_print_opt,
+};
diff --git a/tc/f_rsvp.c b/tc/f_rsvp.c
index e69de29b..3d9b5283 100644
--- a/tc/f_rsvp.c
+++ b/tc/f_rsvp.c
@@ -0,0 +1,408 @@
+/*
+ * f_rsvp.c RSVP filter.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "tc_util.h"
+
+/*
+ * Print the option summary of the rsvp filter on stderr.
+ * The "sender" clause now has its closing bracket, and the legend
+ * describes CLASSID, the symbol the usage line actually uses.
+ */
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... rsvp ipproto PROTOCOL session DST[/PORT | GPI ]\n");
+	fprintf(stderr, " [ sender SRC[/PORT | GPI ] ]\n");
+	fprintf(stderr, " [ classid CLASSID ] [ police POLICE_SPEC ]\n");
+	fprintf(stderr, " [ tunnelid ID ] [ tunnel ID skip NUMBER ]\n");
+	fprintf(stderr, "Where: GPI := { flowlabel NUMBER | spi/ah SPI | spi/esp SPI |\n");
+	fprintf(stderr, " u{8|16|32} NUMBER mask MASK at OFFSET}\n");
+	fprintf(stderr, " POLICE_SPEC := ... look at TBF\n");
+	fprintf(stderr, " CLASSID := X:Y\n");
+}
+
+#define usage() return(-1)
+
+/*
+ * Parse "ADDR[/PORT]" optionally followed by a generalized port
+ * identifier (spi/ah, spi/esp, flowlabel, or u8/u16/u32 VALUE
+ * [mask MASK] [at OFFSET]).
+ *
+ * argc_p/argv_p: argument cursor; advanced past everything consumed.
+ * addr: receives the parsed address of the given family.
+ * pinfo: the selector is written to pinfo->dpi when dir is non-zero
+ * and to pinfo->spi otherwise; pinfo->protocol is set for
+ * spi/ah and spi/esp if it was still 0.
+ * Returns 0 on success, -1 on a parse error.
+ */
+int get_addr_and_pi(int *argc_p, char ***argv_p, inet_prefix * addr,
+		    struct tc_rsvp_pinfo *pinfo, int dir, int family)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	char *p = strchr(*argv, '/');
+	struct tc_rsvp_gpi *pi = dir ? &pinfo->dpi : &pinfo->spi;
+
+	if (p) {
+		__u16 tmp;
+
+		if (get_u16(&tmp, p+1, 0))
+			return -1;
+
+		if (dir == 0) {
+			/* Source port: u16 at offset 0 */
+			pi->key = htonl(((__u32)tmp)<<16);
+			pi->mask = htonl(0xFFFF0000);
+		} else {
+			/* Destination port: u16 at offset 2 */
+			pi->key = htonl(((__u32)tmp));
+			pi->mask = htonl(0x0000FFFF);
+		}
+		pi->offset = 0;
+		/* Cut the port off while the address is parsed; restored below. */
+		*p = 0;
+	}
+	if (get_addr_1(addr, *argv, family))
+		return -1;
+	if (p)
+		*p = '/';
+
+	argc--; argv++;
+
+	/* A port was given, or nothing follows: no GPI to parse. */
+	if (pi->mask || argc <= 0)
+		goto done;
+
+	if (strcmp(*argv, "spi/ah") == 0 ||
+	    strcmp(*argv, "gpi/ah") == 0) {
+		__u32 gpi;
+		NEXT_ARG();
+		if (get_u32(&gpi, *argv, 0))
+			return -1;
+		pi->mask = htonl(0xFFFFFFFF);
+		pi->key = htonl(gpi);
+		pi->offset = 4;
+		if (pinfo->protocol == 0)
+			pinfo->protocol = IPPROTO_AH;
+		argc--; argv++;
+	} else if (strcmp(*argv, "spi/esp") == 0 ||
+		   strcmp(*argv, "gpi/esp") == 0) {
+		__u32 gpi;
+		NEXT_ARG();
+		if (get_u32(&gpi, *argv, 0))
+			return -1;
+		pi->mask = htonl(0xFFFFFFFF);
+		pi->key = htonl(gpi);
+		pi->offset = 0;
+		if (pinfo->protocol == 0)
+			pinfo->protocol = IPPROTO_ESP;
+		argc--; argv++;
+	} else if (strcmp(*argv, "flowlabel") == 0) {
+		__u32 flabel;
+		NEXT_ARG();
+		if (get_u32(&flabel, *argv, 0))
+			return -1;
+		if (family != AF_INET6)
+			return -1;
+		pi->mask = htonl(0x000FFFFF);
+		pi->key = htonl(flabel) & pi->mask;
+		pi->offset = -40;
+		argc--; argv++;
+	} else if (strcmp(*argv, "u32") == 0 ||
+		   strcmp(*argv, "u16") == 0 ||
+		   strcmp(*argv, "u8") == 0) {
+		int sz = 1;
+		__u32 tmp;
+		__u32 mask = 0xff;
+		/* The default mask covers exactly the selected width. */
+		if (strcmp(*argv, "u32") == 0) {
+			sz = 4;
+			mask = 0xffffffff;
+		} else if (strcmp(*argv, "u16") == 0) {
+			mask = 0xffff;
+			sz = 2;
+		}
+		NEXT_ARG();
+		if (get_u32(&tmp, *argv, 0))
+			return -1;
+		argc--; argv++;
+		/* "mask" and "at" are optional: look only if arguments remain. */
+		if (argc > 0 && strcmp(*argv, "mask") == 0) {
+			NEXT_ARG();
+			if (get_u32(&mask, *argv, 16))
+				return -1;
+			argc--; argv++;
+		}
+		if (argc > 0 && strcmp(*argv, "at") == 0) {
+			NEXT_ARG();
+			if (get_integer(&pi->offset, *argv, 0))
+				return -1;
+			argc--; argv++;
+		}
+		/*
+		 * Move the value to its byte lane inside the aligned 32-bit
+		 * word (byte 0 is the most significant one); byte 3 of a u8
+		 * and the second half of a u16 need no shift.
+		 */
+		if (sz == 1) {
+			if ((pi->offset & 3) == 0) {
+				mask <<= 24;
+				tmp <<= 24;
+			} else if ((pi->offset & 3) == 1) {
+				mask <<= 16;
+				tmp <<= 16;
+			} else if ((pi->offset & 3) == 2) {
+				mask <<= 8;
+				tmp <<= 8;
+			}
+		} else if (sz == 2) {
+			if ((pi->offset & 3) == 0) {
+				mask <<= 16;
+				tmp <<= 16;
+			}
+		}
+		pi->offset &= ~3;
+		pi->mask = htonl(mask);
+		pi->key = htonl(tmp) & pi->mask;
+	}
+
+done:
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+
+/*
+ * Parse the command-line options of the rsvp filter into netlink message n.
+ *
+ * The address family is AF_INET when the filter id is "rsvp" and AF_INET6
+ * otherwise. handle, if given, is parsed into tcm_handle. The options are
+ * appended as attributes nested in TCA_OPTIONS; the collected
+ * tc_rsvp_pinfo is added only if some option filled it in.
+ * Returns 0 on success, -1 on a parse error.
+ */
+static int rsvp_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n)
+{
+	int family = strcmp(qu->id, "rsvp") == 0 ? AF_INET : AF_INET6;
+	struct tc_rsvp_pinfo pinfo;
+	struct tcmsg *t = NLMSG_DATA(n);
+	int pinfo_ok = 0;
+	struct rtattr *tail;
+
+	memset(&pinfo, 0, sizeof(pinfo));
+
+	if (handle) {
+		if (get_u32(&t->tcm_handle, handle, 0)) {
+			fprintf(stderr, "Illegal \"handle\"\n");
+			return -1;
+		}
+	}
+
+	if (argc == 0)
+		return 0;
+
+	/* Remember where TCA_OPTIONS starts so its length can be fixed up below. */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, 4096, TCA_OPTIONS, NULL, 0);
+
+	while (argc > 0) {
+		if (matches(*argv, "session") == 0) {
+			inet_prefix addr;
+			NEXT_ARG();
+			if (get_addr_and_pi(&argc, &argv, &addr, &pinfo, 1, family)) {
+				fprintf(stderr, "Illegal \"session\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_RSVP_DST, &addr.data, addr.bytelen);
+			if (pinfo.dpi.mask || pinfo.protocol)
+				pinfo_ok++;
+			/* get_addr_and_pi() has already advanced argc/argv. */
+			continue;
+		} else if (matches(*argv, "sender") == 0 ||
+			   matches(*argv, "flowspec") == 0) {
+			inet_prefix addr;
+			NEXT_ARG();
+			if (get_addr_and_pi(&argc, &argv, &addr, &pinfo, 0, family)) {
+				fprintf(stderr, "Illegal \"sender\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_RSVP_SRC, &addr.data, addr.bytelen);
+			if (pinfo.spi.mask || pinfo.protocol)
+				pinfo_ok++;
+			continue;
+		} else if (matches(*argv, "ipproto") == 0) {
+			/* The user's word goes first, as in every other
+			 * matches() call here, so abbreviations are accepted. */
+			int num;
+			NEXT_ARG();
+			num = inet_proto_a2n(*argv);
+			if (num < 0) {
+				fprintf(stderr, "Illegal \"ipproto\"\n");
+				return -1;
+			}
+			pinfo.protocol = num;
+			pinfo_ok++;
+		} else if (matches(*argv, "classid") == 0 ||
+			   strcmp(*argv, "flowid") == 0) {
+			unsigned classid;
+			NEXT_ARG();
+			if (get_tc_classid(&classid, *argv)) {
+				fprintf(stderr, "Illegal \"classid\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_RSVP_CLASSID, &classid, 4);
+		} else if (strcmp(*argv, "tunnelid") == 0) {
+			unsigned tid;
+			NEXT_ARG();
+			if (get_unsigned(&tid, *argv, 0)) {
+				fprintf(stderr, "Illegal \"tunnelid\"\n");
+				return -1;
+			}
+			pinfo.tunnelid = tid;
+			pinfo_ok++;
+		} else if (strcmp(*argv, "tunnel") == 0) {
+			unsigned tid;
+			NEXT_ARG();
+			if (get_unsigned(&tid, *argv, 0)) {
+				fprintf(stderr, "Illegal \"tunnel\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_RSVP_CLASSID, &tid, 4);
+			NEXT_ARG();
+			/* The "skip" keyword itself is optional. */
+			if (strcmp(*argv, "skip") == 0) {
+				NEXT_ARG();
+			}
+			if (get_unsigned(&tid, *argv, 0)) {
+				fprintf(stderr, "Illegal \"skip\"\n");
+				return -1;
+			}
+			pinfo.tunnelhdr = tid;
+			pinfo_ok++;
+		} else if (matches(*argv, "police") == 0) {
+			NEXT_ARG();
+			if (parse_police(&argc, &argv, TCA_RSVP_POLICE, n)) {
+				fprintf(stderr, "Illegal \"police\"\n");
+				return -1;
+			}
+			/* parse_police() has already advanced argc/argv. */
+			continue;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (pinfo_ok)
+		addattr_l(n, 4096, TCA_RSVP_PINFO, &pinfo, sizeof(pinfo));
+	/* TCA_OPTIONS spans everything appended since tail was recorded. */
+	tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail;
+	return 0;
+}
+
+/*
+ * Format a generalized port identifier (offset/mask/key triple) into buf.
+ *
+ * Recognized shapes get a short form: a 16-bit port ("/N"; low half when dir
+ * is non-zero, high half otherwise), an ESP or AH SPI, or an IPv6 flow label.
+ * Anything else is printed as a raw "u32 KEY mask MASK at OFFSET" triple.
+ * Returns buf.
+ */
+static char * sprint_spi(struct tc_rsvp_gpi *pi, int dir, char *buf)
+{
+	if (pi->offset == 0 && dir && pi->mask == htonl(0xFFFF))
+		snprintf(buf, SPRINT_BSIZE-1, "/%d", ntohl(pi->key));
+	else if (pi->offset == 0 && !dir && pi->mask == htonl(0xFFFF0000))
+		snprintf(buf, SPRINT_BSIZE-1, "/%d", ntohl(pi->key)>>16);
+	else if (pi->offset == 0 && pi->mask == htonl(0xFFFFFFFF))
+		snprintf(buf, SPRINT_BSIZE-1, " spi/esp 0x%08x", ntohl(pi->key));
+	else if (pi->offset == 4 && pi->mask == htonl(0xFFFFFFFF))
+		snprintf(buf, SPRINT_BSIZE-1, " spi/ah 0x%08x", ntohl(pi->key));
+	else if (pi->offset == -40 && pi->mask == htonl(0x000FFFFF))
+		snprintf(buf, SPRINT_BSIZE-1, " flowlabel 0x%05x", ntohl(pi->key));
+	else
+		snprintf(buf, SPRINT_BSIZE-1, " u32 0x%08x mask %08x at %d",
+			 ntohl(pi->key), ntohl(pi->mask), pi->offset);
+
+	return buf;
+}
+
+/*
+ * Print the attributes of an rsvp/rsvp6 filter to f.
+ *
+ * opt is the nested options attribute (nothing is printed when it is NULL),
+ * handle the filter handle. Addresses are formatted as AF_INET when the
+ * descriptor id is "rsvp" and as AF_INET6 otherwise.
+ * Returns 0, or -1 when the TCA_RSVP_PINFO payload is too short.
+ */
+static int rsvp_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle)
+{
+	int family = strcmp(qu->id, "rsvp") == 0 ? AF_INET : AF_INET6;
+	struct rtattr *tb[TCA_RSVP_MAX+1];
+	struct tc_rsvp_pinfo *pinfo = NULL;
+	int has_dpi, has_spi;
+
+	if (!opt)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_RSVP_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (handle)
+		fprintf(f, "fh 0x%08x ", handle);
+
+	if (tb[TCA_RSVP_PINFO]) {
+		if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO]) < sizeof(*pinfo))
+			return -1;
+
+		pinfo = RTA_DATA(tb[TCA_RSVP_PINFO]);
+	}
+	has_dpi = pinfo && pinfo->dpi.mask;
+	has_spi = pinfo && pinfo->spi.mask;
+
+	/* The class id attribute doubles as the tunnel id when a tunnel header is set. */
+	if (tb[TCA_RSVP_CLASSID]) {
+		if (pinfo && pinfo->tunnelhdr != 0) {
+			fprintf(f, "tunnel %d skip %d ", *(__u32*)RTA_DATA(tb[TCA_RSVP_CLASSID]), pinfo->tunnelhdr);
+		} else {
+			SPRINT_BUF(b1);
+			fprintf(f, "flowid %s ", sprint_tc_classid(*(__u32*)RTA_DATA(tb[TCA_RSVP_CLASSID]), b1));
+		}
+	} else if (pinfo && pinfo->tunnelhdr) {
+		fprintf(f, "tunnel [BAD] skip %d ", pinfo->tunnelhdr);
+	}
+
+	/* Session (destination) part. */
+	if (!tb[TCA_RSVP_DST]) {
+		if (has_dpi) {
+			SPRINT_BUF(b2);
+			fprintf(f, "session [NONE]%s ", sprint_spi(&pinfo->dpi, 1, b2));
+		} else {
+			fprintf(f, "session NONE ");
+		}
+	} else {
+		char text[128];
+
+		fprintf(f, "session ");
+		if (inet_ntop(family, RTA_DATA(tb[TCA_RSVP_DST]), text, sizeof(text)) != 0)
+			fprintf(f, "%s", text);
+		else
+			fprintf(f, " [INVALID DADDR] ");
+		if (has_dpi) {
+			SPRINT_BUF(b2);
+			fprintf(f, "%s ", sprint_spi(&pinfo->dpi, 1, b2));
+		} else {
+			fprintf(f, " ");
+		}
+	}
+
+	if (pinfo && pinfo->protocol) {
+		SPRINT_BUF(b1);
+		fprintf(f, "ipproto %s ", inet_proto_n2a(pinfo->protocol, b1, sizeof(b1)));
+	}
+	if (pinfo && pinfo->tunnelid)
+		fprintf(f, "tunnelid %d ", pinfo->tunnelid);
+
+	/* Sender (source) part. */
+	if (tb[TCA_RSVP_SRC]) {
+		char text[128];
+
+		fprintf(f, "sender ");
+		if (inet_ntop(family, RTA_DATA(tb[TCA_RSVP_SRC]), text, sizeof(text)) != 0)
+			fprintf(f, " %s", text);
+		else
+			fprintf(f, "[BAD]");
+		if (has_spi) {
+			SPRINT_BUF(b2);
+			fprintf(f, "%s ", sprint_spi(&pinfo->spi, 0, b2));
+		} else {
+			fprintf(f, " ");
+		}
+	} else if (has_spi) {
+		SPRINT_BUF(b2);
+		fprintf(f, "sender [NONE]%s ", sprint_spi(&pinfo->spi, 0, b2));
+	}
+
+	if (tb[TCA_RSVP_POLICE])
+		tc_print_police(f, tb[TCA_RSVP_POLICE]);
+	return 0;
+}
+
+/*
+ * Descriptor for the "rsvp" filter. rsvp_parse_opt() and rsvp_print_opt()
+ * compare the id against "rsvp" and use AF_INET for this instance.
+ * NOTE(review): the leading NULL is presumably a list link; confirm against
+ * the struct filter_util declaration.
+ */
+struct filter_util rsvp_util = {
+ NULL,
+ "rsvp",
+ rsvp_parse_opt,
+ rsvp_print_opt,
+};
+
+/*
+ * Descriptor for the "rsvp6" filter. It shares its callbacks with the "rsvp"
+ * descriptor; because the id is not "rsvp", both callbacks use AF_INET6.
+ */
+struct filter_util rsvp6_util = {
+ NULL,
+ "rsvp6",
+ rsvp_parse_opt,
+ rsvp_print_opt,
+};
diff --git a/tc/f_tcindex.c b/tc/f_tcindex.c
index e69de29b..59397487 100644
--- a/tc/f_tcindex.c
+++ b/tc/f_tcindex.c
@@ -0,0 +1,186 @@
+/*
+ * f_tcindex.c Traffic control index filter
+ *
+ * Written 1998,1999 by Werner Almesberger
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <string.h>
+#include <netinet/in.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+/* Print the tcindex filter usage summary to stderr. */
+static void explain(void)
+{
+	fputs(" Usage: ... tcindex [ hash SIZE ] [ mask MASK ]"
+	      " [ shift SHIFT ]\n"
+	      " [ pass_on | fall_through ]\n"
+	      " [ classid CLASSID ] "
+	      "[ police POLICE_SPEC ]\n", stderr);
+}
+
+
+#define usage() return(-1)
+
+
+/*
+ * Parse the command-line options of a tcindex filter and append them to
+ * netlink message n inside a nested TCA_OPTIONS attribute.
+ *
+ * handle    optional filter handle (any strtoul base-0 number)
+ * argc/argv remaining command-line words
+ *
+ * Numeric arguments must be non-empty and fully numeric. "hash" must be in
+ * 1..0x10000 and "mask" must fit in 16 bits; both are range-checked on the
+ * full parsed value, before it is narrowed to the attribute's type.
+ * Returns 0 on success, -1 on error.
+ */
+static int tcindex_parse_opt(struct filter_util *qu, char *handle, int argc,
+    char **argv, struct nlmsghdr *n)
+{
+	struct tcmsg *t = NLMSG_DATA(n);
+	struct rtattr *tail;
+	char *end;
+
+	if (handle) {
+		t->tcm_handle = strtoul(handle,&end,0);
+		if (end == handle || *end) {
+			fprintf(stderr, "Illegal filter ID\n");
+			return -1;
+		}
+	}
+	if (!argc) return 0;
+	/* Remember where the nested attribute starts; its length is fixed up below. */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n,4096,TCA_OPTIONS,NULL,0);
+	while (argc) {
+		if (!strcmp(*argv,"hash")) {
+			unsigned long val;
+			int hash;
+
+			NEXT_ARG();
+			val = strtoul(*argv,&end,0);
+			/* check the unsigned long so huge values cannot wrap past the limit */
+			if (end == *argv || *end || !val || val > 0x10000) {
+				explain();
+				return -1;
+			}
+			hash = val;
+			addattr_l(n,4096,TCA_TCINDEX_HASH,&hash,sizeof(hash));
+		}
+		else if (!strcmp(*argv,"mask")) {
+			unsigned long val;
+			__u16 mask;
+
+			NEXT_ARG();
+			val = strtoul(*argv,&end,0);
+			if (end == *argv || *end || val > 0xFFFF) {
+				explain();
+				return -1;
+			}
+			mask = val;
+			addattr_l(n,4096,TCA_TCINDEX_MASK,&mask,sizeof(mask));
+		}
+		else if (!strcmp(*argv,"shift")) {
+			int shift;
+
+			NEXT_ARG();
+			shift = strtoul(*argv,&end,0);
+			if (end == *argv || *end) {
+				explain();
+				return -1;
+			}
+			addattr_l(n,4096,TCA_TCINDEX_SHIFT,&shift,
+			    sizeof(shift));
+		}
+		else if (!strcmp(*argv,"fall_through")) {
+			int value = 1;
+
+			addattr_l(n,4096,TCA_TCINDEX_FALL_THROUGH,&value,
+			    sizeof(value));
+		}
+		else if (!strcmp(*argv,"pass_on")) {
+			int value = 0;
+
+			addattr_l(n,4096,TCA_TCINDEX_FALL_THROUGH,&value,
+			    sizeof(value));
+		}
+		else if (!strcmp(*argv,"classid")) {
+			__u32 handle;
+
+			NEXT_ARG();
+			if (get_tc_classid(&handle,*argv)) {
+				fprintf(stderr, "Illegal \"classid\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_TCINDEX_CLASSID, &handle, 4);
+		}
+		else if (!strcmp(*argv,"police")) {
+			NEXT_ARG();
+			if (parse_police(&argc, &argv, TCA_TCINDEX_POLICE, n)) {
+				fprintf(stderr, "Illegal \"police\"\n");
+				return -1;
+			}
+			/* parse_police already advanced argc/argv */
+			continue;
+		}
+		else {
+			explain();
+			return -1;
+		}
+		argc--;
+		argv++;
+	}
+	tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail;
+	return 0;
+}
+
+
+/*
+ * Print the attributes of a tcindex filter to f.
+ *
+ * opt is the nested options attribute (nothing is printed when it is NULL);
+ * handle is printed unless it is ~0.
+ * Returns 0, or -1 when an attribute payload is shorter than expected.
+ */
+static int tcindex_print_opt(struct filter_util *qu, FILE *f,
+    struct rtattr *opt, __u32 handle)
+{
+	struct rtattr *tb[TCA_TCINDEX_MAX+1];
+
+	if (!opt) return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_TCINDEX_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (handle != ~0) fprintf(f,"handle 0x%04x ",handle);
+	if (tb[TCA_TCINDEX_HASH]) {
+		__u32 hash;
+
+		/*
+		 * tcindex_parse_opt sends the hash as a 4-byte int, so read the
+		 * full word when it is present. Reading only 16 bits loses the
+		 * value 0x10000 and picks the wrong half on big-endian hosts.
+		 * A 16-bit payload is still accepted.
+		 */
+		if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH]) >= sizeof(__u32))
+			hash = *(__u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH]);
+		else if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH]) >= sizeof(__u16))
+			hash = *(__u16 *) RTA_DATA(tb[TCA_TCINDEX_HASH]);
+		else
+			return -1;
+		fprintf(f,"hash %u ",hash);
+	}
+	if (tb[TCA_TCINDEX_MASK]) {
+		__u16 mask;
+
+		if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK]) < sizeof(mask))
+			return -1;
+		mask = *(__u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK]);
+		fprintf(f,"mask 0x%04x ",mask);
+	}
+	if (tb[TCA_TCINDEX_SHIFT]) {
+		int shift;
+
+		if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT]) < sizeof(shift))
+			return -1;
+		shift = *(int *) RTA_DATA(tb[TCA_TCINDEX_SHIFT]);
+		fprintf(f,"shift %d ",shift);
+	}
+	if (tb[TCA_TCINDEX_FALL_THROUGH]) {
+		int fall_through;
+
+		if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH]) <
+		    sizeof(fall_through))
+			return -1;
+		fall_through = *(int *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH]);
+		/* fixed text, so do not route it through a format string */
+		fputs(fall_through ? "fall_through " : "pass_on ", f);
+	}
+	if (tb[TCA_TCINDEX_CLASSID]) {
+		SPRINT_BUF(b1);
+		fprintf(f, "classid %s ",sprint_tc_classid(*(__u32 *)
+		    RTA_DATA(tb[TCA_TCINDEX_CLASSID]), b1));
+	}
+	if (tb[TCA_TCINDEX_POLICE]) {
+		fprintf(f, "\n");
+		tc_print_police(f, tb[TCA_TCINDEX_POLICE]);
+	}
+	return 0;
+}
+
+/*
+ * Descriptor for the "tcindex" filter: its id string followed by the option
+ * parser and the option printer defined above.
+ */
+struct filter_util tcindex_util = {
+ NULL,
+ "tcindex",
+ tcindex_parse_opt,
+ tcindex_print_opt,
+};
diff --git a/tc/f_u32.c b/tc/f_u32.c
index e69de29b..3e76e9cf 100644
--- a/tc/f_u32.c
+++ b/tc/f_u32.c
@@ -0,0 +1,977 @@
+/*
+ * f_u32.c U32 filter.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+/* Print the u32 filter usage summary to stderr. */
+static void explain(void)
+{
+	fputs("Usage: ... u32 [ match SELECTOR ... ] [ link HTID ] [ classid CLASSID ]\n"
+	      " [ police POLICE_SPEC ] [ offset OFFSET_SPEC ]\n"
+	      " [ ht HTID ] [ hashkey HASHKEY_SPEC ]\n"
+	      " [ sample SAMPLE ]\n"
+	      "or u32 divisor DIVISOR\n"
+	      "\n"
+	      "Where: SELECTOR := SAMPLE SAMPLE ...\n"
+	      " SAMPLE := { ip | ip6 | udp | tcp | icmp | u{32|16|8} } SAMPLE_ARGS\n"
+	      " FILTERID := X:Y:Z\n", stderr);
+}
+
+#define usage() return(-1)
+
+/*
+ * Parse a u32 filter handle.
+ *
+ * Two forms are accepted:
+ *   "0x..."             a raw 32-bit handle, parsed by get_u32() in base 16
+ *   "HTID:HASH:NODEID"  hexadecimal fields, each of which may be empty;
+ *                       HTID and NODEID must be below 0x1000, HASH below 0x100
+ *
+ * Each field must be followed by ':' or the end of the string, so stray
+ * characters and extra fields are rejected instead of silently ignored.
+ * On success *handle is set and 0 is returned; otherwise -1.
+ */
+int get_u32_handle(__u32 *handle, char *str)
+{
+	/* unsigned long keeps the range checks honest for oversized input */
+	unsigned long htid = 0, hash = 0, nodeid = 0;
+	char *tmp = strchr(str, ':');
+
+	if (tmp == NULL) {
+		/* strncmp stops at the terminator, so a short string is not over-read */
+		if (strncmp(str, "0x", 2) == 0)
+			return get_u32(handle, str, 16);
+		return -1;
+	}
+	htid = strtoul(str, &tmp, 16);
+	if (*tmp != ':' && *tmp != 0)
+		return -1;
+	if (htid>=0x1000)
+		return -1;
+	if (*tmp) {
+		str = tmp+1;
+		hash = strtoul(str, &tmp, 16);
+		if (*tmp != ':' && *tmp != 0)
+			return -1;
+		if (hash>=0x100)
+			return -1;
+		if (*tmp) {
+			str = tmp+1;
+			nodeid = strtoul(str, &tmp, 16);
+			if (*tmp != 0)
+				return -1;
+			if (nodeid>=0x1000)
+				return -1;
+		}
+	}
+	*handle = (htid<<20)|(hash<<12)|nodeid;
+	return 0;
+}
+
+/*
+ * Format a u32 handle into buf as "HTID:HASH:NODEID" in hexadecimal, leaving
+ * out fields that are zero. A zero handle prints as "none". When show_raw is
+ * set the raw handle is appended in brackets. Returns buf.
+ */
+char * sprint_u32_handle(__u32 handle, char *buf)
+{
+	__u32 htid = TC_U32_HTID(handle);
+	__u32 hash = TC_U32_HASH(handle);
+	__u32 nodeid = TC_U32_NODE(handle);
+	int room = SPRINT_BSIZE-1;
+	char *cur = buf;
+	int written;
+
+	if (handle == 0) {
+		snprintf(buf, room, "none");
+		return buf;
+	}
+
+	if (htid) {
+		written = snprintf(cur, room, "%x:", htid>>20);
+		cur += written;
+		room -= written;
+	}
+	if (hash) {
+		written = snprintf(cur, room, "%x", hash);
+		cur += written;
+		room -= written;
+	}
+	if (nodeid) {
+		written = snprintf(cur, room, ":%x", nodeid);
+		cur += written;
+		room -= written;
+	}
+	if (show_raw)
+		snprintf(cur, room, "[%08x] ", handle);
+
+	return buf;
+}
+
+/*
+ * Add a value/mask pair at (off, offmask) to selector sel.
+ *
+ * If a key with the same off and offmask already exists, the new pair is
+ * merged into it, unless the two disagree on a bit that both masks cover.
+ * Otherwise a new key is appended. Fails with -1 on such a conflict, when
+ * 128 keys are already present, or when off is not a multiple of 4.
+ */
+static int pack_key(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask)
+{
+	int count = sel->nkeys;
+	int idx;
+
+	key &= mask;
+
+	for (idx = 0; idx < count; idx++) {
+		__u32 common;
+
+		if (sel->keys[idx].off != off || sel->keys[idx].offmask != offmask)
+			continue;
+
+		common = mask & sel->keys[idx].mask;
+		if ((key ^ sel->keys[idx].val) & common)
+			return -1;
+		sel->keys[idx].val |= key;
+		sel->keys[idx].mask |= mask;
+		return 0;
+	}
+
+	if (count >= 128)
+		return -1;
+	if (off % 4)
+		return -1;
+
+	sel->keys[count].val = key;
+	sel->keys[count].mask = mask;
+	sel->keys[count].off = off;
+	sel->keys[count].offmask = offmask;
+	sel->nkeys++;
+	return 0;
+}
+
+/* Add a 32-bit host-order key/mask pair: convert both with htonl() and hand them to pack_key(). */
+static int pack_key32(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask)
+{
+	return pack_key(sel, htonl(key), htonl(mask), off, offmask);
+}
+
+/*
+ * Add a 16-bit host-order key/mask pair at byte offset off.
+ *
+ * The pair is placed in the upper half of the enclosing 32-bit word when off
+ * is a multiple of 4 and in the lower half otherwise; off is then rounded
+ * down to the word boundary. Returns -1 if key or mask exceeds 16 bits.
+ */
+static int pack_key16(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask)
+{
+	int upper_half = (off & 3) == 0;
+
+	if (key > 0xFFFF || mask > 0xFFFF)
+		return -1;
+
+	if (upper_half) {
+		key <<= 16;
+		mask <<= 16;
+	}
+
+	return pack_key(sel, htonl(key), htonl(mask), off & ~3, offmask);
+}
+
+/*
+ * Add an 8-bit key/mask pair at byte offset off.
+ *
+ * The byte is shifted into its lane of the enclosing 32-bit word (byte 0 is
+ * the most significant in host order before htonl()), and off is rounded
+ * down to the word boundary. Returns -1 if key or mask exceeds 8 bits.
+ */
+static int pack_key8(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask)
+{
+	/* lanes 0,1,2,3 map to shifts of 24,16,8,0 bits */
+	int shift = 8 * (3 - (off & 3));
+
+	if (key > 0xFF || mask > 0xFF)
+		return -1;
+
+	key <<= shift;
+	mask <<= shift;
+
+	return pack_key(sel, htonl(key), htonl(mask), off & ~3, offmask);
+}
+
+
+/*
+ * Parse the operand of an "at" clause: an integer offset, optionally prefixed
+ * by "nexthdr+". The prefix may be glued to the number ("nexthdr+4") or be a
+ * separate word; either way *offmask is set to -1. Without the prefix
+ * *offmask is left untouched. The offset is stored in *off.
+ *
+ * Consumes the words it used and returns 0, or -1 on error.
+ */
+int parse_at(int *argc_p, char ***argv_p, int *off, int *offmask)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	char *num = *argv;
+	size_t plen = strlen("nexthdr+");
+
+	if (argc <= 0)
+		return -1;
+
+	if (strncmp(num, "nexthdr+", plen) == 0 && num[plen] != '\0') {
+		/* glued form: the number follows the prefix in the same word */
+		*offmask = -1;
+		num += plen;
+	} else if (matches(*argv, "nexthdr+") == 0) {
+		/* separate form: the number is the next word */
+		NEXT_ARG();
+		*offmask = -1;
+		num = *argv;
+	}
+
+	if (get_integer(off, num, 0))
+		return -1;
+	argc--; argv++;
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+
+/*
+ * Parse "VALUE MASK [ at OFFSET ]" as a 32-bit match and add it to sel.
+ * VALUE is read in any base, MASK in hexadecimal. off and offmask are the
+ * defaults used when no "at" clause follows. Returns pack_key32()'s result,
+ * or -1 on a parse error.
+ */
+static int parse_u32(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off, int offmask)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	__u32 key;
+	__u32 mask;
+	int res;
+
+	if (argc < 2)
+		return -1;
+
+	if (get_u32(&key, argv[0], 0) || get_u32(&mask, argv[1], 16))
+		return -1;
+	argc -= 2;
+	argv += 2;
+
+	if (argc > 0 && strcmp(argv[0], "at") == 0) {
+		NEXT_ARG();
+		if (parse_at(&argc, &argv, &off, &offmask))
+			return -1;
+	}
+
+	res = pack_key32(sel, key, mask, off, offmask);
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+/*
+ * Parse "VALUE MASK [ at OFFSET ]" as a 16-bit match and add it to sel.
+ * VALUE is read in any base, MASK in hexadecimal. off and offmask are the
+ * defaults used when no "at" clause follows. Returns pack_key16()'s result,
+ * or -1 on a parse error.
+ */
+static int parse_u16(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off, int offmask)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	__u32 key;
+	__u32 mask;
+	int res;
+
+	if (argc < 2)
+		return -1;
+
+	if (get_u32(&key, argv[0], 0) || get_u32(&mask, argv[1], 16))
+		return -1;
+	argc -= 2;
+	argv += 2;
+
+	if (argc > 0 && strcmp(argv[0], "at") == 0) {
+		NEXT_ARG();
+		if (parse_at(&argc, &argv, &off, &offmask))
+			return -1;
+	}
+
+	res = pack_key16(sel, key, mask, off, offmask);
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+/*
+ * Parse "VALUE MASK [ at OFFSET ]" as an 8-bit match and add it to sel.
+ * VALUE is read in any base, MASK in hexadecimal; both must fit in a byte.
+ * off and offmask are the defaults used when no "at" clause follows.
+ * Returns pack_key8()'s result, or -1 on a parse error.
+ */
+static int parse_u8(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off, int offmask)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	__u32 key;
+	__u32 mask;
+	int res;
+
+	if (argc < 2)
+		return -1;
+
+	if (get_u32(&key, argv[0], 0) || get_u32(&mask, argv[1], 16))
+		return -1;
+	argc -= 2;
+	argv += 2;
+
+	/* reject oversized values before looking at any "at" clause */
+	if (key > 0xFF || mask > 0xFF)
+		return -1;
+
+	if (argc > 0 && strcmp(argv[0], "at") == 0) {
+		NEXT_ARG();
+		if (parse_at(&argc, &argv, &off, &offmask))
+			return -1;
+	}
+
+	res = pack_key8(sel, key, mask, off, offmask);
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+/*
+ * Parse "ADDR[/PREFIXLEN] [ at OFFSET ]" as an IPv4 address match and add it
+ * to sel. off is the default byte offset. The mask covers the leading
+ * PREFIXLEN bits and is zero for a zero-length prefix.
+ * Returns 0 on success, -1 on error.
+ */
+static int parse_ip_addr(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	inet_prefix addr;
+	__u32 mask = 0;
+	int offmask = 0;
+
+	if (argc < 1)
+		return -1;
+
+	if (get_prefix_1(&addr, *argv, AF_INET))
+		return -1;
+	argc--; argv++;
+
+	if (argc > 0 && strcmp(argv[0], "at") == 0) {
+		NEXT_ARG();
+		if (parse_at(&argc, &argv, &off, &offmask))
+			return -1;
+	}
+
+	if (addr.bitlen)
+		mask = htonl(0xFFFFFFFF<<(32-addr.bitlen));
+	if (pack_key(sel, addr.data[0], mask, off, offmask) < 0)
+		return -1;
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+/*
+ * Parse "ADDR[/PREFIXLEN] [ at OFFSET ]" as an IPv6 address match and add it
+ * to sel as up to four 32-bit keys starting at byte offset off.
+ *
+ * Words wholly inside the prefix get a full mask. The last, partly covered
+ * word gets a mask of only the remaining prefix bits, so that e.g. a /48
+ * matches 48 bits rather than the full 64 bits of the two words it touches.
+ * Returns 0 on success, -1 on error.
+ */
+static int parse_ip6_addr(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	int plen;
+	int i;
+	inet_prefix addr;
+	int offmask = 0;
+
+	if (argc < 1)
+		return -1;
+
+	if (get_prefix_1(&addr, *argv, AF_INET6))
+		return -1;
+	argc--; argv++;
+
+	if (argc > 0 && strcmp(argv[0], "at") == 0) {
+		NEXT_ARG();
+		if (parse_at(&argc, &argv, &off, &offmask))
+			return -1;
+	}
+
+	plen = addr.bitlen;
+	for (i=0; i<plen; i+=32) {
+		int bits = plen - i;	/* prefix bits left from this word on */
+		__u32 mask = 0xFFFFFFFF;
+
+		/* bits is 1..31 here, so the shift count stays within 1..31 */
+		if (bits < 32)
+			mask = htonl(0xFFFFFFFF<<(32-bits));
+		if (pack_key(sel, addr.data[i/32], mask, off+4*(i/32), offmask) < 0)
+			return -1;
+	}
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+/*
+ * Parse an "ip FIELD ..." selector and add the resulting key(s) to sel.
+ *
+ * FIELD names an IPv4 header field, or a transport field at the offset it
+ * has behind a 20-byte IP header; each maps to a fixed byte offset:
+ *   src 12, dst 16, tos/dsfield/precedence 1, ihl 0, protocol 9,
+ *   nofrag/firstfrag/df/mf 6 (flag and fragment-offset word),
+ *   sport 20, dport 22, icmp_type 20, icmp_code 21.
+ * Returns the field parser's result, or -1 for an unknown field or too few
+ * arguments.
+ */
+static int parse_ip(int *argc_p, char ***argv_p, struct tc_u32_sel *sel)
+{
+	int res = -1;
+	int argc = *argc_p;
+	char **argv = *argv_p;
+
+	if (argc < 2)
+		return -1;
+
+	if (strcmp(*argv, "src") == 0) {
+		NEXT_ARG();
+		res = parse_ip_addr(&argc, &argv, sel, 12);
+	} else if (strcmp(*argv, "dst") == 0) {
+		NEXT_ARG();
+		res = parse_ip_addr(&argc, &argv, sel, 16);
+	} else if (strcmp(*argv, "tos") == 0 ||
+		   matches(*argv, "dsfield") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 1, 0);
+	} else if (strcmp(*argv, "ihl") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 0, 0);
+	} else if (strcmp(*argv, "protocol") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 9, 0);
+	} else if (matches(*argv, "precedence") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 1, 0);
+	} else if (strcmp(*argv, "nofrag") == 0) {
+		argc--; argv++;
+		res = pack_key16(sel, 0, 0x3FFF, 6, 0);
+	} else if (strcmp(*argv, "firstfrag") == 0) {
+		argc--; argv++;
+		res = pack_key16(sel, 0, 0x1FFF, 6, 0);
+	} else if (strcmp(*argv, "df") == 0) {
+		argc--; argv++;
+		res = pack_key16(sel, 0x4000, 0x4000, 6, 0);
+	} else if (strcmp(*argv, "mf") == 0) {
+		argc--; argv++;
+		res = pack_key16(sel, 0x2000, 0x2000, 6, 0);
+	} else if (strcmp(*argv, "dport") == 0) {
+		NEXT_ARG();
+		res = parse_u16(&argc, &argv, sel, 22, 0);
+	} else if (strcmp(*argv, "sport") == 0) {
+		NEXT_ARG();
+		res = parse_u16(&argc, &argv, sel, 20, 0);
+	} else if (strcmp(*argv, "icmp_type") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 20, 0);
+	} else if (strcmp(*argv, "icmp_code") == 0) {
+		NEXT_ARG();
+		/*
+		 * The code byte follows the type byte, so it sits at offset 21,
+		 * and like every other fixed field here it takes no offmask.
+		 */
+		res = parse_u8(&argc, &argv, sel, 21, 0);
+	} else {
+		return -1;
+	}
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+/*
+ * Parse an "ip6 FIELD ..." selector and add the resulting key(s) to sel.
+ *
+ * FIELD names an IPv6 header field, or a transport field at the offset it
+ * has behind the 40-byte fixed header; each maps to a fixed byte offset:
+ *   src 8, dst 24, priority 0, protocol 6, flowlabel 0,
+ *   sport 40, dport 42, icmp_type 40, icmp_code 41.
+ * Returns the field parser's result, or -1 for an unknown field or too few
+ * arguments.
+ */
+static int parse_ip6(int *argc_p, char ***argv_p, struct tc_u32_sel *sel)
+{
+	int res = -1;
+	int argc = *argc_p;
+	char **argv = *argv_p;
+
+	if (argc < 2)
+		return -1;
+
+	if (strcmp(*argv, "src") == 0) {
+		NEXT_ARG();
+		res = parse_ip6_addr(&argc, &argv, sel, 8);
+	} else if (strcmp(*argv, "dst") == 0) {
+		NEXT_ARG();
+		res = parse_ip6_addr(&argc, &argv, sel, 24);
+	} else if (strcmp(*argv, "priority") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 0, 0);
+	} else if (strcmp(*argv, "protocol") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 6, 0);
+	} else if (strcmp(*argv, "flowlabel") == 0) {
+		NEXT_ARG();
+		res = parse_u32(&argc, &argv, sel, 0, 0);
+	} else if (strcmp(*argv, "dport") == 0) {
+		NEXT_ARG();
+		res = parse_u16(&argc, &argv, sel, 42, 0);
+	} else if (strcmp(*argv, "sport") == 0) {
+		NEXT_ARG();
+		res = parse_u16(&argc, &argv, sel, 40, 0);
+	} else if (strcmp(*argv, "icmp_type") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 40, 0);
+	} else if (strcmp(*argv, "icmp_code") == 0) {
+		NEXT_ARG();
+		/*
+		 * Fixed offset, so offmask is 0 like icmp_type above; a
+		 * non-zero offmask would mark the key as "nexthdr+" relative.
+		 */
+		res = parse_u8(&argc, &argv, sel, 41, 0);
+	} else {
+		return -1;
+	}
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+/* "tcp" selectors accept the same src/dst port fields, so they share the UDP parser. */
+#define parse_tcp parse_udp
+/*
+ * Parse a "udp { src | dst } VALUE MASK" selector and add it to sel.
+ * Ports are 16-bit keys at offsets 0 (src) and 2 (dst) with offmask -1.
+ * Returns parse_u16()'s result, or -1 for an unknown field or too few
+ * arguments.
+ */
+static int parse_udp(int *argc_p, char ***argv_p, struct tc_u32_sel *sel)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	int port_off;
+	int res;
+
+	if (argc < 2)
+		return -1;
+
+	if (strcmp(*argv, "src") == 0)
+		port_off = 0;
+	else if (strcmp(*argv, "dst") == 0)
+		port_off = 2;
+	else
+		return -1;
+
+	NEXT_ARG();
+	res = parse_u16(&argc, &argv, sel, port_off, -1);
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+/*
+ * Parse an "icmp { type | code } VALUE MASK" selector and add it to sel.
+ * Both are 8-bit keys, at offsets 0 (type) and 1 (code) with offmask -1.
+ * Returns parse_u8()'s result, or -1 for an unknown field or too few
+ * arguments.
+ */
+static int parse_icmp(int *argc_p, char ***argv_p, struct tc_u32_sel *sel)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	int field_off;
+	int res;
+
+	if (argc < 2)
+		return -1;
+
+	if (strcmp(*argv, "type") == 0)
+		field_off = 0;
+	else if (strcmp(*argv, "code") == 0)
+		field_off = 1;
+	else
+		return -1;
+
+	NEXT_ARG();
+	res = parse_u8(&argc, &argv, sel, field_off, -1);
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+
+
+/*
+ * Parse one selector: a kind keyword (u32, u16, u8, ip, ip6, udp, tcp or
+ * icmp) followed by that kind's arguments, and add the key(s) to sel.
+ * Raw u32/u16/u8 selectors default to offset 0 with offmask 0.
+ * Returns the sub-parser's result, or -1 for an unknown kind.
+ */
+static int parse_selector(int *argc_p, char ***argv_p, struct tc_u32_sel *sel)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	int res;
+
+	if (argc <= 0)
+		return -1;
+
+	if (matches(*argv, "u32") == 0) {
+		NEXT_ARG();
+		res = parse_u32(&argc, &argv, sel, 0, 0);
+	} else if (matches(*argv, "u16") == 0) {
+		NEXT_ARG();
+		res = parse_u16(&argc, &argv, sel, 0, 0);
+	} else if (matches(*argv, "u8") == 0) {
+		NEXT_ARG();
+		res = parse_u8(&argc, &argv, sel, 0, 0);
+	} else if (matches(*argv, "ip") == 0) {
+		NEXT_ARG();
+		res = parse_ip(&argc, &argv, sel);
+	} else if (matches(*argv, "ip6") == 0) {
+		NEXT_ARG();
+		res = parse_ip6(&argc, &argv, sel);
+	} else if (matches(*argv, "udp") == 0) {
+		NEXT_ARG();
+		res = parse_udp(&argc, &argv, sel);
+	} else if (matches(*argv, "tcp") == 0) {
+		NEXT_ARG();
+		res = parse_tcp(&argc, &argv, sel);
+	} else if (matches(*argv, "icmp") == 0) {
+		NEXT_ARG();
+		res = parse_icmp(&argc, &argv, sel);
+	} else {
+		return -1;
+	}
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return res;
+}
+
+/*
+ * Parse the operands of an "offset" clause into sel. Recognized words:
+ *   plus N   fixed offset, stored in sel->off; sets TC_U32_OFFSET
+ *   at N     position of the variable offset (must be even), in sel->offoff
+ *   mask M   16-bit hexadecimal mask, stored in network order in sel->offmask
+ *   shift N  stored in sel->offshift
+ *   eat      sets TC_U32_EAT
+ * "at", "mask" and "shift" each set TC_U32_VAROFFSET. Parsing stops at the
+ * first unrecognized word, which is left for the caller.
+ * Returns 0 on success, -1 on a malformed operand.
+ */
+static int parse_offset(int *argc_p, char ***argv_p, struct tc_u32_sel *sel)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+
+	for (; argc > 0; argc--, argv++) {
+		if (matches(*argv, "plus") == 0) {
+			int fixed;
+			NEXT_ARG();
+			if (get_integer(&fixed, *argv, 0))
+				return -1;
+			sel->off = fixed;
+			sel->flags |= TC_U32_OFFSET;
+		} else if (matches(*argv, "at") == 0) {
+			int where;
+			NEXT_ARG();
+			if (get_integer(&where, *argv, 0))
+				return -1;
+			sel->offoff = where;
+			if (where & 1) {
+				fprintf(stderr, "offset \"at\" must be even\n");
+				return -1;
+			}
+			sel->flags |= TC_U32_VAROFFSET;
+		} else if (matches(*argv, "mask") == 0) {
+			__u16 bits;
+			NEXT_ARG();
+			if (get_u16(&bits, *argv, 16))
+				return -1;
+			sel->offmask = htons(bits);
+			sel->flags |= TC_U32_VAROFFSET;
+		} else if (matches(*argv, "shift") == 0) {
+			int count;
+			NEXT_ARG();
+			if (get_integer(&count, *argv, 0))
+				return -1;
+			sel->offshift = count;
+			sel->flags |= TC_U32_VAROFFSET;
+		} else if (matches(*argv, "eat") == 0) {
+			sel->flags |= TC_U32_EAT;
+		} else {
+			break;
+		}
+	}
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+/*
+ * Parse the operands of a "hashkey" clause into sel. Recognized words:
+ *   mask M   32-bit hexadecimal mask, stored in network order in sel->hmask
+ *   at N     offset of the hashed word (must be a multiple of 4), in sel->hoff
+ * Parsing stops at the first unrecognized word, which is left for the caller.
+ * Returns 0 on success, -1 on a malformed operand.
+ */
+static int parse_hashkey(int *argc_p, char ***argv_p, struct tc_u32_sel *sel)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+
+	for (; argc > 0; argc--, argv++) {
+		if (matches(*argv, "mask") == 0) {
+			__u32 bits;
+			NEXT_ARG();
+			if (get_u32(&bits, *argv, 16))
+				return -1;
+			sel->hmask = htonl(bits);
+		} else if (matches(*argv, "at") == 0) {
+			int where;
+			NEXT_ARG();
+			if (get_integer(&where, *argv, 0))
+				return -1;
+			if (where & 3)
+				return -1;
+			sel->hoff = where;
+		} else {
+			break;
+		}
+	}
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+/*
+ * Parse the command-line options of a u32 filter and append them to netlink
+ * message n inside a nested TCA_OPTIONS attribute.
+ *
+ * handle    optional filter handle in get_u32_handle() syntax
+ * argc/argv remaining command-line words
+ *
+ * Selector keys from "match", "offset" and "hashkey" accumulate in a local
+ * selector, which is sent as TCA_U32_SEL only if at least one "match" was
+ * given. "ht" and "sample" together build the TCA_U32_HASH value.
+ * Returns 0 on success, -1 after printing a diagnostic on a parse error.
+ */
+static int u32_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct {
+		struct tc_u32_sel sel;
+		struct tc_u32_key keys[128];
+	} sel;
+	struct tcmsg *t = NLMSG_DATA(n);
+	struct rtattr *tail;
+	int sel_ok = 0;
+	int sample_ok = 0;
+	__u32 htid = 0;
+	__u32 order = 0;
+
+	memset(&sel, 0, sizeof(sel));
+
+	if (handle && get_u32_handle(&t->tcm_handle, handle)) {
+		fprintf(stderr, "Illegal filter ID\n");
+		return -1;
+	}
+
+	if (argc == 0)
+		return 0;
+
+	/* Remember where the nested attribute starts; its length is fixed up below. */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, 4096, TCA_OPTIONS, NULL, 0);
+
+	while (argc > 0) {
+		if (matches(*argv, "match") == 0) {
+			NEXT_ARG();
+			if (parse_selector(&argc, &argv, &sel.sel)) {
+				fprintf(stderr, "Illegal \"match\"\n");
+				return -1;
+			}
+			sel_ok++;
+			/* the sub-parser already advanced argc/argv */
+			continue;
+		} else if (matches(*argv, "offset") == 0) {
+			NEXT_ARG();
+			if (parse_offset(&argc, &argv, &sel.sel)) {
+				fprintf(stderr, "Illegal \"offset\"\n");
+				return -1;
+			}
+			continue;
+		} else if (matches(*argv, "hashkey") == 0) {
+			NEXT_ARG();
+			if (parse_hashkey(&argc, &argv, &sel.sel)) {
+				fprintf(stderr, "Illegal \"hashkey\"\n");
+				return -1;
+			}
+			continue;
+		} else if (matches(*argv, "classid") == 0 ||
+			   strcmp(*argv, "flowid") == 0) {
+			unsigned handle;
+			NEXT_ARG();
+			if (get_tc_classid(&handle, *argv)) {
+				fprintf(stderr, "Illegal \"classid\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_U32_CLASSID, &handle, 4);
+			sel.sel.flags |= TC_U32_TERMINAL;
+		} else if (matches(*argv, "divisor") == 0) {
+			unsigned divisor;
+			NEXT_ARG();
+			if (get_unsigned(&divisor, *argv, 0) || divisor == 0 ||
+			    divisor > 0x100) {
+				fprintf(stderr, "Illegal \"divisor\"\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_U32_DIVISOR, &divisor, 4);
+		} else if (matches(*argv, "order") == 0) {
+			NEXT_ARG();
+			if (get_u32(&order, *argv, 0)) {
+				fprintf(stderr, "Illegal \"order\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "link") == 0) {
+			unsigned handle;
+			NEXT_ARG();
+			if (get_u32_handle(&handle, *argv)) {
+				fprintf(stderr, "Illegal \"link\"\n");
+				return -1;
+			}
+			if (handle && TC_U32_NODE(handle)) {
+				fprintf(stderr, "\"link\" must be a hash table.\n");
+				return -1;
+			}
+			addattr_l(n, 4096, TCA_U32_LINK, &handle, 4);
+		} else if (strcmp(*argv, "ht") == 0) {
+			unsigned handle;
+			NEXT_ARG();
+			if (get_u32_handle(&handle, *argv)) {
+				fprintf(stderr, "Illegal \"ht\"\n");
+				return -1;
+			}
+			if (handle && TC_U32_NODE(handle)) {
+				fprintf(stderr, "\"ht\" must be a hash table.\n");
+				return -1;
+			}
+			/* keep a bucket already chosen by "sample" */
+			if (sample_ok)
+				htid = (htid&0xFF000)|(handle&0xFFF00000);
+			else
+				htid = (handle&0xFFFFF000);
+		} else if (strcmp(*argv, "sample") == 0) {
+			__u32 hash;
+			struct {
+				struct tc_u32_sel sel;
+				struct tc_u32_key keys[4];
+			} sel2;
+
+			/*
+			 * pack_key() reads sel->nkeys and scans the existing keys,
+			 * so the scratch selector must start out empty.
+			 */
+			memset(&sel2, 0, sizeof(sel2));
+			NEXT_ARG();
+			if (parse_selector(&argc, &argv, &sel2.sel)) {
+				fprintf(stderr, "Illegal \"sample\"\n");
+				return -1;
+			}
+			if (sel2.sel.nkeys != 1) {
+				fprintf(stderr, "\"sample\" must contain exactly ONE key.\n");
+				return -1;
+			}
+			/* fold the masked sample value down to one byte for the bucket */
+			hash = sel2.sel.keys[0].val&sel2.sel.keys[0].mask;
+			hash ^= hash>>16;
+			hash ^= hash>>8;
+			htid = ((hash<<12)&0xFF000)|(htid&0xFFF00000);
+			sample_ok = 1;
+			continue;
+		} else if (matches(*argv, "police") == 0) {
+			NEXT_ARG();
+			if (parse_police(&argc, &argv, TCA_U32_POLICE, n)) {
+				fprintf(stderr, "Illegal \"police\"\n");
+				return -1;
+			}
+			continue;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (order) {
+		if (TC_U32_NODE(t->tcm_handle) && order != TC_U32_NODE(t->tcm_handle)) {
+			fprintf(stderr, "\"order\" contradicts \"handle\"\n");
+			return -1;
+		}
+		t->tcm_handle |= order;
+	}
+
+	if (htid)
+		addattr_l(n, 4096, TCA_U32_HASH, &htid, 4);
+	/* send only the header plus the keys actually filled in */
+	if (sel_ok)
+		addattr_l(n, 4096, TCA_U32_SEL, &sel, sizeof(sel.sel)+sel.sel.nkeys*sizeof(struct tc_u32_key));
+	tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail;
+	return 0;
+}
+
+/*
+ * Print the attributes of a u32 filter to f.
+ *
+ * opt is the nested options attribute (nothing is printed when it is NULL),
+ * handle the filter handle.
+ * Returns 0, or -1 when the TCA_U32_SEL payload is shorter than the selector
+ * header.
+ */
+static int u32_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle)
+{
+	struct rtattr *tb[TCA_U32_MAX+1];
+	struct tc_u32_sel *sel = NULL;
+	int terminal;
+
+	if (!opt)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (handle) {
+		SPRINT_BUF(b1);
+		fprintf(f, "fh %s ", sprint_u32_handle(handle, b1));
+	}
+	if (TC_U32_NODE(handle)) {
+		fprintf(f, "order %d ", TC_U32_NODE(handle));
+	}
+
+	if (tb[TCA_U32_SEL]) {
+		if (RTA_PAYLOAD(tb[TCA_U32_SEL]) < sizeof(*sel))
+			return -1;
+
+		sel = RTA_DATA(tb[TCA_U32_SEL]);
+	}
+	terminal = sel && (sel->flags&TC_U32_TERMINAL);
+
+	/* Either a hash table node (divisor) or a key node (table and bucket). */
+	if (tb[TCA_U32_DIVISOR]) {
+		fprintf(f, "ht divisor %d ", *(__u32*)RTA_DATA(tb[TCA_U32_DIVISOR]));
+	} else if (tb[TCA_U32_HASH]) {
+		__u32 htid = *(__u32*)RTA_DATA(tb[TCA_U32_HASH]);
+		fprintf(f, "key ht %x bkt %x ", TC_U32_USERHTID(htid), TC_U32_HASH(htid));
+	} else {
+		fprintf(f, "??? ");
+	}
+
+	if (tb[TCA_U32_CLASSID]) {
+		SPRINT_BUF(b1);
+		/* a flow id on a selector without the terminal flag is starred */
+		fprintf(f, "%sflowid %s ",
+			terminal ? "" : "*",
+			sprint_tc_classid(*(__u32*)RTA_DATA(tb[TCA_U32_CLASSID]), b1));
+	} else if (terminal) {
+		fprintf(f, "terminal flowid ??? ");
+	}
+	if (tb[TCA_U32_LINK]) {
+		SPRINT_BUF(b1);
+		fprintf(f, "link %s ", sprint_u32_handle(*(__u32*)RTA_DATA(tb[TCA_U32_LINK]), b1));
+	}
+	if (tb[TCA_U32_POLICE]) {
+		fprintf(f, "\n");
+		tc_print_police(f, tb[TCA_U32_POLICE]);
+	}
+
+	if (sel) {
+		int idx;
+
+		for (idx = 0; idx < sel->nkeys; idx++) {
+			struct tc_u32_key *key = &sel->keys[idx];
+
+			fprintf(f, "\n match %08x/%08x at %s%d",
+				(unsigned int)ntohl(key->val),
+				(unsigned int)ntohl(key->mask),
+				key->offmask ? "nexthdr+" : "",
+				key->off);
+		}
+
+		if (sel->flags&(TC_U32_VAROFFSET|TC_U32_OFFSET)) {
+			fprintf(f, "\n offset ");
+			if (sel->flags&TC_U32_VAROFFSET)
+				fprintf(f, "%04x>>%d at %d ", ntohs(sel->offmask), sel->offshift, sel->offoff);
+			if (sel->off)
+				fprintf(f, "plus %d ", sel->off);
+		}
+		if (sel->flags&TC_U32_EAT)
+			fprintf(f, " eat ");
+
+		if (sel->hmask) {
+			fprintf(f, "\n hash mask %08x at %d ",
+				(unsigned int)ntohl(sel->hmask), sel->hoff);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Descriptor for the "u32" filter: its id string followed by the option
+ * parser and the option printer defined above.
+ */
+struct filter_util u32_util = {
+ NULL,
+ "u32",
+ u32_parse_opt,
+ u32_print_opt,
+};
diff --git a/tc/m_estimator.c b/tc/m_estimator.c
index e69de29b..0f9808e5 100644
--- a/tc/m_estimator.c
+++ b/tc/m_estimator.c
@@ -0,0 +1,64 @@
+/*
+ * m_estimator.c Parse/print estimator module options.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+static void est_help(void) __attribute__((noreturn));
+
+/* Print the estimator usage summary to stderr and exit; never returns. */
+static void est_help(void)
+{
+	fputs("Usage: ... estimator INTERVAL TIME-CONST\n"
+	      " INTERVAL is interval between measurements\n"
+	      " TIME-CONST is averaging time constant\n"
+	      "Example: ... est 1sec 8sec\n", stderr);
+	exit(-1);
+}
+
+/*
+ * Parse "estimator INTERVAL TIME-CONST" into *est.
+ *
+ * On entry *p_argv points at the "estimator" keyword; on return it points at
+ * the TIME-CONST word. Reports a duplicate argument when est->ewma_log is
+ * already non-zero. Either operand may be "help", which prints usage and
+ * exits. Exits with an error when tc_setup_estimator() rejects the
+ * parameters. Returns 0 otherwise.
+ */
+int parse_estimator(int *p_argc, char ***p_argv, struct tc_estimator *est)
+{
+	int argc = *p_argc;
+	char **argv = *p_argv;
+	unsigned interval;
+	unsigned time_const;
+
+	NEXT_ARG();
+	if (est->ewma_log)
+		duparg("estimator", *argv);
+	if (matches(*argv, "help") == 0)
+		est_help();
+	if (get_usecs(&interval, *argv))
+		invarg("estimator", "invalid estimator interval");
+
+	NEXT_ARG();
+	if (matches(*argv, "help") == 0)
+		est_help();
+	if (get_usecs(&time_const, *argv))
+		invarg("estimator", "invalid estimator time constant");
+
+	if (tc_setup_estimator(interval, time_const, est) < 0) {
+		fprintf(stderr, "Error: estimator parameters are out of range.\n");
+		exit(-1);
+	}
+	if (show_raw)
+		fprintf(stderr, "[estimator i=%u e=%u]\n", est->interval, est->ewma_log);
+
+	*p_argc = argc;
+	*p_argv = argv;
+	return 0;
+}
diff --git a/tc/m_police.c b/tc/m_police.c
index e69de29b..0e76efc5 100644
--- a/tc/m_police.c
+++ b/tc/m_police.c
@@ -0,0 +1,328 @@
+/*
+ * m_police.c Parse/print policing module options.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * FIXES: 19990619 - J Hadi Salim (hadi@cyberus.ca)
+ * simple addattr packaging fix.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+/* Write the usage summary of the "police" specification to stderr. */
+static void explain(void)
+{
+	fputs("Usage: ... police rate BPS burst BYTES[/BYTES] [ mtu BYTES[/BYTES] ]\n"
+	      " [ peakrate BPS ] [ avrate BPS ]\n"
+	      " [ ACTION ]\n"
+	      "Where: ACTION := reclassify | drop | continue \n", stderr);
+}
+
+/*
+ * Report on stderr that the value given for option `arg` is not
+ * acceptable. `arg` is the option name, printed inside double quotes.
+ */
+static void explain1(char *arg)
+{
+	fprintf(stderr,
+		"Illegal \"%s\"\n",
+		arg);
+}
+
+#define usage() return(-1)
+
+
+/*
+ * Translate a police action code into text.
+ *
+ * The known codes map to string literals; any other value is formatted
+ * as a decimal number into buf (at most len bytes) and buf is returned.
+ * The result must therefore be treated as read-only by the caller.
+ */
+char *police_action_n2a(int action, char *buf, int len)
+{
+	if (action == -1)
+		return "continue";
+	if (action == TC_POLICE_OK)
+		return "pass";
+	if (action == TC_POLICE_SHOT)
+		return "drop";
+	if (action == TC_POLICE_RECLASSIFY)
+		return "reclassify";
+
+	/* Unknown code: fall back to its numeric form. */
+	snprintf(buf, len, "%d", action);
+	return buf;
+}
+
+/*
+ * Translate the textual action `arg` into its numeric code.
+ *
+ * Accepted words are "continue", "drop"/"shot", "pass", "ok" (compared
+ * with strcmp, the others with matches()) and "reclassify"; anything
+ * else must be a plain decimal integer with no trailing characters.
+ *
+ * Returns 0 and stores the code in *result, or -1 (leaving *result
+ * untouched) when arg is not understood.
+ */
+int police_action_a2n(char *arg, int *result)
+{
+	int value;
+	char trailing;
+
+	if (!matches(arg, "continue")) {
+		value = -1;
+	} else if (!matches(arg, "drop") || !matches(arg, "shot")) {
+		value = TC_POLICE_SHOT;
+	} else if (!matches(arg, "pass") || !strcmp(arg, "ok")) {
+		value = TC_POLICE_OK;
+	} else if (!matches(arg, "reclassify")) {
+		value = TC_POLICE_RECLASSIFY;
+	} else if (sscanf(arg, "%d%c", &value, &trailing) != 1) {
+		/* Not a number, or a number followed by junk. */
+		return -1;
+	}
+
+	*result = value;
+	return 0;
+}
+
+
+/*
+ * Parse "ACTION" or "ACTION/RESULT".
+ *
+ * The part before the optional '/' is converted into *action, the part
+ * after it into *result (left untouched when there is no '/'). The
+ * string is split in place while parsing and the '/' is always put back
+ * before returning.
+ *
+ * Returns 0 on success, -1 when either part is not a valid action.
+ */
+int get_police_result(int *action, int *result, char *arg)
+{
+	char *slash = strchr(arg, '/');
+	int rc;
+
+	if (slash == NULL)
+		return police_action_a2n(arg, action) ? -1 : 0;
+
+	/* Temporarily terminate the first half at the separator. */
+	*slash = 0;
+	rc = police_action_a2n(arg, action);
+	*slash = '/';
+	if (rc)
+		return -1;
+
+	return police_action_a2n(slash + 1, result) ? -1 : 0;
+}
+
+/*
+ * Parse a "police" specification and append it to netlink message n.
+ *
+ * Words are consumed from *argv_p for as long as they are recognised
+ * here; parsing stops at the first unknown word, which is left for the
+ * caller. The collected settings are emitted as an attribute of type
+ * tca_id that encloses TCA_POLICE_TBF plus, when requested,
+ * TCA_POLICE_RATE, TCA_POLICE_PEAKRATE, TCA_POLICE_AVRATE and
+ * TCA_POLICE_RESULT.
+ *
+ * On success *argc_p / *argv_p are moved to the first unconsumed word
+ * and 0 is returned. -1 is returned when no word was consumed, when a
+ * value is malformed or given twice, or when the combination of options
+ * is inconsistent (rate without burst, peakrate without rate or mtu).
+ */
+int parse_police(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n)
+{
+	int argc = *argc_p;
+	char **argv = *argv_p;
+	int ok=0;
+	struct tc_police p;
+	__u32 rtab[256];
+	__u32 ptab[256];
+	__u32 avrate = 0;
+	int presult = 0;
+	unsigned buffer=0, mtu=0, mpu=0;
+	int Rcell_log=-1, Pcell_log = -1;
+	struct rtattr *tail;
+
+	memset(&p, 0, sizeof(p));
+	p.action = TC_POLICE_RECLASSIFY;
+
+	if (argc <= 0)
+		return -1;
+
+	while (argc > 0) {
+		if (matches(*argv, "index") == 0) {
+			NEXT_ARG();
+			/* The index is read in base 16. */
+			if (get_u32(&p.index, *argv, 16)) {
+				fprintf(stderr, "Illegal \"index\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "burst") == 0 ||
+			strcmp(*argv, "buffer") == 0 ||
+			strcmp(*argv, "maxburst") == 0) {
+			NEXT_ARG();
+			if (buffer) {
+				fprintf(stderr, "Double \"buffer/burst\" spec\n");
+				return -1;
+			}
+			if (get_size_and_cell(&buffer, &Rcell_log, *argv) < 0) {
+				explain1("buffer");
+				return -1;
+			}
+		} else if (strcmp(*argv, "mtu") == 0 ||
+			   strcmp(*argv, "minburst") == 0) {
+			NEXT_ARG();
+			if (mtu) {
+				fprintf(stderr, "Double \"mtu/minburst\" spec\n");
+				return -1;
+			}
+			if (get_size_and_cell(&mtu, &Pcell_log, *argv) < 0) {
+				explain1("mtu");
+				return -1;
+			}
+		} else if (strcmp(*argv, "mpu") == 0) {
+			NEXT_ARG();
+			if (mpu) {
+				fprintf(stderr, "Double \"mpu\" spec\n");
+				return -1;
+			}
+			if (get_size(&mpu, *argv)) {
+				explain1("mpu");
+				return -1;
+			}
+		} else if (strcmp(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (p.rate.rate) {
+				fprintf(stderr, "Double \"rate\" spec\n");
+				return -1;
+			}
+			if (get_rate(&p.rate.rate, *argv)) {
+				explain1("rate");
+				return -1;
+			}
+		} else if (strcmp(*argv, "avrate") == 0) {
+			NEXT_ARG();
+			if (avrate) {
+				fprintf(stderr, "Double \"avrate\" spec\n");
+				return -1;
+			}
+			if (get_rate(&avrate, *argv)) {
+				explain1("avrate");
+				return -1;
+			}
+		} else if (matches(*argv, "peakrate") == 0) {
+			NEXT_ARG();
+			if (p.peakrate.rate) {
+				fprintf(stderr, "Double \"peakrate\" spec\n");
+				return -1;
+			}
+			if (get_rate(&p.peakrate.rate, *argv)) {
+				explain1("peakrate");
+				return -1;
+			}
+		} else if (matches(*argv, "reclassify") == 0) {
+			p.action = TC_POLICE_RECLASSIFY;
+		} else if (matches(*argv, "drop") == 0 ||
+			   matches(*argv, "shot") == 0) {
+			p.action = TC_POLICE_SHOT;
+		} else if (matches(*argv, "continue") == 0) {
+			p.action = TC_POLICE_UNSPEC;
+		} else if (matches(*argv, "pass") == 0) {
+			p.action = TC_POLICE_OK;
+		} else if (strcmp(*argv, "action") == 0) {
+			NEXT_ARG();
+			if (get_police_result(&p.action, &presult, *argv)) {
+				fprintf(stderr, "Illegal \"action\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			/* Not ours: stop and leave the word to the caller. */
+			break;
+		}
+		ok++;
+		argc--; argv++;
+	}
+
+	if (!ok)
+		return -1;
+
+	/* A rate was given without a burst/buffer size. */
+	if (p.rate.rate && !buffer) {
+		fprintf(stderr, "\"rate\" requires \"burst\".\n");
+		return -1;
+	}
+	if (p.peakrate.rate) {
+		if (!p.rate.rate) {
+			fprintf(stderr, "\"peakrate\" requires \"rate\".\n");
+			return -1;
+		}
+		if (!mtu) {
+			fprintf(stderr, "\"mtu\" is required, if \"peakrate\" is requested.\n");
+			return -1;
+		}
+	}
+
+	if (p.rate.rate) {
+		if ((Rcell_log = tc_calc_rtable(p.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) {
+			fprintf(stderr, "POLICE: failed to calculate rate table.\n");
+			return -1;
+		}
+		p.burst = tc_calc_xmittime(p.rate.rate, buffer);
+		p.rate.cell_log = Rcell_log;
+		p.rate.mpu = mpu;
+	}
+	p.mtu = mtu;
+	if (p.peakrate.rate) {
+		if ((Pcell_log = tc_calc_rtable(p.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) {
+			fprintf(stderr, "POLICE: failed to calculate peak rate table.\n");
+			return -1;
+		}
+		p.peakrate.cell_log = Pcell_log;
+		p.peakrate.mpu = mpu;
+	}
+
+	/*
+	 * Remember where the enclosing attribute starts, add it empty, add
+	 * the inner attributes, then patch its length to cover them all.
+	 */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, 1024, tca_id, NULL, 0);
+	addattr_l(n, 2024, TCA_POLICE_TBF, &p, sizeof(p));
+	if (p.rate.rate)
+		addattr_l(n, 3024, TCA_POLICE_RATE, rtab, 1024);
+	if (p.peakrate.rate)
+		addattr_l(n, 4096, TCA_POLICE_PEAKRATE, ptab, 1024);
+	if (avrate)
+		addattr32(n, 4096, TCA_POLICE_AVRATE, avrate);
+	if (presult)
+		addattr32(n, 4096, TCA_POLICE_RESULT, presult);
+
+	tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail;
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+
+/*
+ * Print the police settings carried in attribute `arg` to f.
+ *
+ * Returns 0 when arg is NULL, when no TCA_POLICE_TBF attribute is
+ * present (a marker is printed) or after printing. Returns -1, after
+ * printing a "[truncated ...]" marker, when TCA_POLICE_TBF,
+ * TCA_POLICE_RESULT or TCA_POLICE_AVRATE is shorter than the value that
+ * is read from it.
+ */
+int tc_print_police(FILE *f, struct rtattr *arg)
+{
+	SPRINT_BUF(b1);
+	struct tc_police *p;
+	struct rtattr *tb[TCA_POLICE_MAX+1];
+	unsigned buffer;
+
+	if (arg == NULL)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_POLICE_MAX, RTA_DATA(arg), RTA_PAYLOAD(arg));
+
+	if (tb[TCA_POLICE_TBF] == NULL) {
+		fprintf(f, "[NULL police tbf]");
+		return 0;
+	}
+	if (RTA_PAYLOAD(tb[TCA_POLICE_TBF]) < sizeof(*p)) {
+		fprintf(f, "[truncated police tbf]");
+		return -1;
+	}
+	/* Validate the optional attributes before they are dereferenced. */
+	if (tb[TCA_POLICE_RESULT] &&
+	    RTA_PAYLOAD(tb[TCA_POLICE_RESULT]) < sizeof(int)) {
+		fprintf(f, "[truncated police result]");
+		return -1;
+	}
+	if (tb[TCA_POLICE_AVRATE] &&
+	    RTA_PAYLOAD(tb[TCA_POLICE_AVRATE]) < sizeof(__u32)) {
+		fprintf(f, "[truncated police avrate]");
+		return -1;
+	}
+	p = RTA_DATA(tb[TCA_POLICE_TBF]);
+
+	fprintf(f, "police %x ", p->index);
+	fprintf(f, "action %s", police_action_n2a(p->action, b1, sizeof(b1)));
+	if (tb[TCA_POLICE_RESULT]) {
+		fprintf(f, "/%s ", police_action_n2a(*(int*)RTA_DATA(tb[TCA_POLICE_RESULT]), b1, sizeof(b1)));
+	} else
+		fprintf(f, " ");
+	fprintf(f, "rate %s ", sprint_rate(p->rate.rate, b1));
+	/* Convert the stored burst (ticks) back to a size via the rate. */
+	buffer = ((double)p->rate.rate*tc_core_tick2usec(p->burst))/1000000;
+	fprintf(f, "burst %s ", sprint_size(buffer, b1));
+	fprintf(f, "mtu %s ", sprint_size(p->mtu, b1));
+	if (show_raw)
+		fprintf(f, "[%08x] ", p->burst);
+	if (p->peakrate.rate)
+		fprintf(f, "peakrate %s ", sprint_rate(p->peakrate.rate, b1));
+	if (tb[TCA_POLICE_AVRATE])
+		fprintf(f, "avrate %s ", sprint_rate(*(__u32*)RTA_DATA(tb[TCA_POLICE_AVRATE]), b1));
+
+	return 0;
+}
+
diff --git a/tc/q_atm.c b/tc/q_atm.c
index e69de29b..d1745387 100644
--- a/tc/q_atm.c
+++ b/tc/q_atm.c
@@ -0,0 +1,268 @@
+/*
+ * q_atm.c ATM.
+ *
+ * Hacked 1998-2000 by Werner Almesberger, EPFL ICA
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <atm.h>
+#include <linux/atmdev.h>
+#include <linux/atmarp.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+
+#define MAX_HDR_LEN 64
+
+#define usage() return(-1)
+
+
+/*
+ * Qdisc-level option parser for "atm": no options are accepted.
+ * Returns 0 when argc is zero, otherwise prints the usage line to
+ * stderr and returns -1.
+ */
+static int atm_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	if (argc == 0)
+		return 0;
+
+	fputs("Usage: atm\n", stderr);
+	return -1;
+}
+
+
+/* Write the usage summary of the atm class options to stderr. */
+static void explain(void)
+{
+	fputs("Usage: ... atm ( pvc ADDR | svc ADDR [ sap SAP ] ) "
+	      "[ qos QOS ] [ sndbuf BYTES ]\n"
+	      " [ hdr HEX... ] [ excess ( CLASSID | clp ) ] "
+	      "[ clip ]\n", stderr);
+}
+
+
+/*
+ * Parse the options of an atm class, open and connect the ATM socket
+ * they describe, and append TCA_OPTIONS (enclosing TCA_ATM_FD and the
+ * optional TCA_ATM_EXCESS / TCA_ATM_HDR) to netlink message n.
+ *
+ * Returns 0 on success. In that case the socket is left open, since its
+ * descriptor number is what gets sent in TCA_ATM_FD. Returns -1 on a
+ * malformed option or a failing system call, and 1 on an unknown
+ * keyword; on every failure after socket() the descriptor is closed.
+ */
+static int atm_parse_class_opt(struct qdisc_util *qu, int argc, char **argv,
+    struct nlmsghdr *n)
+{
+	struct sockaddr_atmsvc addr;
+	struct atm_qos qos;
+	struct atm_sap sap;
+	unsigned char hdr[MAX_HDR_LEN];
+	__u32 excess = 0;
+	struct rtattr *tail;
+	int sndbuf = 0;
+	int hdr_len = -1;
+	int set_clip = 0;
+	int s;
+
+	memset(&addr,0,sizeof(addr));
+	/* Defaults used unless "qos" / "sap" override them. */
+	(void) text2qos("aal5,ubr:sdu=9180,rx:none",&qos,0);
+	(void) text2sap("blli:l2=iso8802",&sap,0);
+	while (argc > 0) {
+		if (!strcmp(*argv,"pvc")) {
+			NEXT_ARG();
+			if (text2atm(*argv,(struct sockaddr *) &addr,
+			    sizeof(addr),T2A_PVC | T2A_NAME) < 0) {
+				explain();
+				return -1;
+			}
+		}
+		else if (!strcmp(*argv,"svc")) {
+			NEXT_ARG();
+			if (text2atm(*argv,(struct sockaddr *) &addr,
+			    sizeof(addr),T2A_SVC | T2A_NAME) < 0) {
+				explain();
+				return -1;
+			}
+		}
+		else if (!strcmp(*argv,"qos")) {
+			NEXT_ARG();
+			if (text2qos(*argv,&qos,0) < 0) {
+				explain();
+				return -1;
+			}
+		}
+		else if (!strcmp(*argv,"sndbuf")) {
+			char *end;
+
+			NEXT_ARG();
+			sndbuf = strtol(*argv,&end,0);
+			if (*end) {
+				explain();
+				return -1;
+			}
+		}
+		else if (!strcmp(*argv,"sap")) {
+			NEXT_ARG();
+			/* A SAP is only accepted after an "svc" address. */
+			if (addr.sas_family != AF_ATMSVC ||
+			    text2sap(*argv,&sap,T2A_NAME) < 0) {
+				explain();
+				return -1;
+			}
+		}
+		else if (!strcmp(*argv,"hdr")) {
+			unsigned char *ptr;
+			char *walk;
+
+			NEXT_ARG();
+			ptr = hdr;
+			/* Pairs of hex digits, optionally separated by '.'. */
+			for (walk = *argv; *walk; walk++) {
+				unsigned tmp;
+
+				if (ptr == hdr+MAX_HDR_LEN) {
+					fprintf(stderr,"header is too long\n");
+					return -1;
+				}
+				if (*walk == '.') continue;
+				/* <ctype.h> functions need unsigned char values. */
+				if (!isxdigit((unsigned char) walk[0]) || !walk[1] ||
+				    !isxdigit((unsigned char) walk[1])) {
+					explain();
+					return -1;
+				}
+				sscanf(walk,"%2x",&tmp);
+				*ptr++ = tmp;
+				walk++;
+			}
+			hdr_len = ptr-hdr;
+		}
+		else if (!strcmp(*argv,"excess")) {
+			NEXT_ARG();
+			if (!strcmp(*argv,"clp")) excess = 0;
+			else if (get_tc_classid(&excess,*argv)) {
+				explain();
+				return -1;
+			}
+		}
+		else if (!strcmp(*argv,"clip")) {
+			set_clip = 1;
+		}
+		else {
+			explain();
+			return 1;
+		}
+		argc--;
+		argv++;
+	}
+	s = socket(addr.sas_family,SOCK_DGRAM,0);
+	if (s < 0) {
+		perror("socket");
+		return -1;
+	}
+	if (setsockopt(s,SOL_ATM,SO_ATMQOS,&qos,sizeof(qos)) < 0) {
+		perror("SO_ATMQOS");
+		goto fail;
+	}
+	if (sndbuf &&
+	    setsockopt(s,SOL_SOCKET,SO_SNDBUF,&sndbuf,sizeof(sndbuf)) < 0) {
+		perror("SO_SNDBUF");
+		goto fail;
+	}
+	if (addr.sas_family == AF_ATMSVC && setsockopt(s,SOL_ATM,SO_ATMSAP,
+	    &sap,sizeof(sap)) < 0) {
+		perror("SO_ATMSAP");
+		goto fail;
+	}
+	if (connect(s,(struct sockaddr *) &addr,addr.sas_family == AF_ATMPVC ?
+	    sizeof(struct sockaddr_atmpvc) : sizeof(addr)) < 0) {
+		perror("connect");
+		goto fail;
+	}
+	if (set_clip && ioctl(s,ATMARP_MKIP,0) < 0) {
+		perror("ioctl ATMARP_MKIP");
+		goto fail;
+	}
+	/* Open TCA_OPTIONS, add the inner attributes, then fix its length. */
+	tail = (struct rtattr *) (((void *) n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n,1024,TCA_OPTIONS,NULL,0);
+	addattr_l(n,1024,TCA_ATM_FD,&s,sizeof(s));
+	if (excess) addattr_l(n,1024,TCA_ATM_EXCESS,&excess,sizeof(excess));
+	if (hdr_len != -1) addattr_l(n,1024,TCA_ATM_HDR,hdr,hdr_len);
+	tail->rta_len = (((void *) n)+NLMSG_ALIGN(n->nlmsg_len))-(void *) tail;
+	return 0;
+
+fail:
+	/* perror() has already run, so close() cannot disturb the message. */
+	close(s);
+	return -1;
+}
+
+
+
+/*
+ * Print the atm attributes nested in `opt` (address, header bytes,
+ * excess class, state) to f. Malformed or unknown values are reported
+ * on stderr and skipped. Always returns 0.
+ */
+static int atm_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_ATM_MAX+1];
+	char buffer[MAX_ATM_ADDR_LEN+1];
+
+	if (!opt) return 0;
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_ATM_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+	if (tb[TCA_ATM_ADDR]) {
+		if (RTA_PAYLOAD(tb[TCA_ATM_ADDR]) <
+		    sizeof(struct sockaddr_atmpvc))
+			fprintf(stderr,"ATM: address too short\n");
+		else if (atm2text(buffer,MAX_ATM_ADDR_LEN,
+		    RTA_DATA(tb[TCA_ATM_ADDR]),A2T_PRETTY | A2T_NAME) < 0)
+			/* buffer holds nothing usable: do not print it. */
+			fprintf(stderr,"atm2text error\n");
+		else
+			fprintf(f,"pvc %s ",buffer);
+	}
+	if (tb[TCA_ATM_HDR]) {
+		int i;
+
+		/* Bytes are printed as hex separated by '.'; an empty
+		   header is shown as a lone " ." */
+		fprintf(f,"hdr");
+		for (i = 0; i < RTA_PAYLOAD(tb[TCA_ATM_HDR]); i++)
+			fprintf(f,"%c%02x",i ? '.' : ' ',
+			    ((unsigned char *) RTA_DATA(tb[TCA_ATM_HDR]))[i]);
+		if (!i) fprintf(f," .");
+		fprintf(f," ");
+	}
+	if (tb[TCA_ATM_EXCESS]) {
+		__u32 excess;
+
+		if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS]) < sizeof(excess))
+			fprintf(stderr,"ATM: excess class ID too short\n");
+		else {
+			excess = *(__u32 *) RTA_DATA(tb[TCA_ATM_EXCESS]);
+			if (!excess) fprintf(f,"excess clp ");
+			else {
+				char buf[64];
+
+				print_tc_classid(buf,sizeof(buf),excess);
+				fprintf(f,"excess %s ",buf);
+			}
+		}
+	}
+	if (tb[TCA_ATM_STATE]) {
+		static const char *map[] = { ATM_VS2TXT_MAP };
+		int state;
+
+		if (RTA_PAYLOAD(tb[TCA_ATM_STATE]) < sizeof(state))
+			fprintf(stderr,"ATM: state field too short\n");
+		else {
+			state = *(int *) RTA_DATA(tb[TCA_ATM_STATE]);
+			/* Never index the name table with an unchecked value. */
+			if (state < 0 ||
+			    state >= (int) (sizeof(map)/sizeof(map[0])))
+				fprintf(stderr,"ATM: unknown state %d\n",state);
+			else
+				fprintf(f,"%s ",map[state]);
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * Extended-statistics printer for "atm": prints nothing, ignores all
+ * of its arguments and returns 0.
+ */
+static int atm_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+	(void) qu;
+	(void) f;
+	(void) xstats;
+	return 0;
+}
+
+
+/*
+ * Handler table for the "atm" qdisc.
+ *
+ * The initializer is positional, so the entries must stay in the member
+ * order of struct qdisc_util, which is declared outside this file.
+ * NOTE(review): the comments below infer each slot's role from the
+ * handler names - confirm against the struct declaration.
+ */
+struct qdisc_util atm_util = {
+ NULL,
+ "atm", /* name of the qdisc */
+ atm_parse_opt, /* qdisc-level handlers */
+ atm_print_opt,
+ atm_print_xstats,
+
+ atm_parse_class_opt, /* presumably the class-level handlers */
+ atm_print_opt /* the same printer is listed a second time */
+};
diff --git a/tc/q_cbq.c b/tc/q_cbq.c
index e69de29b..51ed87a2 100644
--- a/tc/q_cbq.c
+++ b/tc/q_cbq.c
@@ -0,0 +1,555 @@
+/*
+ * q_cbq.c CBQ.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+#include "tc_cbq.h"
+
+/* Write the usage summary of the cbq class options to stderr. */
+static void explain_class(void)
+{
+	fputs("Usage: ... cbq bandwidth BPS rate BPS maxburst PKTS [ avpkt BYTES ]\n"
+	      " [ minburst PKTS ] [ bounded ] [ isolated ]\n"
+	      " [ allot BYTES ] [ mpu BYTES ] [ weight RATE ]\n"
+	      " [ prio NUMBER ] [ cell BYTES ] [ ewma LOG ]\n"
+	      " [ estimator INTERVAL TIME_CONSTANT ]\n"
+	      " [ split CLASSID ] [ defmap MASK/CHANGE ]\n", stderr);
+}
+
+/* Write the usage summary of the cbq qdisc options to stderr. */
+static void explain(void)
+{
+	fputs("Usage: ... cbq bandwidth BPS avpkt BYTES [ mpu BYTES ]\n"
+	      " [ cell BYTES ] [ ewma LOG ]\n", stderr);
+}
+
+/*
+ * Report on stderr that the value given for option `arg` is not
+ * acceptable. `arg` is the option name, printed inside double quotes.
+ */
+static void explain1(char *arg)
+{
+	fprintf(stderr,
+		"Illegal \"%s\"\n",
+		arg);
+}
+
+#define usage() return(-1)
+
+/*
+ * Parse the options of a cbq qdisc and append TCA_OPTIONS (enclosing
+ * TCA_CBQ_RATE, TCA_CBQ_LSSOPT and TCA_CBQ_RTAB) to netlink message n.
+ *
+ * "bandwidth" (alias "rate") and "avpkt" are mandatory. Returns 0 on
+ * success and -1, after printing a diagnostic, on any error.
+ */
+static int cbq_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tc_ratespec r;
+	struct tc_cbq_lssopt lss;
+	__u32 rtab[256];
+	unsigned mpu=0, avpkt=0, allot=0;
+	int cell_log=-1;
+	int ewma_log=-1;	/* -1: not given, use TC_CBQ_DEF_EWMA */
+	struct rtattr *tail;
+
+	memset(&lss, 0, sizeof(lss));
+	memset(&r, 0, sizeof(r));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "bandwidth") == 0 ||
+		    strcmp(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (get_rate(&r.rate, *argv)) {
+				explain1("bandwidth");
+				return -1;
+			}
+		} else if (strcmp(*argv, "ewma") == 0) {
+			unsigned val;
+			NEXT_ARG();
+			/* Parse into an unsigned first: a huge value must be
+			 * rejected, not wrap to a negative "not given". */
+			if (get_unsigned(&val, *argv, 0)) {
+				explain1("ewma");
+				return -1;
+			}
+			if (val > 31) {
+				fprintf(stderr, "ewma_log must be < 32\n");
+				return -1;
+			}
+			ewma_log = val;
+		} else if (strcmp(*argv, "cell") == 0) {
+			unsigned cell;
+			int i;
+			NEXT_ARG();
+			if (get_size(&cell, *argv)) {
+				explain1("cell");
+				return -1;
+			}
+			/* Find log2(cell); 1U keeps the i == 31 shift defined. */
+			for (i=0; i<32; i++)
+				if ((1U<<i) == cell)
+					break;
+			if (i>=32) {
+				fprintf(stderr, "cell must be 2^n\n");
+				return -1;
+			}
+			cell_log = i;
+		} else if (strcmp(*argv, "avpkt") == 0) {
+			NEXT_ARG();
+			if (get_size(&avpkt, *argv)) {
+				explain1("avpkt");
+				return -1;
+			}
+		} else if (strcmp(*argv, "mpu") == 0) {
+			NEXT_ARG();
+			if (get_size(&mpu, *argv)) {
+				explain1("mpu");
+				return -1;
+			}
+		} else if (strcmp(*argv, "allot") == 0) {
+			NEXT_ARG();
+			/* Accepted for backward compatibility; the value is
+			 * only used below as the packet size handed to
+			 * tc_calc_rtable(). */
+			if (get_size(&allot, *argv)) {
+				explain1("allot");
+				return -1;
+			}
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	/* OK. All options are parsed. */
+
+	if (r.rate == 0) {
+		fprintf(stderr, "CBQ: bandwidth is required parameter.\n");
+		return -1;
+	}
+	if (avpkt == 0) {
+		fprintf(stderr, "CBQ: \"avpkt\" is required.\n");
+		return -1;
+	}
+	/* allot is never allowed to be below 1.5 * avpkt. */
+	if (allot < (avpkt*3)/2)
+		allot = (avpkt*3)/2;
+
+	if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, allot, mpu)) < 0) {
+		fprintf(stderr, "CBQ: failed to calculate rate table.\n");
+		return -1;
+	}
+	r.cell_log = cell_log;
+	r.mpu = mpu;
+
+	if (ewma_log < 0)
+		ewma_log = TC_CBQ_DEF_EWMA;
+	lss.ewma_log = ewma_log;
+	lss.maxidle = tc_cbq_calc_maxidle(r.rate, r.rate, avpkt, lss.ewma_log, 0);
+	lss.change = TCF_CBQ_LSS_MAXIDLE|TCF_CBQ_LSS_EWMA|TCF_CBQ_LSS_AVPKT;
+	lss.avpkt = avpkt;
+
+	/* Open TCA_OPTIONS, add the inner attributes, then fix its length. */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 1024, TCA_CBQ_RATE, &r, sizeof(r));
+	addattr_l(n, 1024, TCA_CBQ_LSSOPT, &lss, sizeof(lss));
+	addattr_l(n, 3024, TCA_CBQ_RTAB, rtab, 1024);
+	if (show_raw) {
+		int i;
+		for (i=0; i<256; i++)
+			printf("%u ", rtab[i]);
+		printf("\n");
+	}
+	tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail;
+	return 0;
+}
+
+/*
+ * Parse the options of a cbq class and append TCA_OPTIONS to netlink
+ * message n. Depending on what was given it encloses TCA_CBQ_LSSOPT,
+ * TCA_CBQ_WRROPT, TCA_CBQ_FOPT and TCA_CBQ_RATE + TCA_CBQ_RTAB.
+ *
+ * Returns 0 on success and -1, after printing a diagnostic, on any
+ * error.
+ */
+static int cbq_parse_class_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	int wrr_ok=0, fopt_ok=0;
+	struct tc_ratespec r;
+	struct tc_cbq_lssopt lss;
+	struct tc_cbq_wrropt wrr;
+	struct tc_cbq_fopt fopt;
+	__u32 rtab[256];
+	unsigned mpu=0;
+	int cell_log=-1;
+	int ewma_log=-1;	/* -1: not given, use TC_CBQ_DEF_EWMA */
+	unsigned bndw = 0;
+	unsigned minburst=0, maxburst=0;
+	struct rtattr *tail;
+
+	memset(&r, 0, sizeof(r));
+	memset(&lss, 0, sizeof(lss));
+	memset(&wrr, 0, sizeof(wrr));
+	memset(&fopt, 0, sizeof(fopt));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (get_rate(&r.rate, *argv)) {
+				explain1("rate");
+				return -1;
+			}
+		} else if (strcmp(*argv, "bandwidth") == 0) {
+			NEXT_ARG();
+			if (get_rate(&bndw, *argv)) {
+				explain1("bandwidth");
+				return -1;
+			}
+		} else if (strcmp(*argv, "minidle") == 0) {
+			NEXT_ARG();
+			if (get_u32(&lss.minidle, *argv, 0)) {
+				explain1("minidle");
+				return -1;
+			}
+			lss.change |= TCF_CBQ_LSS_MINIDLE;
+		} else if (strcmp(*argv, "minburst") == 0) {
+			NEXT_ARG();
+			if (get_u32(&minburst, *argv, 0)) {
+				explain1("minburst");
+				return -1;
+			}
+			lss.change |= TCF_CBQ_LSS_OFFTIME;
+		} else if (strcmp(*argv, "maxburst") == 0) {
+			NEXT_ARG();
+			if (get_u32(&maxburst, *argv, 0)) {
+				explain1("maxburst");
+				return -1;
+			}
+			lss.change |= TCF_CBQ_LSS_MAXIDLE;
+		} else if (strcmp(*argv, "bounded") == 0) {
+			lss.flags |= TCF_CBQ_LSS_BOUNDED;
+			lss.change |= TCF_CBQ_LSS_FLAGS;
+		} else if (strcmp(*argv, "borrow") == 0) {
+			lss.flags &= ~TCF_CBQ_LSS_BOUNDED;
+			lss.change |= TCF_CBQ_LSS_FLAGS;
+		} else if (strcmp(*argv, "isolated") == 0) {
+			lss.flags |= TCF_CBQ_LSS_ISOLATED;
+			lss.change |= TCF_CBQ_LSS_FLAGS;
+		} else if (strcmp(*argv, "sharing") == 0) {
+			lss.flags &= ~TCF_CBQ_LSS_ISOLATED;
+			lss.change |= TCF_CBQ_LSS_FLAGS;
+		} else if (strcmp(*argv, "ewma") == 0) {
+			__u32 val;
+			NEXT_ARG();
+			/* Parse into a __u32 first: a huge value must be
+			 * rejected, not wrap to a negative "not given". */
+			if (get_u32(&val, *argv, 0)) {
+				explain1("ewma");
+				return -1;
+			}
+			if (val > 31) {
+				fprintf(stderr, "ewma_log must be < 32\n");
+				return -1;
+			}
+			ewma_log = val;
+			lss.change |= TCF_CBQ_LSS_EWMA;
+		} else if (strcmp(*argv, "cell") == 0) {
+			unsigned cell;
+			int i;
+			NEXT_ARG();
+			if (get_size(&cell, *argv)) {
+				explain1("cell");
+				return -1;
+			}
+			/* Find log2(cell); 1U keeps the i == 31 shift defined. */
+			for (i=0; i<32; i++)
+				if ((1U<<i) == cell)
+					break;
+			if (i>=32) {
+				fprintf(stderr, "cell must be 2^n\n");
+				return -1;
+			}
+			cell_log = i;
+		} else if (strcmp(*argv, "prio") == 0) {
+			unsigned prio;
+			NEXT_ARG();
+			if (get_u32(&prio, *argv, 0)) {
+				explain1("prio");
+				return -1;
+			}
+			if (prio > TC_CBQ_MAXPRIO) {
+				fprintf(stderr, "\"prio\" must be number in the range 1...%d\n", TC_CBQ_MAXPRIO);
+				return -1;
+			}
+			wrr.priority = prio;
+			wrr_ok++;
+		} else if (strcmp(*argv, "allot") == 0) {
+			NEXT_ARG();
+			if (get_size(&wrr.allot, *argv)) {
+				explain1("allot");
+				return -1;
+			}
+		} else if (strcmp(*argv, "avpkt") == 0) {
+			NEXT_ARG();
+			if (get_size(&lss.avpkt, *argv)) {
+				explain1("avpkt");
+				return -1;
+			}
+			lss.change |= TCF_CBQ_LSS_AVPKT;
+		} else if (strcmp(*argv, "mpu") == 0) {
+			NEXT_ARG();
+			if (get_size(&mpu, *argv)) {
+				explain1("mpu");
+				return -1;
+			}
+		} else if (strcmp(*argv, "weight") == 0) {
+			NEXT_ARG();
+			if (get_size(&wrr.weight, *argv)) {
+				explain1("weight");
+				return -1;
+			}
+			wrr_ok++;
+		} else if (strcmp(*argv, "split") == 0) {
+			NEXT_ARG();
+			if (get_tc_classid(&fopt.split, *argv)) {
+				fprintf(stderr, "Invalid split node ID.\n");
+				usage();
+			}
+			fopt_ok++;
+		} else if (strcmp(*argv, "defmap") == 0) {
+			int err;
+			NEXT_ARG();
+			err = sscanf(*argv, "%08x/%08x", &fopt.defmap, &fopt.defchange);
+			if (err < 1) {
+				fprintf(stderr, "Invalid defmap, should be MASK32[/MASK]\n");
+				return -1;
+			}
+			/* No change mask given: change every bit. */
+			if (err == 1)
+				fopt.defchange = ~0;
+			fopt_ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain_class();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain_class();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	/* OK. All options are parsed. */
+
+	/* 1. Prepare link sharing scheduler parameters */
+	if (r.rate) {
+		/* NOTE(review): pktsize is captured before allot is raised
+		 * to the 1.5 * avpkt floor, so the rate table is built from
+		 * the user-supplied allot (possibly 0) - confirm intended. */
+		unsigned pktsize = wrr.allot;
+		if (wrr.allot < (lss.avpkt*3)/2)
+			wrr.allot = (lss.avpkt*3)/2;
+		if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, pktsize, mpu)) < 0) {
+			fprintf(stderr, "CBQ: failed to calculate rate table.\n");
+			return -1;
+		}
+		r.cell_log = cell_log;
+		r.mpu = mpu;
+	}
+	if (ewma_log < 0)
+		ewma_log = TC_CBQ_DEF_EWMA;
+	lss.ewma_log = ewma_log;
+	if (lss.change&(TCF_CBQ_LSS_OFFTIME|TCF_CBQ_LSS_MAXIDLE)) {
+		if (lss.avpkt == 0) {
+			fprintf(stderr, "CBQ: avpkt is required for max/minburst.\n");
+			return -1;
+		}
+		if (bndw==0 || r.rate == 0) {
+			fprintf(stderr, "CBQ: bandwidth&rate are required for max/minburst.\n");
+			return -1;
+		}
+	}
+	/* No priority given while NLM_F_EXCL is set: supply WRR defaults. */
+	if (wrr.priority == 0 && (n->nlmsg_flags&NLM_F_EXCL)) {
+		wrr_ok = 1;
+		wrr.priority = TC_CBQ_MAXPRIO;
+		if (wrr.allot == 0)
+			wrr.allot = (lss.avpkt*3)/2;
+	}
+	if (wrr_ok) {
+		if (wrr.weight == 0)
+			wrr.weight = (wrr.priority == TC_CBQ_MAXPRIO) ? 1 : r.rate;
+		if (wrr.allot == 0) {
+			fprintf(stderr, "CBQ: \"allot\" is required to set WRR parameters.\n");
+			return -1;
+		}
+	}
+	if (lss.change&TCF_CBQ_LSS_MAXIDLE) {
+		lss.maxidle = tc_cbq_calc_maxidle(bndw, r.rate, lss.avpkt, ewma_log, maxburst);
+		lss.change |= TCF_CBQ_LSS_MAXIDLE;
+		lss.change |= TCF_CBQ_LSS_EWMA|TCF_CBQ_LSS_AVPKT;
+	}
+	if (lss.change&TCF_CBQ_LSS_OFFTIME) {
+		lss.offtime = tc_cbq_calc_offtime(bndw, r.rate, lss.avpkt, ewma_log, minburst);
+		lss.change |= TCF_CBQ_LSS_OFFTIME;
+		lss.change |= TCF_CBQ_LSS_EWMA|TCF_CBQ_LSS_AVPKT;
+	}
+	if (lss.change&TCF_CBQ_LSS_MINIDLE) {
+		lss.minidle <<= lss.ewma_log;
+		lss.change |= TCF_CBQ_LSS_EWMA;
+	}
+
+	/* Open TCA_OPTIONS, add the inner attributes, then fix its length. */
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	if (lss.change) {
+		lss.change |= TCF_CBQ_LSS_FLAGS;
+		addattr_l(n, 1024, TCA_CBQ_LSSOPT, &lss, sizeof(lss));
+	}
+	if (wrr_ok)
+		addattr_l(n, 1024, TCA_CBQ_WRROPT, &wrr, sizeof(wrr));
+	if (fopt_ok)
+		addattr_l(n, 1024, TCA_CBQ_FOPT, &fopt, sizeof(fopt));
+	if (r.rate) {
+		addattr_l(n, 1024, TCA_CBQ_RATE, &r, sizeof(r));
+		addattr_l(n, 3024, TCA_CBQ_RTAB, rtab, 1024);
+		if (show_raw) {
+			int i;
+			for (i=0; i<256; i++)
+				printf("%u ", rtab[i]);
+			printf("\n");
+		}
+	}
+	tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail;
+	return 0;
+}
+
+
+/*
+ * Print the cbq attributes nested in `opt` to f.
+ *
+ * Attributes shorter than the structure they should hold are reported
+ * on stderr and skipped. Rate, flags and priority are always printed;
+ * the remaining link-sharing, WRR and split details only when
+ * show_details is set. Always returns 0.
+ */
+static int cbq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_CBQ_MAX+1];
+	struct tc_ratespec *r = NULL;
+	struct tc_cbq_lssopt *lss = NULL;
+	struct tc_cbq_wrropt *wrr = NULL;
+	struct tc_cbq_fopt *fopt = NULL;
+
+	if (opt == NULL)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (tb[TCA_CBQ_RATE]) {
+		if (RTA_PAYLOAD(tb[TCA_CBQ_RATE]) < sizeof(*r))
+			fprintf(stderr, "CBQ: too short rate opt\n");
+		else
+			r = RTA_DATA(tb[TCA_CBQ_RATE]);
+	}
+	if (tb[TCA_CBQ_LSSOPT]) {
+		if (RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT]) < sizeof(*lss))
+			fprintf(stderr, "CBQ: too short lss opt\n");
+		else
+			lss = RTA_DATA(tb[TCA_CBQ_LSSOPT]);
+	}
+	if (tb[TCA_CBQ_WRROPT]) {
+		if (RTA_PAYLOAD(tb[TCA_CBQ_WRROPT]) < sizeof(*wrr))
+			fprintf(stderr, "CBQ: too short wrr opt\n");
+		else
+			wrr = RTA_DATA(tb[TCA_CBQ_WRROPT]);
+	}
+	if (tb[TCA_CBQ_FOPT]) {
+		if (RTA_PAYLOAD(tb[TCA_CBQ_FOPT]) < sizeof(*fopt))
+			fprintf(stderr, "CBQ: too short fopt\n");
+		else
+			fopt = RTA_DATA(tb[TCA_CBQ_FOPT]);
+	}
+	/* The overlimit strategy is only length-checked, never printed.
+	 * Both sizes are cast so that they really match "%u". */
+	if (tb[TCA_CBQ_OVL_STRATEGY] &&
+	    RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY]) < sizeof(struct tc_cbq_ovl))
+		fprintf(stderr, "CBQ: too short overlimit strategy %u/%u\n",
+			(unsigned) RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY]),
+			(unsigned) sizeof(struct tc_cbq_ovl));
+
+	if (r) {
+		char buf[64];
+		print_rate(buf, sizeof(buf), r->rate);
+		fprintf(f, "rate %s ", buf);
+		if (show_details) {
+			fprintf(f, "cell %ub ", 1U<<r->cell_log);
+			if (r->mpu)
+				fprintf(f, "mpu %ub ", r->mpu);
+		}
+	}
+	if (lss && lss->flags) {
+		int comma=0;
+		fprintf(f, "(");
+		if (lss->flags&TCF_CBQ_LSS_BOUNDED) {
+			fprintf(f, "bounded");
+			comma=1;
+		}
+		if (lss->flags&TCF_CBQ_LSS_ISOLATED) {
+			if (comma)
+				fprintf(f, ",");
+			fprintf(f, "isolated");
+		}
+		fprintf(f, ") ");
+	}
+	if (wrr) {
+		if (wrr->priority != TC_CBQ_MAXPRIO)
+			fprintf(f, "prio %u", wrr->priority);
+		else
+			fprintf(f, "prio no-transmit");
+		if (show_details) {
+			char buf[64];
+			fprintf(f, "/%u ", wrr->cpriority);
+			if (wrr->weight != 1) {
+				print_rate(buf, sizeof(buf), wrr->weight);
+				fprintf(f, "weight %s ", buf);
+			}
+			if (wrr->allot)
+				fprintf(f, "allot %ub ", wrr->allot);
+		}
+	}
+	if (lss && show_details) {
+		fprintf(f, "\nlevel %u ewma %u avpkt %ub ", lss->level, lss->ewma_log, lss->avpkt);
+		if (lss->maxidle) {
+			fprintf(f, "maxidle %luus ", tc_core_tick2usec(lss->maxidle>>lss->ewma_log));
+			if (show_raw)
+				fprintf(f, "[%08x] ", lss->maxidle);
+		}
+		/* 0x7fffffff is treated as "no minidle" and not printed. */
+		if (lss->minidle!=0x7fffffff) {
+			fprintf(f, "minidle %luus ", tc_core_tick2usec(lss->minidle>>lss->ewma_log));
+			if (show_raw)
+				fprintf(f, "[%08x] ", lss->minidle);
+		}
+		if (lss->offtime) {
+			fprintf(f, "offtime %luus ", tc_core_tick2usec(lss->offtime));
+			if (show_raw)
+				fprintf(f, "[%08x] ", lss->offtime);
+		}
+	}
+	if (fopt && show_details) {
+		char buf[64];
+		print_tc_classid(buf, sizeof(buf), fopt->split);
+		fprintf(f, "\nsplit %s ", buf);
+		if (fopt->defmap) {
+			fprintf(f, "defmap %08x", fopt->defmap);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Print the cbq extended statistics carried in `xstats` to f.
+ * Returns 0 when xstats is NULL or after printing, and -1 when the
+ * payload is smaller than struct tc_cbq_xstats.
+ */
+static int cbq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+	struct tc_cbq_xstats *st;
+	double avgidle, undertime;
+
+	if (!xstats)
+		return 0;
+	if (RTA_PAYLOAD(xstats) < sizeof(*st))
+		return -1;
+
+	st = RTA_DATA(xstats);
+	/* Both idle figures are printed as floating point. */
+	avgidle = (double)st->avgidle;
+	undertime = (double)st->undertime;
+	fprintf(f, " borrowed %u overactions %u avgidle %g undertime %g",
+		st->borrows, st->overactions, avgidle, undertime);
+	return 0;
+}
+
+/*
+ * Handler table for the "cbq" qdisc.
+ *
+ * The initializer is positional, so the entries must stay in the member
+ * order of struct qdisc_util, which is declared outside this file.
+ * NOTE(review): the comments below infer each slot's role from the
+ * handler names - confirm against the struct declaration.
+ */
+struct qdisc_util cbq_util = {
+ NULL,
+ "cbq", /* name of the qdisc */
+ cbq_parse_opt, /* qdisc-level handlers */
+ cbq_print_opt,
+ cbq_print_xstats,
+
+ cbq_parse_class_opt, /* presumably the class-level handlers */
+ cbq_print_opt, /* the same printer is listed a second time */
+};
+
diff --git a/tc/q_csz.c b/tc/q_csz.c
index e69de29b..e2734cda 100644
--- a/tc/q_csz.c
+++ b/tc/q_csz.c
@@ -0,0 +1,61 @@
+/*
+ * q_csz.c CSZ.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+/*
+ * Write the usage line of the csz qdisc to stderr.
+ * Declared with a (void) prototype like the other explain() helpers;
+ * an empty parameter list would leave the arguments unchecked.
+ */
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... csz \n");
+}
+
+/*
+ * Report on stderr that the value given for option `arg` is not
+ * acceptable. `arg` is the option name, printed inside double quotes.
+ */
+static void explain1(char *arg)
+{
+	fprintf(stderr,
+		"Illegal \"%s\"\n",
+		arg);
+}
+
+
+#define usage() return(-1)
+
+/*
+ * Option parser stub for "csz": ignores its arguments and returns -1
+ * unconditionally.
+ */
+static int csz_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	(void) qu;
+	(void) argc;
+	(void) argv;
+	(void) n;
+	return -1;
+}
+
+/*
+ * Option printer stub for "csz": prints nothing and returns -1
+ * unconditionally.
+ */
+static int csz_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	(void) qu;
+	(void) f;
+	(void) opt;
+	return -1;
+}
+
+/*
+ * Extended-statistics printer stub for "csz": prints nothing and
+ * returns -1 unconditionally.
+ */
+static int csz_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+	(void) qu;
+	(void) f;
+	(void) xstats;
+	return -1;
+}
+
+/*
+ * Handler table for the "csz" qdisc.
+ *
+ * The initializer is positional, so the entries must stay in the member
+ * order of struct qdisc_util, which is declared outside this file.
+ * Members after the last entry listed here are zero-initialised.
+ */
+struct qdisc_util csz_util = {
+ NULL,
+ "csz", /* name of the qdisc */
+ csz_parse_opt, /* all three handlers are stubs returning -1 */
+ csz_print_opt,
+ csz_print_xstats,
+};
+
diff --git a/tc/q_dsmark.c b/tc/q_dsmark.c
index e69de29b..8a1cd4d8 100644
--- a/tc/q_dsmark.c
+++ b/tc/q_dsmark.c
@@ -0,0 +1,186 @@
+/*
+ * q_dsmark.c Differentiated Services field marking.
+ *
+ * Hacked 1998,1999 by Werner Almesberger, EPFL ICA
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+
+#define usage() return(-1)
+
+
+/* Write the usage summary of the dsmark qdisc options to stderr. */
+static void explain(void)
+{
+	fputs("Usage: dsmark indices INDICES [ default_index "
+	      "DEFAULT_INDEX ] [ set_tc_index ]\n", stderr);
+}
+
+
+/*
+ * Parse the options of a dsmark qdisc and append TCA_OPTIONS (enclosing
+ * TCA_DSMARK_INDICES and the optional TCA_DSMARK_DEFAULT_INDEX /
+ * TCA_DSMARK_SET_TC_INDEX) to netlink message n.
+ *
+ * "indices" is mandatory and must be non-zero. Both numeric options are
+ * sent as 16-bit values, so anything that is empty, not fully numeric
+ * or above 0xffff is rejected instead of being silently truncated.
+ *
+ * Returns 0 on success; on any error prints the usage text and
+ * returns -1.
+ */
+static int dsmark_parse_opt(struct qdisc_util *qu, int argc, char **argv,
+    struct nlmsghdr *n)
+{
+	struct rtattr *tail;
+	__u16 ind;
+	char *end;
+	unsigned long val;
+	int dflt,set_tc_index;
+
+	ind = set_tc_index = 0;
+	dflt = -1;	/* -1: no default index given */
+	while (argc > 0) {
+		if (!strcmp(*argv,"indices")) {
+			NEXT_ARG();
+			val = strtoul(*argv,&end,0);
+			if (end == *argv || *end || val > 0xffff) {
+				explain();
+				return -1;
+			}
+			ind = val;
+		}
+		else if (!strcmp(*argv,"default_index") || !strcmp(*argv,
+		    "default")) {
+			NEXT_ARG();
+			val = strtoul(*argv,&end,0);
+			if (end == *argv || *end || val > 0xffff) {
+				explain();
+				return -1;
+			}
+			dflt = val;
+		}
+		else if (!strcmp(*argv,"set_tc_index")) {
+			set_tc_index = 1;
+		}
+		else {
+			explain();
+			return -1;
+		}
+		argc--;
+		argv++;
+	}
+	if (!ind) {
+		explain();
+		return -1;
+	}
+	/* Open TCA_OPTIONS, add the inner attributes, then fix its length. */
+	tail = (struct rtattr *) (((void *) n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n,1024,TCA_OPTIONS,NULL,0);
+	addattr_l(n,1024,TCA_DSMARK_INDICES,&ind,sizeof(ind));
+	if (dflt != -1) {
+		__u16 tmp = dflt;
+
+		addattr_l(n,1024,TCA_DSMARK_DEFAULT_INDEX,&tmp,sizeof(tmp));
+	}
+	if (set_tc_index) addattr_l(n,1024,TCA_DSMARK_SET_TC_INDEX,NULL,0);
+	/* NOTE(review): the end offset is taken from the raw nlmsg_len,
+	   without NLMSG_ALIGN, unlike the start offset above - kept as is. */
+	tail->rta_len = (((void *) n)+n->nlmsg_len)-(void *) tail;
+	return 0;
+}
+
+
+/* Write the usage line of the dsmark class options to stderr. */
+static void explain_class(void)
+{
+	fputs("Usage: ... dsmark [ mask MASK ] [ value VALUE ]\n", stderr);
+}
+
+
+/*
+ * Parse the options of a dsmark class and append TCA_OPTIONS (enclosing
+ * one TCA_DSMARK_MASK / TCA_DSMARK_VALUE per occurrence) to netlink
+ * message n.
+ *
+ * Both options are sent as single bytes, so anything that is empty,
+ * not fully numeric or above 0xff is rejected instead of being silently
+ * truncated.
+ *
+ * Returns 0 on success; on any error prints the usage text and
+ * returns -1.
+ */
+static int dsmark_parse_class_opt(struct qdisc_util *qu, int argc, char **argv,
+    struct nlmsghdr *n)
+{
+	struct rtattr *tail;
+	__u8 tmp;
+	char *end;
+	unsigned long val;
+
+	/* TCA_OPTIONS is opened first; the attributes are added while
+	   parsing and its length is fixed up at the end. */
+	tail = (struct rtattr *) (((void *) n)+NLMSG_ALIGN(n->nlmsg_len));
+	addattr_l(n,1024,TCA_OPTIONS,NULL,0);
+	while (argc > 0) {
+		if (!strcmp(*argv,"mask")) {
+			NEXT_ARG();
+			val = strtoul(*argv,&end,0);
+			if (end == *argv || *end || val > 0xff) {
+				explain_class();
+				return -1;
+			}
+			tmp = val;
+			addattr_l(n,1024,TCA_DSMARK_MASK,&tmp,1);
+		}
+		else if (!strcmp(*argv,"value")) {
+			NEXT_ARG();
+			val = strtoul(*argv,&end,0);
+			if (end == *argv || *end || val > 0xff) {
+				explain_class();
+				return -1;
+			}
+			tmp = val;
+			addattr_l(n,1024,TCA_DSMARK_VALUE,&tmp,1);
+		}
+		else {
+			explain_class();
+			return -1;
+		}
+		argc--;
+		argv++;
+	}
+	/* NOTE(review): the end offset is taken from the raw nlmsg_len,
+	   without NLMSG_ALIGN, unlike the start offset above - kept as is. */
+	tail->rta_len = (((void *) n)+n->nlmsg_len)-(void *) tail;
+	return 0;
+}
+
+
+
+/*
+ * Print the dsmark attributes nested in `opt` (mask, value, indices,
+ * default_index, set_tc_index) to f. Attributes that are too short are
+ * reported on stderr and skipped. Always returns 0.
+ */
+static int dsmark_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_DSMARK_MAX+1];
+	struct rtattr *attr;
+
+	if (opt == NULL)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_DSMARK_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	/* One-byte attributes. */
+	attr = tb[TCA_DSMARK_MASK];
+	if (attr) {
+		if (RTA_PAYLOAD(attr))
+			fprintf(f,"mask 0x%02x ",*(__u8 *) RTA_DATA(attr));
+		else
+			fprintf(stderr,"dsmark: empty mask\n");
+	}
+	attr = tb[TCA_DSMARK_VALUE];
+	if (attr) {
+		if (RTA_PAYLOAD(attr))
+			fprintf(f,"value 0x%02x ",*(__u8 *) RTA_DATA(attr));
+		else
+			fprintf(stderr,"dsmark: empty value\n");
+	}
+
+	/* Two-byte attributes. */
+	attr = tb[TCA_DSMARK_INDICES];
+	if (attr) {
+		if (RTA_PAYLOAD(attr) < sizeof(__u16))
+			fprintf(stderr,"dsmark: indices too short\n");
+		else
+			fprintf(f,"indices 0x%04x ",*(__u16 *) RTA_DATA(attr));
+	}
+	attr = tb[TCA_DSMARK_DEFAULT_INDEX];
+	if (attr) {
+		if (RTA_PAYLOAD(attr) < sizeof(__u16))
+			fprintf(stderr,"dsmark: default_index too short\n");
+		else
+			fprintf(f,"default_index 0x%04x ",
+			    *(__u16 *) RTA_DATA(attr));
+	}
+
+	/* Presence alone is what matters for this flag attribute. */
+	if (tb[TCA_DSMARK_SET_TC_INDEX])
+		fprintf(f,"set_tc_index ");
+	return 0;
+}
+
+
+/*
+ * Extended-statistics printer for "dsmark": prints nothing, ignores all
+ * of its arguments and returns 0.
+ */
+static int dsmark_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+	(void) qu;
+	(void) f;
+	(void) xstats;
+	return 0;
+}
+
+
+/*
+ * Handler table for the "dsmark" qdisc.
+ *
+ * The initializer is positional, so the entries must stay in the member
+ * order of struct qdisc_util, which is declared outside this file.
+ * NOTE(review): the comments below infer each slot's role from the
+ * handler names - confirm against the struct declaration.
+ */
+struct qdisc_util dsmark_util = {
+ NULL,
+ "dsmark", /* name of the qdisc */
+ dsmark_parse_opt, /* qdisc-level handlers */
+ dsmark_print_opt,
+ dsmark_print_xstats,
+
+ dsmark_parse_class_opt, /* presumably the class-level handlers */
+ dsmark_print_opt /* the same printer is listed a second time */
+};
diff --git a/tc/q_fifo.c b/tc/q_fifo.c
index e69de29b..4cb9fded 100644
--- a/tc/q_fifo.c
+++ b/tc/q_fifo.c
@@ -0,0 +1,101 @@
+/*
+ * q_fifo.c FIFO.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+/* Print the [p|b]fifo usage line to stderr. */
+static void explain(void)
+{
+ fprintf(stderr, "Usage: ... [p|b]fifo [ limit NUMBER ]\n");
+}
+/* Expands to a bare "return(-1)". Not invoked by name in this file; NOTE(review): confirm NEXT_ARG() (utils.h) does not expand to it before removing. */
+#define usage() return(-1)
+/* Parse "[ limit NUMBER ]" for pfifo/bfifo; TCA_OPTIONS is appended only when a limit was given. Returns 0 on success, -1 on error or "help". */
+static int fifo_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tc_fifo_qopt opt;
+	int have_limit = 0;
+	memset(&opt, 0, sizeof(opt));
+
+	for (; argc > 0; argc--, argv++) {
+		if (!strcmp(*argv, "help")) {
+			explain();
+			return -1;
+		}
+		if (strcmp(*argv, "limit") != 0) {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		/* "limit": the value is taken from the following word. */
+		NEXT_ARG();
+		if (get_size(&opt.limit, *argv)) {
+			fprintf(stderr, "Illegal \"limit\"\n");
+			return -1;
+		}
+		have_limit++;
+	}
+
+	if (have_limit)
+		addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt));
+	return 0;
+}
+/* Print the FIFO limit: formatted as a size for "bfifo", otherwise as a count with a "p" suffix. Returns -1 if the payload is too short. */
+static int fifo_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct tc_fifo_qopt *fifo;
+
+	if (!opt)
+		return 0;
+	if (RTA_PAYLOAD(opt) < sizeof(*fifo))
+		return -1;
+	fifo = RTA_DATA(opt);
+	if (strcmp(qu->id, "bfifo") != 0)
+		fprintf(f, "limit %up", fifo->limit);
+	else {
+		SPRINT_BUF(b1);
+		fprintf(f, "limit %s", sprint_size(fifo->limit, b1));
+	}
+	return 0;
+}
+/* Extended-statistics hook for the FIFO qdiscs: prints nothing and returns 0. */
+static int fifo_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+ return 0;
+}
+
+/* Handler table for the "bfifo" id; the parse/print handlers are shared with pfifo_util. */
+struct qdisc_util bfifo_util = {
+ NULL,
+ "bfifo",
+ fifo_parse_opt,
+ fifo_print_opt,
+ fifo_print_xstats,
+};
+/* Handler table for the "pfifo" id; the parse/print handlers are shared with bfifo_util. */
+struct qdisc_util pfifo_util = {
+ NULL,
+ "pfifo",
+ fifo_parse_opt,
+ fifo_print_opt,
+ fifo_print_xstats,
+};
diff --git a/tc/q_gred.c b/tc/q_gred.c
index e69de29b..b63f8ae7 100644
--- a/tc/q_gred.c
+++ b/tc/q_gred.c
@@ -0,0 +1,345 @@
+/*
+ * q_gred.c GRED.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: J Hadi Salim(hadi@nortelnetworks.com)
+ * code ruthlessly ripped from
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+#include "tc_red.h"
+
+/* Debug trace macro: expands to nothing as built; switching the #if to 1 makes it an fprintf to stderr. */
+#if 0
+#define DPRINTF(format,args...) fprintf(stderr,format,##args)
+#else
+#define DPRINTF(format,args...)
+#endif
+/* Print both gred usage forms (per-DP parameters and "setup") to stderr. */
+static void explain(void)
+{
+ fprintf(stderr, "Usage: ... gred DP drop-probability limit BYTES "
+ "min BYTES max BYTES\n");
+ fprintf(stderr, " avpkt BYTES burst PACKETS probability PROBABILITY "
+ "bandwidth KBPS\n");
+ fprintf(stderr, " [prio value]\n");
+ fprintf(stderr," OR ...\n");
+ fprintf(stderr," gred setup DPs <num of DPs> default <default DP> "
+ "[grio]\n");
+}
+/* Expands to a bare "return(-1)". Not invoked by name in this file; NOTE(review): confirm NEXT_ARG() (utils.h) does not expand to it before removing. */
+#define usage() return(-1)
+/* Handle "gred setup DPs <n> default <dp> [grio]": append a TCA_GRED_DPS attribute nested in TCA_OPTIONS. Returns 0 on success, -1 on bad input or "help". */
+static int init_gred(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+
+	struct rtattr *tail;
+	struct tc_gred_sopt opt;
+	memset(&opt, 0, sizeof(struct tc_gred_sopt));
+
+	while (argc > 0) {
+		DPRINTF("init_gred: invoked with %s\n",*argv);
+		if (strcmp(*argv, "DPs") == 0) {
+			NEXT_ARG();
+			DPRINTF("init_gred: next_arg with %s\n",*argv);
+			opt.DPs=strtol(*argv, (char **)NULL, 10);
+			if (opt.DPs >MAX_DPs) { /* need a better error check */
+				fprintf(stderr, "DPs =%u \n",opt.DPs);
+				fprintf(stderr, "Illegal \"DPs\"\n");
+				fprintf(stderr, "GRED: only %d DPs are "
+					"currently supported\n",MAX_DPs);
+				return -1;
+			}
+		} else if (strcmp(*argv, "default") == 0) {
+			NEXT_ARG();
+			opt.def_DP=strtol(*argv, (char **)NULL, 10);
+			if (!opt.DPs) {
+				fprintf(stderr, "\"default DP\" must be "
+					"defined after DPs\n");
+				return -1;
+			}
+			/*
+			 * NOTE(review): the bound is inclusive, so def_DP == DPs is accepted although the message says "less than"; confirm which is intended.
+			 */
+			if (opt.def_DP>opt.DPs) {
+				/*
+				 * Only the short form of the message is printed.
+				 */
+				fprintf(stderr, "\"default DP\" must be less than %d\n",opt.DPs);
+				return -1;
+			}
+		} else if (strcmp(*argv, "grio") == 0) {
+			opt.grio=1;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if ((!opt.DPs) || (!opt.def_DP))	/* note: a default DP of 0 is rejected here as well */
+	{
+		fprintf(stderr, "Illegal gred setup parameters \n");
+		return -1;
+	}
+	DPRINTF("TC_GRED: sending DPs=%d default=%d\n",opt.DPs,opt.def_DP);
+	n->nlmsg_flags|=NLM_F_CREATE;
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));	/* start of the TCA_OPTIONS container */
+
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 1024, TCA_GRED_DPS, &opt, sizeof(struct tc_gred_sopt));
+	tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail;	/* grow the container over the nested attribute */
+	return 0;
+}
+/*
+ * Parse the per-DP gred parameters ("setup" is handed to init_gred) and append TCA_GRED_PARMS and TCA_GRED_STAB nested in TCA_OPTIONS. Returns 0 on success or when no option was given, -1 on error.
+ */
+static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	int ok=0;
+	struct tc_gred_qopt opt;
+	unsigned burst = 0;
+	unsigned avpkt = 0;
+	double probability = 0.02;
+	unsigned rate = 0;
+	int wlog;
+	__u8 sbuf[256];
+	struct rtattr *tail;
+
+	memset(&opt, 0, sizeof(opt));
+	opt.DP = MAX_DPs;	/* sentinel: stays out of range until "DP" is parsed */
+	while (argc > 0) {
+		if (strcmp(*argv, "limit") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.limit, *argv)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "setup") == 0) {
+			if (ok) {
+				fprintf(stderr, "Illegal \"setup\"\n");
+				return -1;
+			}
+			return init_gred(qu,argc-1, argv+1,n);
+
+		} else if (strcmp(*argv, "min") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.qth_min, *argv)) {
+				fprintf(stderr, "Illegal \"min\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "max") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.qth_max, *argv)) {
+				fprintf(stderr, "Illegal \"max\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "DP") == 0) {
+			NEXT_ARG();
+			opt.DP=strtol(*argv, (char **)NULL, 10);
+			DPRINTF ("\n ******* DP =%u\n",opt.DP);
+			if (opt.DP >= MAX_DPs) { /* need a better error check */
+				fprintf(stderr, "DP =%u \n",opt.DP);
+				fprintf(stderr, "Illegal \"DP\"\n");
+				fprintf(stderr, "GRED: only %d DPs are currently supported\n",MAX_DPs);
+				return -1;
+			}
+			/*
+			 * Valid DP numbers are 0..MAX_DPs-1, the same range
+			 * gred_print_opt() treats as populated entries.
+			 */
+			ok++;
+		} else if (strcmp(*argv, "burst") == 0) {
+			NEXT_ARG();
+			if (get_unsigned(&burst, *argv, 0)) {
+				fprintf(stderr, "Illegal \"burst\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "avpkt") == 0) {
+			NEXT_ARG();
+			if (get_size(&avpkt, *argv)) {
+				fprintf(stderr, "Illegal \"avpkt\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "probability") == 0) {
+			NEXT_ARG();
+			if (sscanf(*argv, "%lg", &probability) != 1) {
+				fprintf(stderr, "Illegal \"probability\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "prio") == 0) {
+			NEXT_ARG();
+			opt.prio=strtol(*argv, (char **)NULL, 10);
+			/* some error check here */
+			ok++;
+		} else if (strcmp(*argv, "bandwidth") == 0) {
+			NEXT_ARG();
+			if (get_rate(&rate, *argv)) {
+				fprintf(stderr, "Illegal \"bandwidth\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (!ok)
+		return 0;
+
+	if (rate == 0)
+		get_rate(&rate, "10Mbit");
+
+	if (!opt.qth_min || !opt.qth_max || !burst || !opt.limit || !avpkt ||
+	    opt.DP >= MAX_DPs) {	/* still the sentinel: "DP" was never given */
+		fprintf(stderr, "Required parameter (min, max, burst, limit, "
+			"avpkt, DP) is missing\n");
+		return -1;
+	}
+
+	if ((wlog = tc_red_eval_ewma(opt.qth_min, burst, avpkt)) < 0) {
+		fprintf(stderr, "GRED: failed to calculate EWMA constant.\n");
+		return -1;
+	}
+	if (wlog >= 10)
+		fprintf(stderr, "GRED: WARNING. Burst %u seems to be to "
+			"large.\n", burst);
+	opt.Wlog = wlog;
+	if ((wlog = tc_red_eval_P(opt.qth_min, opt.qth_max, probability)) < 0) {
+		fprintf(stderr, "GRED: failed to calculate probability.\n");
+		return -1;
+	}
+	opt.Plog = wlog;
+	if ((wlog = tc_red_eval_idle_damping(opt.Wlog, avpkt, rate, sbuf)) < 0)
+	{
+		fprintf(stderr, "GRED: failed to calculate idle damping "
+			"table.\n");
+		return -1;
+	}
+	opt.Scell_log = wlog;
+
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));	/* start of the TCA_OPTIONS container */
+
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 1024, TCA_GRED_PARMS, &opt, sizeof(opt));
+	addattr_l(n, 1024, TCA_GRED_STAB, sbuf, 256);
+	tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail;	/* grow the container over the nested attributes */
+	return 0;
+}
+/* Print every populated gred DP entry found in TCA_GRED_PARMS. Returns 0 on success (or when opt is NULL), -1 if the attribute is missing or shorter than MAX_DPs entries. */
+static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+ struct rtattr *tb[TCA_GRED_STAB+1];
+ struct tc_gred_qopt *qopt;
+ int i;
+ SPRINT_BUF(b1);
+ SPRINT_BUF(b2);
+ SPRINT_BUF(b3);
+ SPRINT_BUF(b4);
+ SPRINT_BUF(b5);
+
+ if (opt == NULL)
+ return 0;
+
+ memset(tb, 0, sizeof(tb));
+ parse_rtattr(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+ if (tb[TCA_GRED_PARMS] == NULL)
+ return -1;
+#if 0
+ sopt = RTA_DATA(tb[TCA_GRED_DPS]);
+ if (RTA_PAYLOAD(tb[TCA_GRED_DPS]) < sizeof(*sopt)) {
+ printf("\n GRED DPs message smaller than expected\n");
+ return -1;
+ }
+
+ DPRINTF(f, "\n\tDPs:%d Default DP %d\n ",
+ sopt->DPs, sopt->def_DP);
+#endif
+ qopt = RTA_DATA(tb[TCA_GRED_PARMS]);
+ if (RTA_PAYLOAD(tb[TCA_GRED_PARMS]) < sizeof(*qopt)*MAX_DPs) { /* the payload must hold a full array of MAX_DPs entries */
+ fprintf(f,"\n GRED received message smaller than expected\n");
+ return -1;
+ }
+
+/* All MAX_DPs slots are walked; the disabled variant below would have stopped at the configured DP count. */
+#if 0
+
+ for (i=0;i<sopt->DPs;i++)
+#endif
+/* Bad hack! should really return a proper message as shown above*/
+
+ for (i=0;i<MAX_DPs;i++, qopt++) {
+ if (qopt->DP >= MAX_DPs) continue; /* entries with an out-of-range DP are skipped */
+ fprintf(f, "\n DP:%d (prio %d) Average Queue %s Measured "
+ "Queue %s ",
+ qopt->DP,
+ qopt->prio,
+ sprint_size(qopt->qave, b4),
+ sprint_size(qopt->backlog, b5));
+ fprintf(f, "\n\t Packet drops: %d (forced %d early %d) ",
+ qopt->forced+qopt->early,
+ qopt->forced,
+ qopt->early);
+ fprintf(f, "\n\t Packet totals: %u (bytes %u) ",
+ qopt->packets,
+ qopt->bytesin);
+ if (show_details) /* NOTE(review): no braces, so only the limit/min/max line is conditional; the ewma line below always prints - confirm intended */
+ fprintf(f, "\n limit %s min %s max %s ",
+ sprint_size(qopt->limit, b1),
+ sprint_size(qopt->qth_min, b2),
+ sprint_size(qopt->qth_max, b3));
+ fprintf(f, "ewma %u Plog %u Scell_log %u",
+ qopt->Wlog, qopt->Plog, qopt->Scell_log);
+ }
+ return 0;
+}
+/* Extended-statistics hook for gred: prints nothing and returns 0. */
+static int gred_print_xstats(struct qdisc_util *qu, FILE *f,
+ struct rtattr *xstats)
+{
+ return 0;
+}
+
+/* Handler table for the "gred" qdisc id, filled in positionally. */
+struct qdisc_util gred_util = {
+ NULL,
+ "gred",
+ gred_parse_opt,
+ gred_print_opt,
+ gred_print_xstats,
+};
diff --git a/tc/q_hfsc.c b/tc/q_hfsc.c
index e69de29b..b9b7b751 100644
--- a/tc/q_hfsc.c
+++ b/tc/q_hfsc.c
@@ -0,0 +1,61 @@
+/*
+ * q_hfsc.c HFSC.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+/* Print the placeholder hfsc usage line to stderr. Declared with a (void) prototype like the other q_*.c files. */
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... hfsc \n");
+}
+/* Print 'Illegal "<arg>"' to stderr. Not called anywhere in this file. */
+static void explain1(char *arg)
+{
+ fprintf(stderr, "Illegal \"%s\"\n", arg);
+}
+
+/* Expands to a bare "return(-1)". Not invoked anywhere in this file. */
+#define usage() return(-1)
+/* Stub: no hfsc option is parsed; every call fails with -1. */
+static int hfsc_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+ return -1;
+}
+/* Stub: prints nothing and always returns -1. */
+static int hfsc_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+ return -1;
+}
+/* Stub: prints nothing and always returns -1. */
+static int hfsc_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+ return -1;
+}
+/* Handler table for the "hfsc" id; all three handlers are stubs that return -1. */
+struct qdisc_util hfsc_util = {
+ NULL,
+ "hfsc",
+ hfsc_parse_opt,
+ hfsc_print_opt,
+ hfsc_print_xstats,
+};
+
diff --git a/tc/q_hpfq.c b/tc/q_hpfq.c
index e69de29b..c2963669 100644
--- a/tc/q_hpfq.c
+++ b/tc/q_hpfq.c
@@ -0,0 +1,61 @@
+/*
+ * q_hpfq.c HPFQ.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+/* Print the placeholder hpfq usage line to stderr. Declared with a (void) prototype like the other q_*.c files. */
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... hpfq \n");
+}
+/* Print 'Illegal "<arg>"' to stderr. Not called anywhere in this file. */
+static void explain1(char *arg)
+{
+ fprintf(stderr, "Illegal \"%s\"\n", arg);
+}
+
+/* Expands to a bare "return(-1)". Not invoked anywhere in this file. */
+#define usage() return(-1)
+/* Stub: no hpfq option is parsed; every call fails with -1. */
+static int hpfq_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+ return -1;
+}
+/* Stub: prints nothing and always returns -1. */
+static int hpfq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+ return -1;
+}
+/* Stub: prints nothing and always returns -1. */
+static int hpfq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+ return -1;
+}
+/* Handler table for the "hpfq" id; all three handlers are stubs that return -1. */
+struct qdisc_util hpfq_util = {
+ NULL,
+ "hpfq",
+ hpfq_parse_opt,
+ hpfq_print_opt,
+ hpfq_print_xstats,
+};
+
diff --git a/tc/q_ingress.c b/tc/q_ingress.c
index e69de29b..0a089062 100644
--- a/tc/q_ingress.c
+++ b/tc/q_ingress.c
@@ -0,0 +1,76 @@
+/*
+ *
+ * q_ingress.c INGRESS.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: J Hadi Salim
+ *
+ * This is here just in case it is needed
+ * useless right now; might be useful in the future
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+/* Print the ingress usage line to stderr. */
+static void explain(void)
+{
+ fprintf(stderr, "Usage: ... ingress \n");
+}
+/* Expands to a bare "return(-1)". Not invoked by name in this file; NOTE(review): confirm NEXT_ARG() (utils.h) does not expand to it before removing. */
+#define usage() return(-1)
+/* Parse ingress options and append an empty TCA_OPTIONS attribute. Returns 0 on success, -1 on an unknown word. */
+static int ingress_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+
+	/* "handle" is the only accepted keyword; nothing from it is stored in the message. */
+	while (argc > 0) {
+		if (strcmp(*argv, "handle") != 0) {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+
+		NEXT_ARG();
+		argc--;
+		argv++;
+	}
+
+	/* The options attribute is always added, with no payload. */
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	return 0;
+}
+/* Print a fixed dashed placeholder; opt is not inspected. Always returns 0. */
+static int ingress_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+
+ fprintf(f, "---------------- ");
+ return 0;
+}
+/* Extended-statistics hook for ingress: prints nothing and returns 0. */
+static int ingress_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+ return 0;
+}
+/* Handler table for the "ingress" qdisc id, filled in positionally. */
+struct qdisc_util ingress_util = {
+ NULL,
+ "ingress",
+ ingress_parse_opt,
+ ingress_print_opt,
+ ingress_print_xstats,
+};
diff --git a/tc/q_prio.c b/tc/q_prio.c
index e69de29b..ddda601b 100644
--- a/tc/q_prio.c
+++ b/tc/q_prio.c
@@ -0,0 +1,127 @@
+/*
+ * q_prio.c PRIO.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *
+ * Ole Husgaard <sparre@login.dknet.dk>: 990513: prio2band map was always reset.
+ * J Hadi Salim <hadi@cyberus.ca>: 990609: priomap fix.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+/* Print the prio usage line to stderr. */
+static void explain(void)
+{
+ fprintf(stderr, "Usage: ... prio bands NUMBER priomap P1 P2...\n");
+}
+/* Expands to a bare "return(-1)". Not invoked by name in this file; NOTE(review): confirm NEXT_ARG() (utils.h) does not expand to it before removing. */
+#define usage() return(-1)
+/* Parse "bands NUMBER" and "priomap P1 P2..." and append the resulting tc_prio_qopt as TCA_OPTIONS (always, even with no arguments). Returns 0 on success, -1 on error or "help". */
+static int prio_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	/* Defaults: 3 bands and the 16-entry priomap below. */
+	int pmap_mode = 0;
+	int idx = 0;
+	struct tc_prio_qopt opt={3,{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }};
+
+	while (argc > 0) {
+		if (strcmp(*argv, "bands") == 0) {
+			if (pmap_mode)	/* NOTE(review): usage is printed but parsing continues */
+				explain();
+			NEXT_ARG();
+			if (get_integer(&opt.bands, *argv, 10)) {
+				fprintf(stderr, "Illegal \"bands\"\n");
+				return -1;
+			}
+			/* priomap entries parsed after this point are checked against the new value */
+		} else if (strcmp(*argv, "priomap") == 0) {
+			if (pmap_mode) {
+				fprintf(stderr, "Error: duplicate priomap\n");
+				return -1;
+			}
+			pmap_mode = 1;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			unsigned band;
+			if (!pmap_mode) {
+				fprintf(stderr, "What is \"%s\"?\n", *argv);
+				explain();
+				return -1;
+			}
+			if (get_unsigned(&band, *argv, 10)) {
+				fprintf(stderr, "Illegal \"priomap\" element\n");
+				return -1;
+			}
+			if (band >= opt.bands) {	/* bands are numbered 0..bands-1 */
+				fprintf(stderr, "\"priomap\" element is out of bands\n");
+				return -1;
+			}
+			if (idx > TC_PRIO_MAX) {
+				fprintf(stderr, "\"priomap\" index > TC_PRIO_MAX=%u\n", TC_PRIO_MAX);
+				return -1;
+			}
+			opt.priomap[idx++] = band;
+		}
+		argc--; argv++;
+	}
+
+	/*
+	 * priomap slots that were not given on the command line keep
+	 * the default values from the initializer above; they are not
+	 * overwritten here.
+	 */
+
+	addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt));
+	return 0;
+}
+/* Print the band count followed by all TC_PRIO_MAX+1 priomap entries. Returns -1 if the payload is too short. */
+static int prio_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct tc_prio_qopt *prio;
+	int slot = 0;
+
+	if (!opt)
+		return 0;
+
+	if (RTA_PAYLOAD(opt) < sizeof(*prio))
+		return -1;
+	prio = RTA_DATA(opt);
+	fprintf(f, "bands %u priomap ", prio->bands);
+	for (; slot < TC_PRIO_MAX + 1; slot++)
+		fprintf(f, " %d", prio->priomap[slot]);
+	return 0;
+}
+/* Extended-statistics hook for prio: prints nothing and returns 0. */
+static int prio_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+ return 0;
+}
+
+/* Handler table for the "prio" qdisc id, filled in positionally. */
+struct qdisc_util prio_util = {
+ NULL,
+ "prio",
+ prio_parse_opt,
+ prio_print_opt,
+ prio_print_xstats,
+};
+
diff --git a/tc/q_red.c b/tc/q_red.c
index e69de29b..c156d47c 100644
--- a/tc/q_red.c
+++ b/tc/q_red.c
@@ -0,0 +1,222 @@
+/*
+ * q_red.c RED.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+#include "tc_red.h"
+/* Print the two-line red usage text to stderr. */
+static void explain(void)
+{
+ fprintf(stderr, "Usage: ... red limit BYTES min BYTES max BYTES avpkt BYTES burst PACKETS\n");
+ fprintf(stderr, " probability PROBABILITY bandwidth KBPS [ ecn ]\n");
+}
+/* Expands to a bare "return(-1)". Not invoked by name in this file; NOTE(review): confirm NEXT_ARG() (utils.h) does not expand to it before removing. */
+#define usage() return(-1)
+/* Parse the red parameters and append TCA_RED_PARMS and TCA_RED_STAB nested in TCA_OPTIONS. Returns 0 on success or when no option was given, -1 on error or "help". */
+static int red_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	int ok=0;
+	struct tc_red_qopt opt;
+	unsigned burst = 0;
+	unsigned avpkt = 0;
+	double probability = 0.02;
+	unsigned rate = 0;
+	int ecn_ok = 0;
+	int wlog;
+	__u8 sbuf[256];
+	struct rtattr *tail;
+
+	memset(&opt, 0, sizeof(opt));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "limit") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.limit, *argv)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "min") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.qth_min, *argv)) {
+				fprintf(stderr, "Illegal \"min\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "max") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.qth_max, *argv)) {
+				fprintf(stderr, "Illegal \"max\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "burst") == 0) {
+			NEXT_ARG();
+			if (get_unsigned(&burst, *argv, 0)) {
+				fprintf(stderr, "Illegal \"burst\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "avpkt") == 0) {
+			NEXT_ARG();
+			if (get_size(&avpkt, *argv)) {
+				fprintf(stderr, "Illegal \"avpkt\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "probability") == 0) {
+			NEXT_ARG();
+			if (sscanf(*argv, "%lg", &probability) != 1) {
+				fprintf(stderr, "Illegal \"probability\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "bandwidth") == 0) {
+			NEXT_ARG();
+			if (get_rate(&rate, *argv)) {
+				fprintf(stderr, "Illegal \"bandwidth\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "ecn") == 0) {
+			ecn_ok = 1;
+			ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (!ok)	/* nothing given: the message is left untouched */
+		return 0;
+
+	if (rate == 0)	/* no "bandwidth": fall back to 10Mbit */
+		get_rate(&rate, "10Mbit");
+
+	if (!opt.qth_min || !opt.qth_max || !burst || !opt.limit || !avpkt) {
+		fprintf(stderr, "Required parameter (min, max, burst, limit, avpkt) is missing\n");
+		return -1;
+	}
+
+	if ((wlog = tc_red_eval_ewma(opt.qth_min, burst, avpkt)) < 0) {
+		fprintf(stderr, "RED: failed to calculate EWMA constant.\n");
+		return -1;
+	}
+	if (wlog >= 10)
+		fprintf(stderr, "RED: WARNING. Burst %u seems to be to large.\n", burst);
+	opt.Wlog = wlog;
+	if ((wlog = tc_red_eval_P(opt.qth_min, opt.qth_max, probability)) < 0) {
+		fprintf(stderr, "RED: failed to calculate probability.\n");
+		return -1;
+	}
+	opt.Plog = wlog;
+	if ((wlog = tc_red_eval_idle_damping(opt.Wlog, avpkt, rate, sbuf)) < 0) {
+		fprintf(stderr, "RED: failed to calculate idle damping table.\n");
+		return -1;
+	}
+	opt.Scell_log = wlog;
+	if (ecn_ok) {
+#ifdef TC_RED_ECN
+		opt.flags |= TC_RED_ECN;
+#else
+		fprintf(stderr, "RED: ECN support is missing in this binary.\n");
+		return -1;
+#endif
+	}
+
+	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));	/* start of the TCA_OPTIONS container */
+
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 1024, TCA_RED_PARMS, &opt, sizeof(opt));
+	addattr_l(n, 1024, TCA_RED_STAB, sbuf, 256);
+	tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail;	/* grow the container over the nested attributes */
+	return 0;
+}
+/* Print limit/min/max (plus "ecn" and, with show_details, the ewma/Plog/Scell_log values) from TCA_RED_PARMS. Returns -1 if the attribute is missing or too short. */
+static int red_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct tc_red_qopt *parms;
+	struct rtattr *tb[TCA_RED_STAB+1];
+	SPRINT_BUF(b1);
+	SPRINT_BUF(b2);
+	SPRINT_BUF(b3);
+
+	if (!opt)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (!tb[TCA_RED_PARMS])
+		return -1;
+	if (RTA_PAYLOAD(tb[TCA_RED_PARMS]) < sizeof(*parms))
+		return -1;
+	parms = RTA_DATA(tb[TCA_RED_PARMS]);
+	fprintf(f, "limit %s min %s max %s ",
+		sprint_size(parms->limit, b1),
+		sprint_size(parms->qth_min, b2),
+		sprint_size(parms->qth_max, b3));
+#ifdef TC_RED_ECN
+	if (parms->flags & TC_RED_ECN)
+		fprintf(f, "ecn ");
+#endif
+	if (!show_details)
+		return 0;
+	fprintf(f, "ewma %u Plog %u Scell_log %u",
+		parms->Wlog, parms->Plog, parms->Scell_log);
+	return 0;
+}
+/* Print the marked/early/pdrop/other counters when built with TC_RED_ECN; otherwise a no-op. Returns -1 only on a short payload. */
+static int red_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+#ifdef TC_RED_ECN
+	struct tc_red_xstats *stats;
+
+	if (!xstats)
+		return 0;
+
+	if (RTA_PAYLOAD(xstats) < sizeof(*stats))
+		return -1;
+
+	stats = RTA_DATA(xstats);
+	fprintf(f, " marked %u early %u pdrop %u other %u",
+		stats->marked, stats->early, stats->pdrop, stats->other);
+	/* continue to the common "return 0" below */
+
+#endif
+	return 0;
+}
+
+/* Handler table for the "red" qdisc id, filled in positionally. */
+struct qdisc_util red_util = {
+ NULL,
+ "red",
+ red_parse_opt,
+ red_print_opt,
+ red_print_xstats,
+};
diff --git a/tc/q_sfq.c b/tc/q_sfq.c
index e69de29b..d7a3c0fa 100644
--- a/tc/q_sfq.c
+++ b/tc/q_sfq.c
@@ -0,0 +1,115 @@
+/*
+ * q_sfq.c SFQ.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+/* Print the sfq usage line to stderr. */
+static void explain(void)
+{
+ fprintf(stderr, "Usage: ... sfq [ limit NUMBER ] [ perturb SECS ] [ quantum BYTES ]\n");
+}
+/* Expands to a bare "return(-1)". Not invoked by name in this file; NOTE(review): confirm NEXT_ARG() (utils.h) does not expand to it before removing. */
+#define usage() return(-1)
+/* Parse "quantum", "perturb" and "limit" for sfq; TCA_OPTIONS is appended only when at least one of them was given. Returns 0 on success, -1 on error or "help". */
+static int sfq_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	int ok=0;
+	struct tc_sfq_qopt opt;
+
+	memset(&opt, 0, sizeof(opt));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "quantum") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.quantum, *argv)) {
+				fprintf(stderr, "Illegal \"quantum\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "perturb") == 0) {
+			NEXT_ARG();
+			if (get_integer(&opt.perturb_period, *argv, 0)) {
+				fprintf(stderr, "Illegal \"perturb\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "limit") == 0) {
+			NEXT_ARG();
+			if (get_u32(&opt.limit, *argv, 0)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			if (opt.limit < 2) {
+				fprintf(stderr, "Illegal \"limit\", must be > 1\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (ok)
+		addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt));
+	return 0;
+}
+/* Print the sfq limit and quantum, the flows/divisor pair with show_details, and a non-zero perturb period. Returns -1 on a short payload. */
+static int sfq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	SPRINT_BUF(b1);
+	struct tc_sfq_qopt *sfq;
+
+	if (!opt)
+		return 0;
+	if (RTA_PAYLOAD(opt) < sizeof(*sfq))
+		return -1;
+
+	sfq = RTA_DATA(opt);
+	fprintf(f, "limit %up ", sfq->limit);
+	fprintf(f, "quantum %s ", sprint_size(sfq->quantum, b1));
+	/* flows/divisor are detail-only output */
+	if (show_details)
+		fprintf(f, "flows %u/%u ", sfq->flows, sfq->divisor);
+	if (sfq->perturb_period != 0)
+		fprintf(f, "perturb %dsec ", sfq->perturb_period);
+	return 0;
+}
+/* Extended-statistics hook for sfq: prints nothing and returns 0. */
+static int sfq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+ return 0;
+}
+
+/* Handler table for the "sfq" qdisc id, filled in positionally. */
+struct qdisc_util sfq_util = {
+ NULL,
+ "sfq",
+ sfq_parse_opt,
+ sfq_print_opt,
+ sfq_print_xstats,
+};
diff --git a/tc/q_tbf.c b/tc/q_tbf.c
index e69de29b..01d514fb 100644
--- a/tc/q_tbf.c
+++ b/tc/q_tbf.c
@@ -0,0 +1,272 @@
+/*
+ * q_tbf.c TBF.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+/* Print the two-line tbf usage text to stderr. */
+static void explain(void)
+{
+ fprintf(stderr, "Usage: ... tbf limit BYTES burst BYTES[/BYTES] rate KBPS [ mtu BYTES[/BYTES] ]\n");
+ fprintf(stderr, " [ peakrate KBPS ] [ latency TIME ]\n");
+}
+/* Print 'Illegal "<arg>"' to stderr; used by tbf_parse_opt for values that fail to parse. */
+static void explain1(char *arg)
+{
+ fprintf(stderr, "Illegal \"%s\"\n", arg);
+}
+
+/* Expands to a bare "return(-1)". Not invoked by name in this file; NOTE(review): confirm NEXT_ARG() (utils.h) does not expand to it before removing. */
+#define usage() return(-1)
+/* Parse the tbf options (limit or latency, burst/buffer/maxburst, mtu/minburst, mpu, rate, peakrate) and append TCA_TBF_PARMS, TCA_TBF_RTAB and, with a peak rate, TCA_TBF_PTAB nested in TCA_OPTIONS. Returns 0 on success or when no option was given, -1 on error or "help". */
+static int tbf_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+ int ok=0;
+ struct tc_tbf_qopt opt;
+ __u32 rtab[256];
+ __u32 ptab[256];
+ unsigned buffer=0, mtu=0, mpu=0, latency=0; /* 0 doubles as "not given yet" for the duplicate checks below */
+ int Rcell_log=-1, Pcell_log = -1;
+ struct rtattr *tail;
+
+ memset(&opt, 0, sizeof(opt));
+
+ while (argc > 0) {
+ if (matches(*argv, "limit") == 0) {
+ NEXT_ARG();
+ if (opt.limit || latency) {
+ fprintf(stderr, "Double \"limit/latency\" spec\n");
+ return -1;
+ }
+ if (get_size(&opt.limit, *argv)) {
+ explain1("limit");
+ return -1;
+ }
+ ok++;
+ } else if (matches(*argv, "latency") == 0) {
+ NEXT_ARG();
+ if (opt.limit || latency) {
+ fprintf(stderr, "Double \"limit/latency\" spec\n");
+ return -1;
+ }
+ if (get_usecs(&latency, *argv)) {
+ explain1("latency");
+ return -1;
+ }
+ ok++;
+ } else if (matches(*argv, "burst") == 0 ||
+ strcmp(*argv, "buffer") == 0 ||
+ strcmp(*argv, "maxburst") == 0) {
+ NEXT_ARG();
+ if (buffer) {
+ fprintf(stderr, "Double \"buffer/burst\" spec\n");
+ return -1;
+ }
+ if (get_size_and_cell(&buffer, &Rcell_log, *argv) < 0) {
+ explain1("buffer");
+ return -1;
+ }
+ ok++;
+ } else if (strcmp(*argv, "mtu") == 0 ||
+ strcmp(*argv, "minburst") == 0) {
+ NEXT_ARG();
+ if (mtu) {
+ fprintf(stderr, "Double \"mtu/minburst\" spec\n");
+ return -1;
+ }
+ if (get_size_and_cell(&mtu, &Pcell_log, *argv) < 0) {
+ explain1("mtu");
+ return -1;
+ }
+ ok++;
+ } else if (strcmp(*argv, "mpu") == 0) {
+ NEXT_ARG();
+ if (mpu) {
+ fprintf(stderr, "Double \"mpu\" spec\n");
+ return -1;
+ }
+ if (get_size(&mpu, *argv)) {
+ explain1("mpu");
+ return -1;
+ }
+ ok++;
+ } else if (strcmp(*argv, "rate") == 0) {
+ NEXT_ARG();
+ if (opt.rate.rate) {
+ fprintf(stderr, "Double \"rate\" spec\n");
+ return -1;
+ }
+ if (get_rate(&opt.rate.rate, *argv)) {
+ explain1("rate");
+ return -1;
+ }
+ ok++;
+ } else if (matches(*argv, "peakrate") == 0) {
+ NEXT_ARG();
+ if (opt.peakrate.rate) {
+ fprintf(stderr, "Double \"peakrate\" spec\n");
+ return -1;
+ }
+ if (get_rate(&opt.peakrate.rate, *argv)) {
+ explain1("peakrate");
+ return -1;
+ }
+ ok++;
+ } else if (strcmp(*argv, "help") == 0) {
+ explain();
+ return -1;
+ } else {
+ fprintf(stderr, "What is \"%s\"?\n", *argv);
+ explain();
+ return -1;
+ }
+ argc--; argv++;
+ }
+
+ if (!ok) /* no option at all: the message is left untouched */
+ return 0;
+
+ if (opt.rate.rate == 0 || !buffer) {
+ fprintf(stderr, "Both \"rate\" and \"burst\" are required.\n");
+ return -1;
+ }
+ if (opt.peakrate.rate) {
+ if (!mtu) {
+ fprintf(stderr, "\"mtu\" is required, if \"peakrate\" is requested.\n");
+ return -1;
+ }
+ }
+
+ if (opt.limit == 0 && latency == 0) {
+ fprintf(stderr, "Either \"limit\" or \"latency\" are required.\n");
+ return -1;
+ }
+
+ if (opt.limit == 0) { /* only latency was given: derive the limit from it */
+ double lim = opt.rate.rate*(double)latency/1000000 + buffer;
+ if (opt.peakrate.rate) {
+ double lim2 = opt.peakrate.rate*(double)latency/1000000 + mtu;
+ if (lim2 < lim) /* the peak-rate bound wins when it is tighter */
+ lim = lim2;
+ }
+ opt.limit = lim;
+ }
+
+ if ((Rcell_log = tc_calc_rtable(opt.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) {
+ fprintf(stderr, "TBF: failed to calculate rate table.\n");
+ return -1;
+ }
+ opt.buffer = tc_calc_xmittime(opt.rate.rate, buffer);
+ opt.rate.cell_log = Rcell_log;
+ opt.rate.mpu = mpu;
+ if (opt.peakrate.rate) {
+ if ((Pcell_log = tc_calc_rtable(opt.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) {
+ fprintf(stderr, "TBF: failed to calculate peak rate table.\n");
+ return -1;
+ }
+ opt.mtu = tc_calc_xmittime(opt.peakrate.rate, mtu);
+ opt.peakrate.cell_log = Pcell_log;
+ opt.peakrate.mpu = mpu;
+ }
+
+ tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); /* start of the TCA_OPTIONS container; its rta_len is fixed up below */
+ addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+ addattr_l(n, 2024, TCA_TBF_PARMS, &opt, sizeof(opt));
+ addattr_l(n, 3024, TCA_TBF_RTAB, rtab, 1024);
+ if (opt.peakrate.rate)
+ addattr_l(n, 4096, TCA_TBF_PTAB, ptab, 1024);
+ tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail;
+ return 0;
+}
+/* Print rate, burst, optional peakrate/mtu, the raw limit (with show_raw) and the latency derived from them. Returns -1 if TCA_TBF_PARMS is missing or too short. */
+static int tbf_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_TBF_PTAB+1];
+	struct tc_tbf_qopt *qopt;
+	double buffer, mtu;
+	double latency;
+	SPRINT_BUF(b1);
+	SPRINT_BUF(b2);
+
+	if (opt == NULL)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
+
+	if (tb[TCA_TBF_PARMS] == NULL)
+		return -1;
+
+	qopt = RTA_DATA(tb[TCA_TBF_PARMS]);
+	if (RTA_PAYLOAD(tb[TCA_TBF_PARMS]) < sizeof(*qopt))
+		return -1;
+	fprintf(f, "rate %s ", sprint_rate(qopt->rate.rate, b1));
+	buffer = ((double)qopt->rate.rate*tc_core_tick2usec(qopt->buffer))/1000000;
+	if (show_details) {
+		fprintf(f, "burst %s/%u mpu %s ", sprint_size(buffer, b1),
+			1<<qopt->rate.cell_log, sprint_size(qopt->rate.mpu, b2));
+	} else {
+		fprintf(f, "burst %s ", sprint_size(buffer, b1));
+	}
+	if (show_raw)
+		fprintf(f, "[%08x] ", qopt->buffer);
+	if (qopt->peakrate.rate) {
+		fprintf(f, "peakrate %s ", sprint_rate(qopt->peakrate.rate, b1));
+		if (qopt->mtu || qopt->peakrate.mpu) {
+			mtu = ((double)qopt->peakrate.rate*tc_core_tick2usec(qopt->mtu))/1000000;
+			if (show_details) {
+				fprintf(f, "mtu %s/%u mpu %s ", sprint_size(mtu, b1),
+					1<<qopt->peakrate.cell_log, sprint_size(qopt->peakrate.mpu, b2));
+			} else {
+				fprintf(f, "minburst %s ", sprint_size(mtu, b1));
+			}
+			if (show_raw)
+				fprintf(f, "[%08x] ", qopt->mtu);
+		}
+	}
+
+	if (show_raw)
+		fprintf(f, "limit %s ", sprint_size(qopt->limit, b1));
+
+	latency = 1000000*(qopt->limit/(double)qopt->rate.rate) - tc_core_tick2usec(qopt->buffer);
+	if (qopt->peakrate.rate) {
+		double lat2 = 1000000*(qopt->limit/(double)qopt->peakrate.rate) - tc_core_tick2usec(qopt->mtu);
+		if (lat2 > latency)
+			latency = lat2;
+	}
+	fprintf(f, "lat %s ", sprint_usecs((long)latency, b1));	/* latency is already in usecs: no second tick conversion */
+
+	return 0;
+}
+/* Extended-statistics hook for tbf: prints nothing and returns 0. */
+static int tbf_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats)
+{
+ return 0;
+}
+/* Handler table for the "tbf" qdisc id, filled in positionally. */
+struct qdisc_util tbf_util = {
+ NULL,
+ "tbf",
+ tbf_parse_opt,
+ tbf_print_opt,
+ tbf_print_xstats,
+};
+
diff --git a/tc/tc.c b/tc/tc.c
index e69de29b..35b3a95c 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -0,0 +1,306 @@
+/*
+ * tc.c "tc" utility frontend.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *
+ * Petri Mattila <petri@prihateam.fi> 990308: wrong memset's resulted in faults
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <dlfcn.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <errno.h>
+
+#include "SNAPSHOT.h"
+#include "utils.h"
+#include "tc_util.h"
+#include "tc_common.h"
+
+/* Counters bumped by main() for each -stats, -details and -raw option. */
+int show_stats = 0;
+int show_details = 0;
+int show_raw = 0;
+/* Never assigned in tc.c. */
+int resolve_hosts = 0;
+
+/* Cached handle from dlopen(NULL, ...), shared by get_qdisc_kind() and get_filter_kind(). */
+void *BODY;
+/* Heads of the singly linked lists of qdisc / filter kinds that were already looked up. */
+static struct qdisc_util * qdisc_list;
+static struct filter_util * filter_list;
+
+/*
+ * print_qopt handler installed by get_qdisc_kind() for kinds that have no
+ * module: reports the size of a non-empty option payload.  Always returns 0.
+ */
+static int print_noqopt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	if (opt == NULL || RTA_PAYLOAD(opt) == 0)
+		return 0;
+
+	fprintf(f, "[Unknown qdisc, optlen=%u] ", RTA_PAYLOAD(opt));
+	return 0;
+}
+
+/*
+ * parse_qopt handler installed by get_qdisc_kind() for kinds that have no
+ * module: any remaining argument is an error (-1), no arguments is fine (0).
+ */
+static int parse_noqopt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	if (argc == 0)
+		return 0;
+
+	fprintf(stderr, "Unknown qdisc \"%s\", hence option \"%s\" is unparsable\n", qu->id, *argv);
+	return -1;
+}
+
+/*
+ * print_fopt handler installed by get_filter_kind() for kinds that have no
+ * module: prints the filter handle and, when options are present, the size
+ * of the option payload.  Always returns 0.
+ */
+static int print_nofopt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 fhandle)
+{
+	if (opt != NULL && RTA_PAYLOAD(opt) != 0) {
+		fprintf(f, "fh %08x [Unknown filter, optlen=%u] ", fhandle, RTA_PAYLOAD(opt));
+		return 0;
+	}
+
+	if (fhandle != 0)
+		fprintf(f, "fh %08x ", fhandle);
+	return 0;
+}
+
+/*
+ * parse_fopt handler installed by get_filter_kind() for kinds that have no
+ * module.  Options cannot be parsed, so any argument is an error; an
+ * optional handle string is parsed with get_u32() in base 16 and stored in
+ * the tcmsg of the request.  Returns 0 on success, -1 on error.
+ */
+static int parse_nofopt(struct filter_util *qu, char *fhandle, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tcmsg *t;
+	__u32 handle;
+
+	if (argc != 0) {
+		fprintf(stderr, "Unknown filter \"%s\", hence option \"%s\" is unparsable\n", qu->id, *argv);
+		return -1;
+	}
+
+	if (fhandle == NULL)
+		return 0;
+
+	if (get_u32(&handle, fhandle, 16) != 0) {
+		fprintf(stderr, "Unparsable filter ID \"%s\"\n", fhandle);
+		return -1;
+	}
+
+	t = NLMSG_DATA(n);
+	t->tcm_handle = handle;
+	return 0;
+}
+
+/* Compiled out: nothing below is built while the condition stays 0. */
+#if 0
+/* Builtin filter types */
+
+/* Rejects both option arguments and a handle; returns -1 for either, 0 otherwise. */
+static int f_parse_noopt(struct filter_util *qu, char *fhandle, int argc, char **argv, struct nlmsghdr *n)
+{
+ if (argc || fhandle) {
+ fprintf(stderr, "Filter \"%s\" has no options.\n", qu->id);
+ return -1;
+ }
+ return 0;
+}
+#endif
+
+/*
+ * Look up the descriptor for qdisc kind "str".
+ *
+ * Search order: the list of already known kinds, then the symbol
+ * "<str>_util" in "q_<str>.so" or, if that library cannot be opened, in
+ * the running program itself.  When no symbol is found a zeroed fallback
+ * descriptor using parse_noqopt/print_noqopt is allocated.  Whatever is
+ * found or allocated is pushed on the front of qdisc_list.
+ *
+ * Returns the descriptor, or NULL only if the fallback allocation fails.
+ */
+struct qdisc_util *get_qdisc_kind(char *str)
+{
+	struct qdisc_util *kind;
+	char name[256];
+	void *handle;
+
+	for (kind = qdisc_list; kind != NULL; kind = kind->next) {
+		if (strcmp(kind->id, str) == 0)
+			return kind;
+	}
+
+	snprintf(name, sizeof(name), "q_%s.so", str);
+	handle = dlopen(name, RTLD_LAZY);
+	if (handle == NULL) {
+		/* No plugin: fall back to symbols of the main program. */
+		if (BODY == NULL)
+			BODY = dlopen(NULL, RTLD_LAZY);
+		handle = BODY;
+	}
+
+	kind = NULL;
+	if (handle != NULL) {
+		snprintf(name, sizeof(name), "%s_util", str);
+		kind = dlsym(handle, name);
+	}
+
+	if (kind == NULL) {
+		kind = malloc(sizeof(*kind));
+		if (kind == NULL)
+			return NULL;
+		memset(kind, 0, sizeof(*kind));
+		strncpy(kind->id, str, 15);
+		kind->parse_qopt = parse_noqopt;
+		kind->print_qopt = print_noqopt;
+	}
+
+	kind->next = qdisc_list;
+	qdisc_list = kind;
+	return kind;
+}
+
+
+/*
+ * Look up the descriptor for filter kind "str".
+ *
+ * Search order: the list of already known kinds, then the symbol
+ * "<str>_util" in "f_<str>.so" or, if that library cannot be opened, in
+ * the running program itself.  When no symbol is found a zeroed fallback
+ * descriptor using parse_nofopt/print_nofopt is allocated.  Whatever is
+ * found or allocated is pushed on the front of filter_list.
+ *
+ * Returns the descriptor, or NULL only if the fallback allocation fails.
+ */
+struct filter_util *get_filter_kind(char *str)
+{
+	struct filter_util *kind;
+	char name[256];
+	void *handle;
+
+	for (kind = filter_list; kind != NULL; kind = kind->next) {
+		if (strcmp(kind->id, str) == 0)
+			return kind;
+	}
+
+	snprintf(name, sizeof(name), "f_%s.so", str);
+	handle = dlopen(name, RTLD_LAZY);
+	if (handle == NULL) {
+		/* No plugin: fall back to symbols of the main program. */
+		if (BODY == NULL)
+			BODY = dlopen(NULL, RTLD_LAZY);
+		handle = BODY;
+	}
+
+	kind = NULL;
+	if (handle != NULL) {
+		snprintf(name, sizeof(name), "%s_util", str);
+		kind = dlsym(handle, name);
+	}
+
+	if (kind == NULL) {
+		kind = malloc(sizeof(*kind));
+		if (kind == NULL)
+			return NULL;
+		memset(kind, 0, sizeof(*kind));
+		strncpy(kind->id, str, 15);
+		kind->parse_fopt = parse_nofopt;
+		kind->print_fopt = print_nofopt;
+	}
+
+	kind->next = filter_list;
+	filter_list = kind;
+	return kind;
+}
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the top-level tc synopsis on stderr and terminate with exit(-1). */
+static void usage(void)
+{
+	static const char synopsis[] =
+		"Usage: tc [ OPTIONS ] OBJECT { COMMAND | help }\n"
+		"where OBJECT := { qdisc | class | filter }\n"
+		" OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -b[atch] file }\n";
+
+	fputs(synopsis, stderr);
+	exit(-1);
+}
+
+
+
+/*
+ * Entry point of the "tc" front end.
+ *
+ * Batch mode ("tc -batch FILE", FILE may be "-" for stdin): every line of
+ * FILE is split on spaces and dispatched as "OBJECT ARGS..."; lines that
+ * contain no token are skipped.  A line without a terminating newline, too
+ * many arguments or an unknown object aborts with exit(-1).
+ *
+ * Normal mode: leading "-" options are consumed, then the remaining
+ * arguments are handed to do_qdisc(), do_class() or do_filter(), whose
+ * return value is returned from main().  Without an object the usage text
+ * is printed.
+ */
+int main(int argc, char **argv)
+{
+	/* batch mode */
+	if (argc > 1 && matches(argv[1], "-batch") == 0) {
+		FILE *batch;
+		char line[400];
+		char *largv[100];
+		int largc, ret=0;
+#define BMAXARG (sizeof(largv)/sizeof(char *)-2)
+
+		if (argc != 3) {
+			fprintf(stderr, "Wrong number of arguments in batch mode\n");
+			exit(-1);
+		}
+		if (matches(argv[2], "-") != 0) {
+			if ((batch = fopen(argv[2], "r")) == NULL) {
+				fprintf(stderr, "Cannot open file \"%s\" for reading: %s\n", argv[2], strerror(errno));
+				exit(-1);
+			}
+		} else {
+			if ((batch = fdopen(0, "r")) == NULL) {
+				fprintf(stderr, "Cannot open stdin for reading: %s\n", strerror(errno));
+				exit(-1);
+			}
+		}
+
+		tc_core_init();
+
+		while (fgets(line, sizeof(line)-1, batch)) {
+			size_t len = strlen(line);
+
+			/* len can be 0 if the input starts with a NUL byte. */
+			if (len > 0 && line[len-1]=='\n') {
+				line[len-1] = '\0';
+			} else {
+				fprintf(stderr, "No newline at the end of line, looks like to long (%d chars or more)\n", (int)len);
+				exit(-1);
+			}
+			largc = 0;
+			largv[largc]=strtok(line, " ");
+			/* Blank line: strtok() found no token, nothing to dispatch. */
+			if (largv[0] == NULL)
+				continue;
+			while ((largv[++largc]=strtok(NULL, " ")) != NULL) {
+				if (largc > BMAXARG) {
+					fprintf(stderr, "Over %d arguments in batch mode, enough!\n", (int)BMAXARG);
+					exit(-1);
+				}
+			}
+
+			if (matches(largv[0], "qdisc") == 0) {
+				ret += do_qdisc(largc-1, largv+1);
+			} else if (matches(largv[0], "class") == 0) {
+				ret += do_class(largc-1, largv+1);
+			} else if (matches(largv[0], "filter") == 0) {
+				ret += do_filter(largc-1, largv+1);
+			} else if (matches(largv[0], "help") == 0) {
+				usage();	/* note that usage() doesn't return */
+			} else {
+				fprintf(stderr, "Object \"%s\" is unknown, try \"tc help\".\n", largv[0]);
+				exit(-1);
+			}
+		}
+		fclose(batch);
+		/* ret is accumulated above, but the batch exit status is 0 regardless. */
+		exit(0);	/* end of batch, that's all */
+	}
+
+	while (argc > 1) {
+		if (argv[1][0] != '-')
+			break;
+		if (matches(argv[1], "-stats") == 0 ||
+		    matches(argv[1], "-statistics") == 0) {
+			++show_stats;
+		} else if (matches(argv[1], "-details") == 0) {
+			++show_details;
+		} else if (matches(argv[1], "-raw") == 0) {
+			++show_raw;
+		} else if (matches(argv[1], "-Version") == 0) {
+			printf("tc utility, iproute2-ss%s\n", SNAPSHOT);
+			exit(0);
+		} else if (matches(argv[1], "-help") == 0) {
+			usage();
+		} else {
+			fprintf(stderr, "Option \"%s\" is unknown, try \"tc -help\".\n", argv[1]);
+			exit(-1);
+		}
+		argc--;	argv++;
+	}
+
+	tc_core_init();
+
+	if (argc > 1) {
+		if (matches(argv[1], "qdisc") == 0)
+			return do_qdisc(argc-2, argv+2);
+		if (matches(argv[1], "class") == 0)
+			return do_class(argc-2, argv+2);
+		if (matches(argv[1], "filter") == 0)
+			return do_filter(argc-2, argv+2);
+		if (matches(argv[1], "help") == 0)
+			usage();
+		fprintf(stderr, "Object \"%s\" is unknown, try \"tc help\".\n", argv[1]);
+		exit(-1);
+	}
+
+	usage();
+}
diff --git a/tc/tc_cbq.c b/tc/tc_cbq.c
index e69de29b..0abcc9da 100644
--- a/tc/tc_cbq.c
+++ b/tc/tc_cbq.c
@@ -0,0 +1,57 @@
+/*
+ * tc_cbq.c CBQ maintenance routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <math.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "tc_core.h"
+#include "tc_cbq.h"
+
+/*
+ * Compute the CBQ "maxidle" value.
+ *
+ * With g = 1 - 2^-ewma_log and xmt = avpkt/bndw the base value is
+ * xmt*(1-g).  When the class rate differs from the link bandwidth and
+ * maxburst is non-zero, (avpkt/rate - xmt)*(g^-maxburst - 1) is used
+ * instead if it is larger.  The result is scaled by 2^ewma_log and 1e6
+ * and passed through tc_core_usec2tick().
+ */
+unsigned tc_cbq_calc_maxidle(unsigned bndw, unsigned rate, unsigned avpkt,
+			     int ewma_log, unsigned maxburst)
+{
+	const double g = 1.0 - 1.0/(1<<ewma_log);
+	const double link_xmt = (double)avpkt/bndw;
+	double idle = link_xmt*(1-g);
+
+	if (maxburst != 0 && bndw != rate) {
+		double burst_idle = (double)avpkt/rate - link_xmt;
+
+		burst_idle *= (pow(g, -(double)maxburst) - 1);
+		if (burst_idle > idle)
+			idle = burst_idle;
+	}
+
+	return tc_core_usec2tick(idle*(1<<ewma_log)*1000000);
+}
+
+/*
+ * Compute the CBQ "offtime" value.
+ *
+ * Returns 0 when minburst is 0.  Otherwise the difference
+ * avpkt/rate - avpkt/bndw is multiplied by a factor that depends on
+ * g = 1 - 2^-ewma_log and minburst, scaled by 1e6 and passed through
+ * tc_core_usec2tick().
+ */
+unsigned tc_cbq_calc_offtime(unsigned bndw, unsigned rate, unsigned avpkt,
+			     int ewma_log, unsigned minburst)
+{
+	double g;
+	double off;
+
+	if (minburst == 0)
+		return 0;
+
+	g = 1.0 - 1.0/(1<<ewma_log);
+	off = (double)avpkt/rate - (double)avpkt/bndw;
+
+	if (minburst == 1)
+		off *= pow(g, -(double)minburst) - 1;
+	else
+		off *= 1 + (pow(g, -(double)(minburst-1)) - 1)/(1-g);
+
+	return tc_core_usec2tick(off*1000000);
+}
diff --git a/tc/tc_cbq.h b/tc/tc_cbq.h
index e69de29b..8f956490 100644
--- a/tc/tc_cbq.h
+++ b/tc/tc_cbq.h
@@ -0,0 +1,9 @@
+#ifndef _TC_CBQ_H_
+#define _TC_CBQ_H_ 1
+
+/*
+ * CBQ parameter helpers implemented in tc_cbq.c.  Both convert their
+ * result with tc_core_usec2tick(); tc_cbq_calc_offtime() returns 0 when
+ * minburst is 0.
+ */
+unsigned tc_cbq_calc_maxidle(unsigned bndw, unsigned rate, unsigned avpkt,
+ int ewma_log, unsigned maxburst);
+unsigned tc_cbq_calc_offtime(unsigned bndw, unsigned rate, unsigned avpkt,
+ int ewma_log, unsigned minburst);
+
+#endif
diff --git a/tc/tc_class.c b/tc/tc_class.c
index e69de29b..542f8d5f 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -0,0 +1,361 @@
+/*
+ * tc_class.c "tc class".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <math.h>
+
+#include "utils.h"
+#include "tc_util.h"
+#include "tc_common.h"
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the "tc class" synopsis on stderr and terminate with exit(-1). */
+static void usage(void)
+{
+	static const char *const lines[] = {
+		"Usage: tc class [ add | del | change | get ] dev STRING\n",
+		" [ classid CLASSID ] [ root | parent CLASSID ]\n",
+		" [ [ QDISC_KIND ] [ help | OPTIONS ] ]\n",
+		"\n",
+		" tc class show [ dev STRING ] [ root | parent CLASSID ]\n",
+		"Where:\n",
+		"QDISC_KIND := { prio | cbq | etc. }\n",
+		"OPTIONS := ... try tc class add <desired QDISC_KIND> help\n",
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(lines)/sizeof(lines[0]); i++)
+		fputs(lines[i], stderr);
+	exit(-1);
+}
+
+/*
+ * Build and send one RTM_*TCLASS request.
+ *
+ * cmd   - netlink message type stored in nlmsg_type.
+ * flags - extra NLM_F_* flags, OR-ed with NLM_F_REQUEST.
+ * argc/argv - remaining command line: dev, classid, root/parent and
+ *         estimator keywords, followed by an optional qdisc kind whose
+ *         parse_copt handler consumes the rest.
+ *
+ * Returns 0 on success and -1 when the estimator cannot be parsed; every
+ * other error terminates the process through exit().
+ */
+int tc_class_modify(int cmd, unsigned flags, int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct {
+		struct nlmsghdr n;
+		struct tcmsg t;
+		char buf[4096];
+	} req;
+	struct qdisc_util *q = NULL;
+	struct tc_estimator est;
+	char d[16];
+	char k[16];
+
+	memset(&req, 0, sizeof(req));
+	memset(&est, 0, sizeof(est));
+	memset(d, 0, sizeof(d));
+	memset(k, 0, sizeof(k));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+	req.n.nlmsg_type = cmd;
+	req.t.tcm_family = AF_UNSPEC;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (d[0])
+				duparg("dev", *argv);
+			/* d was zeroed above, so copying size-1 bytes keeps it terminated. */
+			strncpy(d, *argv, sizeof(d)-1);
+		} else if (strcmp(*argv, "classid") == 0) {
+			__u32 handle;
+			NEXT_ARG();
+			if (req.t.tcm_handle)
+				duparg("classid", *argv);
+			if (get_tc_classid(&handle, *argv))
+				invarg(*argv, "invalid class ID");
+			req.t.tcm_handle = handle;
+		} else if (strcmp(*argv, "root") == 0) {
+			if (req.t.tcm_parent) {
+				fprintf(stderr, "Error: \"root\" is duplicate parent ID.\n");
+				exit(-1);
+			}
+			req.t.tcm_parent = TC_H_ROOT;
+		} else if (strcmp(*argv, "parent") == 0) {
+			__u32 handle;
+			NEXT_ARG();
+			if (req.t.tcm_parent)
+				duparg("parent", *argv);
+			if (get_tc_classid(&handle, *argv))
+				invarg(*argv, "invalid parent ID");
+			req.t.tcm_parent = handle;
+		} else if (matches(*argv, "estimator") == 0) {
+			if (parse_estimator(&argc, &argv, &est))
+				return -1;
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			/* First unrecognised word names the qdisc kind; the rest belongs to it. */
+			strncpy(k, *argv, sizeof(k)-1);
+
+			q = get_qdisc_kind(k);
+			argc--; argv++;
+			break;
+		}
+		argc--; argv++;
+	}
+
+	if (k[0])
+		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
+	if (est.ewma_log)
+		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
+
+	if (q) {
+		if (q->parse_copt == NULL) {
+			fprintf(stderr, "Error: Qdisc \"%s\" is classless.\n", k);
+			exit(1);
+		}
+		if (q->parse_copt(q, argc, argv, &req.n))
+			exit(1);
+	} else {
+		if (argc) {
+			if (matches(*argv, "help") == 0)
+				usage();
+			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc class help\".\n", *argv);
+			exit(-1);
+		}
+	}
+
+	if (rtnl_open(&rth, 0) < 0) {
+		fprintf(stderr, "Cannot open rtnetlink\n");
+		exit(1);
+	}
+
+	if (d[0]) {
+		ll_init_map(&rth);
+
+		if ((req.t.tcm_ifindex = ll_name_to_index(d)) == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n", d);
+			exit(1);
+		}
+	}
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+		exit(2);
+
+	rtnl_close(&rth);
+	return 0;
+}
+
+/*
+ * Print the generic counters of *st on fp: bytes, packets, drops and
+ * overlimits first, then - on a second line and only when at least one of
+ * them is non-zero - the rate (bps/pps) and the backlog (bytes/packets).
+ */
+void print_class_tcstats(FILE *fp, struct tc_stats *st)
+{
+	SPRINT_BUF(b1);
+	int has_rate = st->bps || st->pps;
+	int has_backlog = st->qlen || st->backlog;
+
+	fprintf(fp, " Sent %llu bytes %u pkts (dropped %u, overlimits %u) ",
+		(unsigned long long)st->bytes, st->packets, st->drops, st->overlimits);
+
+	if (!has_rate && !has_backlog)
+		return;
+
+	fprintf(fp, "\n ");
+
+	if (has_rate) {
+		fprintf(fp, "rate ");
+		if (st->bps)
+			fprintf(fp, "%s ", sprint_rate(st->bps, b1));
+		if (st->pps)
+			fprintf(fp, "%upps ", st->pps);
+	}
+
+	if (has_backlog) {
+		fprintf(fp, "backlog ");
+		if (st->backlog)
+			fprintf(fp, "%s ", sprint_size(st->backlog, b1));
+		if (st->qlen)
+			fprintf(fp, "%up ", st->qlen);
+	}
+}
+
+/* Set by tc_class_list() when "dev" is given; print_class() omits "dev NAME" while it is non-zero. */
+int filter_ifindex;
+/* Set by tc_class_list() from "qdisc ID"; print_class() skips classes whose handle has a different major part. */
+__u32 filter_qdisc;
+
+/*
+ * Dump callback: print one RTM_NEWTCLASS / RTM_DELTCLASS message on the
+ * FILE passed in arg.
+ *
+ * Returns 0 when the message was printed, skipped by the qdisc filter or
+ * is not a class message, and -1 when it is too short or carries no
+ * TCA_KIND attribute.
+ */
+int print_class(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct tcmsg *t = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[TCA_MAX+1];
+	struct qdisc_util *q;
+	char abuf[256];
+
+	if (n->nlmsg_type != RTM_NEWTCLASS && n->nlmsg_type != RTM_DELTCLASS) {
+		fprintf(stderr, "Not a class\n");
+		return 0;
+	}
+	len -= NLMSG_LENGTH(sizeof(*t));
+	if (len < 0) {
+		fprintf(stderr, "Wrong len %d\n", len);
+		return -1;
+	}
+	/* Skip classes whose major handle differs from the requested qdisc. */
+	if (filter_qdisc && TC_H_MAJ(t->tcm_handle^filter_qdisc))
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
+
+	if (tb[TCA_KIND] == NULL) {
+		fprintf(stderr, "NULL kind\n");
+		return -1;
+	}
+
+	if (n->nlmsg_type == RTM_DELTCLASS)
+		fprintf(fp, "deleted ");
+
+	abuf[0] = 0;
+	if (t->tcm_handle) {
+		if (filter_qdisc)
+			print_tc_classid(abuf, sizeof(abuf), TC_H_MIN(t->tcm_handle));
+		else
+			print_tc_classid(abuf, sizeof(abuf), t->tcm_handle);
+	}
+	fprintf(fp, "class %s %s ", (char*)RTA_DATA(tb[TCA_KIND]), abuf);
+
+	if (filter_ifindex == 0)
+		fprintf(fp, "dev %s ", ll_index_to_name(t->tcm_ifindex));
+
+	if (t->tcm_parent == TC_H_ROOT)
+		fprintf(fp, "root ");
+	else {
+		if (filter_qdisc)
+			print_tc_classid(abuf, sizeof(abuf), TC_H_MIN(t->tcm_parent));
+		else
+			print_tc_classid(abuf, sizeof(abuf), t->tcm_parent);
+		fprintf(fp, "parent %s ", abuf);
+	}
+	if (t->tcm_info)
+		fprintf(fp, "leaf %x: ", t->tcm_info>>16);
+	q = get_qdisc_kind(RTA_DATA(tb[TCA_KIND]));
+	if (tb[TCA_OPTIONS]) {
+		if (q && q->print_copt)
+			q->print_copt(q, fp, tb[TCA_OPTIONS]);
+		else
+			fprintf(fp, "[cannot parse class parameters]");
+	}
+	fprintf(fp, "\n");
+	if (show_stats) {
+		if (tb[TCA_STATS]) {
+			if (RTA_PAYLOAD(tb[TCA_STATS]) < sizeof(struct tc_stats))
+				fprintf(fp, "statistics truncated");
+			else {
+				struct tc_stats st;
+				/* Copy out of the message so the struct is properly aligned. */
+				memcpy(&st, RTA_DATA(tb[TCA_STATS]), sizeof(st));
+				print_class_tcstats(fp, &st);
+				fprintf(fp, "\n");
+			}
+		}
+		/*
+		 * Fallback descriptors created by get_qdisc_kind() are zeroed
+		 * and have no print_xstats handler, so check before calling.
+		 */
+		if (q && q->print_xstats && tb[TCA_XSTATS]) {
+			q->print_xstats(q, fp, tb[TCA_XSTATS]);
+			fprintf(fp, "\n");
+		}
+	}
+	fflush(fp);
+	return 0;
+}
+
+
+/*
+ * "tc class show": parse the optional dev / qdisc / root / parent
+ * selectors, send an RTM_GETTCLASS dump request and print every answer
+ * with print_class() on stdout.
+ *
+ * Returns 0 on success; all errors terminate the process through exit().
+ */
+int tc_class_list(int argc, char **argv)
+{
+	struct tcmsg t;
+	struct rtnl_handle rth;
+	char d[16];
+
+	memset(&t, 0, sizeof(t));
+	t.tcm_family = AF_UNSPEC;
+	memset(d, 0, sizeof(d));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (d[0])
+				duparg("dev", *argv);
+			strncpy(d, *argv, sizeof(d)-1);
+		} else if (strcmp(*argv, "qdisc") == 0) {
+			NEXT_ARG();
+			if (filter_qdisc)
+				duparg("qdisc", *argv);
+			if (get_qdisc_handle(&filter_qdisc, *argv))
+				invarg(*argv, "invalid qdisc ID");
+		} else if (strcmp(*argv, "root") == 0) {
+			if (t.tcm_parent) {
+				fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
+				exit(-1);
+			}
+			t.tcm_parent = TC_H_ROOT;
+		} else if (strcmp(*argv, "parent") == 0) {
+			__u32 handle;
+			/* Advance first so duparg() reports the value, as the other keywords do. */
+			NEXT_ARG();
+			if (t.tcm_parent)
+				duparg("parent", *argv);
+			if (get_tc_classid(&handle, *argv))
+				invarg(*argv, "invalid parent ID");
+			t.tcm_parent = handle;
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			fprintf(stderr, "What is \"%s\"? Try \"tc class help\".\n", *argv);
+			exit(-1);
+		}
+
+		argc--; argv++;
+	}
+
+	if (rtnl_open(&rth, 0) < 0) {
+		fprintf(stderr, "Cannot open rtnetlink\n");
+		exit(1);
+	}
+
+	ll_init_map(&rth);
+
+	if (d[0]) {
+		if ((t.tcm_ifindex = ll_name_to_index(d)) == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n", d);
+			exit(1);
+		}
+		/* Tells print_class() that the device name need not be repeated. */
+		filter_ifindex = t.tcm_ifindex;
+	}
+
+	if (rtnl_dump_request(&rth, RTM_GETTCLASS, &t, sizeof(t)) < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	if (rtnl_dump_filter(&rth, print_class, stdout, NULL, NULL) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	rtnl_close(&rth);
+	return 0;
+}
+
+/*
+ * Dispatch a "tc class" sub-command.  Without arguments the class list is
+ * shown; an unknown command prints an error and returns -1.
+ */
+int do_class(int argc, char **argv)
+{
+	char *cmd;
+
+	if (argc < 1)
+		return tc_class_list(0, NULL);
+
+	/* Split off the command word; the handlers get what follows it. */
+	cmd = *argv;
+	argc--;
+	argv++;
+
+	if (matches(cmd, "add") == 0)
+		return tc_class_modify(RTM_NEWTCLASS, NLM_F_EXCL|NLM_F_CREATE, argc, argv);
+	if (matches(cmd, "change") == 0)
+		return tc_class_modify(RTM_NEWTCLASS, 0, argc, argv);
+	if (matches(cmd, "replace") == 0)
+		return tc_class_modify(RTM_NEWTCLASS, NLM_F_CREATE, argc, argv);
+	if (matches(cmd, "delete") == 0)
+		return tc_class_modify(RTM_DELTCLASS, 0, argc, argv);
+#if 0
+	if (matches(cmd, "get") == 0)
+		return tc_class_get(RTM_GETTCLASS, 0, argc, argv);
+#endif
+	if (matches(cmd, "list") == 0 ||
+	    matches(cmd, "show") == 0 ||
+	    matches(cmd, "lst") == 0)
+		return tc_class_list(argc, argv);
+	if (matches(cmd, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"tc class help\".\n", cmd);
+	return -1;
+}
diff --git a/tc/tc_common.h b/tc/tc_common.h
index e69de29b..d695ca2d 100644
--- a/tc/tc_common.h
+++ b/tc/tc_common.h
@@ -0,0 +1,5 @@
+/* Per-object command dispatchers called from main() in tc.c with the arguments that follow the object name. */
+extern int do_qdisc(int argc, char **argv);
+extern int do_class(int argc, char **argv);
+extern int do_filter(int argc, char **argv);
+
+/*
+ * Called by the class and filter modify paths when the "estimator" keyword
+ * is seen; receives pointers to the caller's argc/argv.  Callers treat a
+ * non-zero (class) or negative (filter) result as failure.
+ */
+extern int parse_estimator(int *p_argc, char ***p_argv, struct tc_estimator *est);
diff --git a/tc/tc_core.c b/tc/tc_core.c
index e69de29b..55586741 100644
--- a/tc/tc_core.c
+++ b/tc/tc_core.c
@@ -0,0 +1,85 @@
+/*
+ * tc_core.c TC core library.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <math.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "tc_core.h"
+
+/* The two hexadecimal values read from /proc/net/psched by tc_core_init(). */
+static __u32 t2us=1;
+static __u32 us2t=1;
+/* Conversion ratio t2us/us2t; stays 1 until tc_core_init() succeeds. */
+static double tick_in_usec = 1;
+
+/* Convert microseconds to ticks: usec multiplied by tick_in_usec, converted to long. */
+long tc_core_usec2tick(long usec)
+{
+	return (long)(usec*tick_in_usec);
+}
+
+/* Convert ticks to microseconds: tick divided by tick_in_usec, converted to long. */
+long tc_core_tick2usec(long tick)
+{
+	return (long)(tick/tick_in_usec);
+}
+
+/* Ticks for 1000000*size/rate microseconds, i.e. "size" units sent at "rate" units per second. */
+unsigned tc_calc_xmittime(unsigned rate, unsigned size)
+{
+	double usec = 1000000*((double)size/rate);
+
+	return tc_core_usec2tick(usec);
+}
+
+/*
+ rtab[pkt_len>>cell_log] = pkt_xmit_time
+ */
+
+/*
+ * Fill the 256-entry table rtab with transmission times in ticks.
+ *
+ * Entry i covers size i<<cell_log, raised to mpu when smaller, at rate
+ * bps.  A zero mtu is replaced by 2047.  A negative cell_log asks for the
+ * smallest shift for which mtu>>cell_log is at most 255.
+ *
+ * Returns the cell_log that was used.
+ */
+int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu,
+		   unsigned mpu)
+{
+	int slot;
+
+	if (mtu == 0)
+		mtu = 2047;
+
+	if (cell_log < 0) {
+		for (cell_log = 0; (mtu>>cell_log) > 255; cell_log++)
+			;
+	}
+
+	for (slot = 0; slot < 256; slot++) {
+		unsigned sz = (slot<<cell_log);
+
+		sz = (sz < mpu) ? mpu : sz;
+		rtab[slot] = tc_core_usec2tick(1000000*((double)sz/bps));
+	}
+
+	return cell_log;
+}
+
+/*
+ * Read the two hexadecimal values from /proc/net/psched and derive the
+ * tick/microsecond ratio used by tc_core_usec2tick()/tc_core_tick2usec().
+ *
+ * Returns 0 on success.  Returns -1, leaving the ratio unchanged, when
+ * the file cannot be opened, does not hold two values, or the second
+ * value is zero (which would make the ratio infinite or NaN).
+ */
+int tc_core_init(void)
+{
+	FILE *fp = fopen("/proc/net/psched", "r");
+	int nread;
+
+	if (fp == NULL)
+		return -1;
+
+	nread = fscanf(fp, "%08x%08x", &t2us, &us2t);
+	fclose(fp);
+
+	if (nread != 2)
+		return -1;
+	if (us2t == 0)
+		return -1;
+
+	tick_in_usec = (double)t2us/us2t;
+	return 0;
+}
diff --git a/tc/tc_core.h b/tc/tc_core.h
index e69de29b..1d2257ee 100644
--- a/tc/tc_core.h
+++ b/tc/tc_core.h
@@ -0,0 +1,16 @@
+#ifndef _TC_CORE_H_
+#define _TC_CORE_H_ 1
+
+#include <asm/types.h>
+#include <linux/pkt_sched.h>
+
+/* Microsecond <-> tick conversion using the ratio set up by tc_core_init(). */
+long tc_core_usec2tick(long usec);
+long tc_core_tick2usec(long tick);
+/* Ticks corresponding to 1000000*size/rate microseconds. */
+unsigned tc_calc_xmittime(unsigned rate, unsigned size);
+/* Fills the 256 entries of rtab and returns the cell_log that was used. */
+int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu, unsigned mpu);
+
+/* Fills est->interval and est->ewma_log; returns 0 on success, -1 if no valid setting exists. */
+int tc_setup_estimator(unsigned A, unsigned time_const, struct tc_estimator *est);
+
+/* Reads /proc/net/psched; returns 0 on success, -1 on failure. */
+int tc_core_init(void);
+
+#endif
diff --git a/tc/tc_estimator.c b/tc/tc_estimator.c
index e69de29b..434db0fe 100644
--- a/tc/tc_estimator.c
+++ b/tc/tc_estimator.c
@@ -0,0 +1,44 @@
+/*
+ * tc_estimator.c TC rate estimator setup.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <math.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "tc_core.h"
+
+/*
+ * Derive the estimator parameters from a sampling interval A and a time
+ * constant (both compared against multiples of 1000000/4, i.e. presumably
+ * microseconds - confirm with the caller).
+ *
+ * est->interval: smallest i in 0..5 with A <= (1<<i)*250000, stored as
+ *                i-2.  Fails if A is larger than the i == 5 bound.
+ * est->ewma_log: one less than the first log in 1..30 for which
+ *                A / -ln(1 - 2^-log) exceeds time_const.
+ *
+ * Returns 0 on success.  Returns -1 when A is out of range, when no log
+ * in 1..30 qualifies (est->ewma_log is then left at 31) or when the
+ * resulting ewma_log would be 0.
+ */
+int tc_setup_estimator(unsigned A, unsigned time_const, struct tc_estimator *est)
+{
+	for (est->interval=0; est->interval<=5; est->interval++) {
+		if (A <= (1<<est->interval)*(1000000/4))
+			break;
+	}
+	if (est->interval > 5)
+		return -1;
+	est->interval -= 2;
+
+	/*
+	 * Stop at 30: evaluating 1<<31 on a signed int is undefined
+	 * behaviour, and a result of 31 was rejected anyway.
+	 */
+	for (est->ewma_log=1; est->ewma_log<31; est->ewma_log++) {
+		double w = 1.0 - 1.0/(1<<est->ewma_log);
+		if (A/(-log(w)) > time_const)
+			break;
+	}
+	if (est->ewma_log >= 31)
+		return -1;
+
+	est->ewma_log--;
+	if (est->ewma_log == 0)
+		return -1;
+	return 0;
+}
diff --git a/tc/tc_filter.c b/tc/tc_filter.c
index e69de29b..300c3e70 100644
--- a/tc/tc_filter.c
+++ b/tc/tc_filter.c
@@ -0,0 +1,388 @@
+/*
+ * tc_filter.c "tc filter".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <linux/if_ether.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "tc_util.h"
+#include "tc_common.h"
+
+static void usage(void) __attribute__((noreturn));
+
+/* Print the "tc filter" synopsis on stderr and terminate with exit(-1). */
+static void usage(void)
+{
+	static const char *const lines[] = {
+		"Usage: tc filter [ add | del | change | get ] dev STRING\n",
+		" [ pref PRIO ] [ protocol PROTO ]\n",
+		" [ estimator INTERVAL TIME_CONSTANT ]\n",
+		" [ root | classid CLASSID ] [ handle FILTERID ]\n",
+		" [ [ FILTER_TYPE ] [ help | OPTIONS ] ]\n",
+		"\n",
+		" tc filter show [ dev STRING ] [ root | parent CLASSID ]\n",
+		"Where:\n",
+		"FILTER_TYPE := { rsvp | u32 | fw | route | etc. }\n",
+		"FILTERID := ... format depends on classifier, see there\n",
+		"OPTIONS := ... try tc filter add <desired FILTER_KIND> help\n",
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(lines)/sizeof(lines[0]); i++)
+		fputs(lines[i], stderr);
+	exit(-1);
+}
+
+
+/*
+ * Build and send one RTM_*TFILTER request.
+ *
+ * cmd   - netlink message type stored in nlmsg_type.
+ * flags - extra NLM_F_* flags, OR-ed with NLM_F_REQUEST.
+ * argc/argv - remaining command line: dev, root/parent, handle,
+ *         preference/priority, protocol and estimator keywords, followed
+ *         by an optional filter kind whose parse_fopt handler consumes the
+ *         rest (and the handle string).
+ *
+ * Returns 0 on success and -1 when the estimator cannot be parsed; every
+ * other error terminates the process through exit().
+ */
+int tc_filter_modify(int cmd, unsigned flags, int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct {
+		struct nlmsghdr n;
+		struct tcmsg t;
+		char buf[4096];
+	} req;
+	struct filter_util *q = NULL;
+	__u32 prio = 0;
+	__u32 protocol = 0;
+	char *fhandle = NULL;
+	char d[16];
+	char k[16];
+	struct tc_estimator est;
+
+	memset(&req, 0, sizeof(req));
+	memset(&est, 0, sizeof(est));
+	memset(d, 0, sizeof(d));
+	memset(k, 0, sizeof(k));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+	req.n.nlmsg_type = cmd;
+	req.t.tcm_family = AF_UNSPEC;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (d[0])
+				duparg("dev", *argv);
+			/* d was zeroed above, so copying size-1 bytes keeps it terminated. */
+			strncpy(d, *argv, sizeof(d)-1);
+		} else if (strcmp(*argv, "root") == 0) {
+			if (req.t.tcm_parent) {
+				fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
+				exit(-1);
+			}
+			req.t.tcm_parent = TC_H_ROOT;
+		} else if (strcmp(*argv, "parent") == 0) {
+			__u32 handle;
+			NEXT_ARG();
+			if (req.t.tcm_parent)
+				duparg("parent", *argv);
+			if (get_tc_classid(&handle, *argv))
+				invarg(*argv, "Invalid parent ID");
+			req.t.tcm_parent = handle;
+		} else if (strcmp(*argv, "handle") == 0) {
+			NEXT_ARG();
+			if (fhandle)
+				duparg("handle", *argv);
+			/* Kept as text: its format is left to the filter's parse_fopt. */
+			fhandle = *argv;
+		} else if (matches(*argv, "preference") == 0 ||
+			   matches(*argv, "priority") == 0) {
+			NEXT_ARG();
+			if (prio)
+				duparg("priority", *argv);
+			if (get_u32(&prio, *argv, 0))
+				invarg(*argv, "invalid priority value");
+		} else if (matches(*argv, "protocol") == 0) {
+			__u16 id;
+			NEXT_ARG();
+			if (protocol)
+				duparg("protocol", *argv);
+			if (ll_proto_a2n(&id, *argv))
+				invarg(*argv, "invalid protocol");
+			protocol = id;
+		} else if (matches(*argv, "estimator") == 0) {
+			if (parse_estimator(&argc, &argv, &est) < 0)
+				return -1;
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			/* First unrecognised word names the filter kind; the rest belongs to it. */
+			strncpy(k, *argv, sizeof(k)-1);
+
+			q = get_filter_kind(k);
+			argc--; argv++;
+			break;
+		}
+
+		argc--; argv++;
+	}
+
+	/* Priority goes in the upper 16 bits of tcm_info, protocol in the lower. */
+	req.t.tcm_info = TC_H_MAKE(prio<<16, protocol);
+
+	if (k[0])
+		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
+
+	if (q) {
+		if (q->parse_fopt(q, fhandle, argc, argv, &req.n))
+			exit(1);
+	} else {
+		if (fhandle) {
+			fprintf(stderr, "Must specify filter type when using "
+				"\"handle\"\n");
+			exit(-1);
+		}
+		if (argc) {
+			if (matches(*argv, "help") == 0)
+				usage();
+			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc filter help\".\n", *argv);
+			exit(-1);
+		}
+	}
+	if (est.ewma_log)
+		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
+
+
+	if (rtnl_open(&rth, 0) < 0) {
+		fprintf(stderr, "Cannot open rtnetlink\n");
+		exit(1);
+	}
+
+	if (d[0]) {
+		ll_init_map(&rth);
+
+		if ((req.t.tcm_ifindex = ll_name_to_index(d)) == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n", d);
+			exit(1);
+		}
+	}
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+		exit(2);
+
+	rtnl_close(&rth);
+	return 0;
+}
+
+/*
+ * Selectors recorded by tc_filter_list() from its command line.
+ * print_filter() leaves out a field when it equals the selector.
+ */
+static __u32 filter_parent;
+static int filter_ifindex;
+static __u32 filter_prio;
+static __u32 filter_protocol;
+
+/*
+ * Dump callback: print one RTM_NEWTFILTER / RTM_DELTFILTER message on the
+ * FILE passed in arg.
+ *
+ * Returns 0 when the message was printed or is not a filter message, and
+ * -1 when it is too short or carries no TCA_KIND attribute.
+ */
+int print_filter(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+ FILE *fp = (FILE*)arg;
+ struct tcmsg *t = NLMSG_DATA(n);
+ int len = n->nlmsg_len;
+ struct rtattr * tb[TCA_MAX+1];
+ struct filter_util *q;
+ char abuf[256];
+
+ if (n->nlmsg_type != RTM_NEWTFILTER && n->nlmsg_type != RTM_DELTFILTER) {
+ fprintf(stderr, "Not a filter\n");
+ return 0;
+ }
+ /* len becomes the number of attribute bytes that follow the tcmsg header. */
+ len -= NLMSG_LENGTH(sizeof(*t));
+ if (len < 0) {
+ fprintf(stderr, "Wrong len %d\n", len);
+ return -1;
+ }
+
+ memset(tb, 0, sizeof(tb));
+ parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
+
+ if (tb[TCA_KIND] == NULL) {
+ fprintf(stderr, "NULL kind\n");
+ return -1;
+ }
+
+ if (n->nlmsg_type == RTM_DELTFILTER)
+ fprintf(fp, "deleted ");
+
+ fprintf(fp, "filter ");
+ /* Device, parent, protocol and preference are printed only when they differ from the list selectors. */
+ if (!filter_ifindex || filter_ifindex != t->tcm_ifindex)
+ fprintf(fp, "dev %s ", ll_index_to_name(t->tcm_ifindex));
+
+ if (!filter_parent || filter_parent != t->tcm_parent) {
+ if (t->tcm_parent == TC_H_ROOT)
+ fprintf(fp, "root ");
+ else {
+ print_tc_classid(abuf, sizeof(abuf), t->tcm_parent);
+ fprintf(fp, "parent %s ", abuf);
+ }
+ }
+ if (t->tcm_info) {
+ /* tcm_info carries the protocol in its minor part and the priority in its major part. */
+ __u32 protocol = TC_H_MIN(t->tcm_info);
+ __u32 prio = TC_H_MAJ(t->tcm_info)>>16;
+ if (!filter_protocol || filter_protocol != protocol) {
+ if (protocol) {
+ SPRINT_BUF(b1);
+ fprintf(fp, "protocol %s ",
+ ll_proto_n2a(protocol, b1, sizeof(b1)));
+ }
+ }
+ if (!filter_prio || filter_prio != prio) {
+ if (prio)
+ fprintf(fp, "pref %u ", prio);
+ }
+ }
+ fprintf(fp, "%s ", (char*)RTA_DATA(tb[TCA_KIND]));
+ q = get_filter_kind(RTA_DATA(tb[TCA_KIND]));
+ if (tb[TCA_OPTIONS]) {
+ if (q)
+ q->print_fopt(q, fp, tb[TCA_OPTIONS], t->tcm_handle);
+ else
+ fprintf(fp, "[cannot parse parameters]");
+ }
+ fprintf(fp, "\n");
+
+ if (show_stats) {
+ if (tb[TCA_STATS]) {
+ if (RTA_PAYLOAD(tb[TCA_STATS]) < sizeof(struct tc_stats))
+ fprintf(fp, "statistics truncated");
+ else {
+ struct tc_stats st;
+ /* Copied into a local struct before printing. */
+ memcpy(&st, RTA_DATA(tb[TCA_STATS]), sizeof(st));
+ print_tcstats(fp, &st);
+ fprintf(fp, "\n");
+ }
+ }
+ }
+ fflush(fp);
+ return 0;
+}
+
+
+/*
+ * "tc filter show": parse the optional dev / root / parent / handle /
+ * preference / protocol selectors, send an RTM_GETTFILTER dump request and
+ * print every answer with print_filter() on stdout.
+ *
+ * Returns 0 on success; all errors terminate the process through exit().
+ */
+int tc_filter_list(int argc, char **argv)
+{
+ struct tcmsg t;
+ struct rtnl_handle rth;
+ char d[16];
+ __u32 prio = 0;
+ __u32 protocol = 0;
+ char *fhandle = NULL;
+
+ memset(&t, 0, sizeof(t));
+ t.tcm_family = AF_UNSPEC;
+ memset(d, 0, sizeof(d));
+
+ while (argc > 0) {
+ if (strcmp(*argv, "dev") == 0) {
+ NEXT_ARG();
+ if (d[0])
+ duparg("dev", *argv);
+ strncpy(d, *argv, sizeof(d)-1);
+ } else if (strcmp(*argv, "root") == 0) {
+ if (t.tcm_parent) {
+ fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
+ exit(-1);
+ }
+ filter_parent = t.tcm_parent = TC_H_ROOT;
+ } else if (strcmp(*argv, "parent") == 0) {
+ __u32 handle;
+ NEXT_ARG();
+ if (t.tcm_parent)
+ duparg("parent", *argv);
+ if (get_tc_classid(&handle, *argv))
+ invarg(*argv, "invalid parent ID");
+ filter_parent = t.tcm_parent = handle;
+ } else if (strcmp(*argv, "handle") == 0) {
+ NEXT_ARG();
+ if (fhandle)
+ duparg("handle", *argv);
+ /* Recorded only for duplicate detection; not used further in this function. */
+ fhandle = *argv;
+ } else if (matches(*argv, "preference") == 0 ||
+ matches(*argv, "priority") == 0) {
+ NEXT_ARG();
+ if (prio)
+ duparg("priority", *argv);
+ if (get_u32(&prio, *argv, 0))
+ invarg(*argv, "invalid preference");
+ filter_prio = prio;
+ } else if (matches(*argv, "protocol") == 0) {
+ __u16 res;
+ NEXT_ARG();
+ if (protocol)
+ duparg("protocol", *argv);
+ if (ll_proto_a2n(&res, *argv))
+ invarg(*argv, "invalid protocol");
+ protocol = res;
+ filter_protocol = protocol;
+ } else if (matches(*argv, "help") == 0) {
+ usage();
+ } else {
+ fprintf(stderr, " What is \"%s\"? Try \"tc filter help\"\n", *argv);
+ exit(-1);
+ }
+
+ argc--; argv++;
+ }
+
+ /* Same packing as in tc_filter_modify(): priority shifted left by 16, combined with protocol. */
+ t.tcm_info = TC_H_MAKE(prio<<16, protocol);
+
+ if (rtnl_open(&rth, 0) < 0) {
+ fprintf(stderr, "Cannot open rtnetlink\n");
+ exit(1);
+ }
+
+ ll_init_map(&rth);
+
+ if (d[0]) {
+ if ((t.tcm_ifindex = ll_name_to_index(d)) == 0) {
+ fprintf(stderr, "Cannot find device \"%s\"\n", d);
+ exit(1);
+ }
+ /* Lets print_filter() omit the device name for matching entries. */
+ filter_ifindex = t.tcm_ifindex;
+ }
+
+ if (rtnl_dump_request(&rth, RTM_GETTFILTER, &t, sizeof(t)) < 0) {
+ perror("Cannot send dump request");
+ exit(1);
+ }
+
+ if (rtnl_dump_filter(&rth, print_filter, stdout, NULL, NULL) < 0) {
+ fprintf(stderr, "Dump terminated\n");
+ exit(1);
+ }
+
+ rtnl_close(&rth);
+ return 0;
+}
+
+/*
+ * Dispatch a "tc filter" sub-command.  Without arguments the filter list
+ * is shown; an unknown command prints an error and calls exit(-1).
+ */
+int do_filter(int argc, char **argv)
+{
+	char *cmd;
+
+	if (argc < 1)
+		return tc_filter_list(0, NULL);
+
+	/* Split off the command word; the handlers get what follows it. */
+	cmd = *argv;
+	argc--;
+	argv++;
+
+	if (matches(cmd, "add") == 0)
+		return tc_filter_modify(RTM_NEWTFILTER, NLM_F_EXCL|NLM_F_CREATE, argc, argv);
+	if (matches(cmd, "change") == 0)
+		return tc_filter_modify(RTM_NEWTFILTER, 0, argc, argv);
+	if (matches(cmd, "replace") == 0)
+		return tc_filter_modify(RTM_NEWTFILTER, NLM_F_CREATE, argc, argv);
+	if (matches(cmd, "delete") == 0)
+		return tc_filter_modify(RTM_DELTFILTER, 0, argc, argv);
+#if 0
+	if (matches(cmd, "get") == 0)
+		return tc_filter_get(RTM_GETTFILTER, 0, argc, argv);
+#endif
+	if (matches(cmd, "list") == 0 ||
+	    matches(cmd, "show") == 0 ||
+	    matches(cmd, "lst") == 0)
+		return tc_filter_list(argc, argv);
+	if (matches(cmd, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"tc filter help\".\n", cmd);
+	exit(-1);
+}
+
diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c
index e69de29b..361ca8aa 100644
--- a/tc/tc_qdisc.c
+++ b/tc/tc_qdisc.c
@@ -0,0 +1,353 @@
+/*
+ * tc_qdisc.c "tc qdisc".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * J Hadi Salim: Extension to ingress
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <math.h>
+
+#include "utils.h"
+#include "tc_util.h"
+#include "tc_common.h"
+
+static void usage(void) __attribute__((noreturn));
+
+/*
+ * Print the "tc qdisc" synopsis on stderr and exit with status -1.
+ *
+ * The command list mirrors what do_qdisc() actually dispatches: "link" is
+ * accepted there, while "get" is compiled out and therefore not advertised.
+ */
+static void usage(void)
+{
+	fprintf(stderr,
+		"Usage: tc qdisc [ add | del | replace | change | link ] dev STRING\n"
+		" [ handle QHANDLE ] [ root | ingress | parent CLASSID ]\n"
+		" [ estimator INTERVAL TIME_CONSTANT ]\n"
+		" [ [ QDISC_KIND ] [ help | OPTIONS ] ]\n"
+		"\n"
+		" tc qdisc show [ dev STRING ] [ingress]\n"
+		"Where:\n"
+		"QDISC_KIND := { [p|b]fifo | tbf | prio | cbq | red | etc. }\n"
+		"OPTIONS := ... try tc qdisc add <desired QDISC_KIND> help\n");
+	exit(-1);
+}
+
+/*
+ * Build one qdisc request from the command line and send it to the kernel.
+ *
+ * cmd is stored as the netlink message type and flags are OR-ed into
+ * NLM_F_REQUEST.  Recognised words are "dev NAME", "handle H", "root",
+ * "ingress" (only when TC_H_INGRESS is defined), "parent CLASSID",
+ * "estimator ..." and "help".  The first word that is none of these is
+ * taken as the qdisc kind; it (like "ingress") ends generic parsing, and
+ * the arguments that follow are passed to that kind's parse_qopt() callback.
+ *
+ * Returns 0 on success and -1 when parse_estimator() fails; every other
+ * failure terminates the process.
+ */
+int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct {
+		struct nlmsghdr n;
+		struct tcmsg t;
+		char buf[4096];
+	} req;
+	struct qdisc_util *q = NULL;
+	struct tc_estimator est;
+	char d[16];	/* device name, at most 15 characters are kept */
+	char k[16];	/* qdisc kind, at most 15 characters are kept */
+
+	memset(&req, 0, sizeof(req));
+	memset(&est, 0, sizeof(est));
+	memset(&d, 0, sizeof(d));
+	memset(&k, 0, sizeof(k));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+	req.n.nlmsg_type = cmd;
+	req.t.tcm_family = AF_UNSPEC;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (d[0])
+				duparg("dev", *argv);
+			strncpy(d, *argv, sizeof(d)-1);
+		} else if (strcmp(*argv, "handle") == 0) {
+			__u32 handle;
+
+			/*
+			 * Step to the value first, as the other branches do,
+			 * so that duparg() reports the duplicated value and
+			 * not the keyword itself.
+			 */
+			NEXT_ARG();
+			if (req.t.tcm_handle)
+				duparg("handle", *argv);
+			if (get_qdisc_handle(&handle, *argv))
+				invarg(*argv, "invalid qdisc ID");
+			req.t.tcm_handle = handle;
+		} else if (strcmp(*argv, "root") == 0) {
+			if (req.t.tcm_parent) {
+				fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
+				exit(-1);
+			}
+			req.t.tcm_parent = TC_H_ROOT;
+#ifdef TC_H_INGRESS
+		} else if (strcmp(*argv, "ingress") == 0) {
+			if (req.t.tcm_parent) {
+				fprintf(stderr, "Error: \"ingress\" is a duplicate parent ID\n");
+				exit(-1);
+			}
+			req.t.tcm_parent = TC_H_INGRESS;
+			strncpy(k, "ingress", sizeof(k)-1);
+			q = get_qdisc_kind(k);
+			req.t.tcm_handle = 0xffff0000;
+
+			/* "ingress" is also the kind: stop generic parsing here. */
+			argc--; argv++;
+			break;
+#endif
+		} else if (strcmp(*argv, "parent") == 0) {
+			__u32 handle;
+
+			NEXT_ARG();
+			if (req.t.tcm_parent)
+				duparg("parent", *argv);
+			if (get_tc_classid(&handle, *argv))
+				invarg(*argv, "invalid parent ID");
+			req.t.tcm_parent = handle;
+		} else if (matches(*argv, "estimator") == 0) {
+			if (parse_estimator(&argc, &argv, &est))
+				return -1;
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			/* Anything else names the qdisc kind. */
+			strncpy(k, *argv, sizeof(k)-1);
+
+			q = get_qdisc_kind(k);
+			argc--; argv++;
+			break;
+		}
+		argc--; argv++;
+	}
+
+	if (k[0])
+		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
+	if (est.ewma_log)
+		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
+
+	if (q) {
+		/* A kind may be known without providing an option parser. */
+		if (q->parse_qopt == NULL) {
+			fprintf(stderr, "qdisc \"%s\" does not support option parsing\n", k);
+			exit(1);
+		}
+		if (q->parse_qopt(q, argc, argv, &req.n))
+			exit(1);
+	} else {
+		if (argc) {
+			if (matches(*argv, "help") == 0)
+				usage();
+
+			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc qdisc help\".\n", *argv);
+			exit(-1);
+		}
+	}
+
+	if (rtnl_open(&rth, 0) < 0) {
+		fprintf(stderr, "Cannot open rtnetlink\n");
+		exit(1);
+	}
+
+	if (d[0]) {
+		int idx;
+
+		ll_init_map(&rth);
+
+		if ((idx = ll_name_to_index(d)) == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n", d);
+			exit(1);
+		}
+		req.t.tcm_ifindex = idx;
+	}
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+		exit(2);
+
+	rtnl_close(&rth);
+	return 0;
+}
+
+/*
+ * Write the generic counters of *st to fp.
+ *
+ * The byte/packet/drop/overlimit line is always printed.  A second line is
+ * started only when at least one of bps, pps, qlen or backlog is non-zero;
+ * it carries a "rate" group and/or a "backlog" group, each listing only its
+ * non-zero members.  No trailing newline is written.
+ */
+void print_tcstats(FILE *fp, struct tc_stats *st)
+{
+	SPRINT_BUF(b1);
+	int have_rate = st->bps || st->pps;
+	int have_backlog = st->qlen || st->backlog;
+
+	fprintf(fp, " Sent %llu bytes %u pkts (dropped %u, overlimits %u) ",
+		(unsigned long long)st->bytes, st->packets, st->drops, st->overlimits);
+
+	if (!have_rate && !have_backlog)
+		return;
+
+	fprintf(fp, "\n ");
+
+	if (have_rate) {
+		fprintf(fp, "rate ");
+		if (st->bps)
+			fprintf(fp, "%s ", sprint_rate(st->bps, b1));
+		if (st->pps)
+			fprintf(fp, "%upps ", st->pps);
+	}
+
+	if (have_backlog) {
+		fprintf(fp, "backlog ");
+		if (st->backlog)
+			fprintf(fp, "%s ", sprint_size(st->backlog, b1));
+		if (st->qlen)
+			fprintf(fp, "%up ", st->qlen);
+	}
+}
+
+/* Interface index chosen with "dev" in tc_qdisc_list(); 0 means show all. */
+static int filter_ifindex;
+
+/*
+ * Print one RTM_NEWQDISC / RTM_DELQDISC message in human readable form.
+ *
+ * who is not used.  arg is the FILE * to print to.  Messages of another
+ * type, and messages for a device other than the one selected through
+ * filter_ifindex, are skipped and 0 is returned.  Returns -1 when the
+ * message is shorter than its tcmsg header or carries no TCA_KIND
+ * attribute, 0 otherwise.
+ */
+int print_qdisc(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct tcmsg *t = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[TCA_MAX+1];
+	struct qdisc_util *q;
+	char abuf[256];
+
+	if (n->nlmsg_type != RTM_NEWQDISC && n->nlmsg_type != RTM_DELQDISC) {
+		fprintf(stderr, "Not a qdisc\n");
+		return 0;
+	}
+	len -= NLMSG_LENGTH(sizeof(*t));
+	if (len < 0) {
+		fprintf(stderr, "Wrong len %d\n", len);
+		return -1;
+	}
+
+	if (filter_ifindex && filter_ifindex != t->tcm_ifindex)
+		return 0;
+
+	memset(tb, 0, sizeof(tb));
+	parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
+
+	if (tb[TCA_KIND] == NULL) {
+		fprintf(stderr, "NULL kind\n");
+		return -1;
+	}
+
+	if (n->nlmsg_type == RTM_DELQDISC)
+		fprintf(fp, "deleted ");
+
+	fprintf(fp, "qdisc %s %x: ", (char*)RTA_DATA(tb[TCA_KIND]), t->tcm_handle>>16);
+	/* The device is only named when the listing is not limited to one. */
+	if (filter_ifindex == 0)
+		fprintf(fp, "dev %s ", ll_index_to_name(t->tcm_ifindex));
+	if (t->tcm_parent == TC_H_ROOT)
+		fprintf(fp, "root ");
+	else if (t->tcm_parent) {
+		print_tc_classid(abuf, sizeof(abuf), t->tcm_parent);
+		fprintf(fp, "parent %s ", abuf);
+	}
+	/* tcm_info is printed as a reference count; it is unsigned, hence %u. */
+	if (t->tcm_info != 1) {
+		fprintf(fp, "refcnt %u ", t->tcm_info);
+	}
+	q = get_qdisc_kind(RTA_DATA(tb[TCA_KIND]));
+	if (tb[TCA_OPTIONS]) {
+		/* Unknown kind, or a kind without an option printer. */
+		if (q && q->print_qopt)
+			q->print_qopt(q, fp, tb[TCA_OPTIONS]);
+		else
+			fprintf(fp, "[cannot parse qdisc parameters]");
+	}
+	fprintf(fp, "\n");
+	if (show_stats) {
+		if (tb[TCA_STATS]) {
+			if (RTA_PAYLOAD(tb[TCA_STATS]) < sizeof(struct tc_stats))
+				fprintf(fp, "statistics truncated");
+			else {
+				struct tc_stats st;
+
+				/* Copy out: the attribute payload may be unaligned. */
+				memcpy(&st, RTA_DATA(tb[TCA_STATS]), sizeof(st));
+				print_tcstats(fp, &st);
+				fprintf(fp, "\n");
+			}
+		}
+		/* Extended statistics are optional for a qdisc kind. */
+		if (q && q->print_xstats && tb[TCA_XSTATS]) {
+			q->print_xstats(q, fp, tb[TCA_XSTATS]);
+			fprintf(fp, "\n");
+		}
+	}
+	fflush(fp);
+	return 0;
+}
+
+
+/*
+ * "tc qdisc show": request a qdisc dump and print every reply on stdout
+ * through print_qdisc().
+ *
+ * Accepted words are "dev NAME", "ingress" (only when TC_H_INGRESS is
+ * defined) and "help".  Naming a device also sets filter_ifindex, which
+ * print_qdisc() uses to drop qdiscs of other devices.
+ *
+ * Returns 0 on success and -1 for an unrecognised word; rtnetlink failures
+ * and an unknown device terminate the process with status 1.
+ */
+int tc_qdisc_list(int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct tcmsg tcm;
+	char devname[16];
+
+	memset(&devname, 0, sizeof(devname));
+	memset(&tcm, 0, sizeof(tcm));
+	tcm.tcm_family = AF_UNSPEC;
+
+	for (; argc > 0; argc--, argv++) {
+		if (!strcmp(*argv, "dev")) {
+			NEXT_ARG();
+			strncpy(devname, *argv, sizeof(devname)-1);
+			continue;
+		}
+#ifdef TC_H_INGRESS
+		if (!strcmp(*argv, "ingress")) {
+			if (tcm.tcm_parent) {
+				fprintf(stderr, "Duplicate parent ID\n");
+				usage();
+			}
+			tcm.tcm_parent = TC_H_INGRESS;
+			continue;
+		}
+#endif
+		/* usage() is declared noreturn, so "help" ends here. */
+		if (!matches(*argv, "help"))
+			usage();
+
+		fprintf(stderr, "What is \"%s\"? Try \"tc qdisc help\".\n", *argv);
+		return -1;
+	}
+
+	if (rtnl_open(&rth, 0) < 0) {
+		fprintf(stderr, "Cannot open rtnetlink\n");
+		exit(1);
+	}
+
+	ll_init_map(&rth);
+
+	if (devname[0]) {
+		tcm.tcm_ifindex = ll_name_to_index(devname);
+		if (tcm.tcm_ifindex == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n", devname);
+			exit(1);
+		}
+		filter_ifindex = tcm.tcm_ifindex;
+	}
+
+	if (rtnl_dump_request(&rth, RTM_GETQDISC, &tcm, sizeof(tcm)) < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	if (rtnl_dump_filter(&rth, print_qdisc, stdout, NULL, NULL) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	rtnl_close(&rth);
+	return 0;
+}
+
+/*
+ * "tc qdisc" entry point: select the sub-command named by argv[0].
+ *
+ * With no arguments the qdiscs are listed.  Otherwise argv[0] is tried
+ * against each keyword with matches(), in the order written below, and the
+ * remaining arguments are handed to the selected handler, whose result is
+ * returned.  A word that selects nothing is reported on stderr and -1 is
+ * returned.
+ */
+int do_qdisc(int argc, char **argv)
+{
+	char *cmd;
+	char **rest;
+	int nrest;
+
+	if (argc < 1)
+		return tc_qdisc_list(0, NULL);
+
+	cmd = *argv;
+	rest = argv + 1;
+	nrest = argc - 1;
+
+	/* Keep this order: it is the order in which keywords are tried. */
+	if (!matches(cmd, "add"))
+		return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_EXCL|NLM_F_CREATE, nrest, rest);
+	if (!matches(cmd, "change"))
+		return tc_qdisc_modify(RTM_NEWQDISC, 0, nrest, rest);
+	if (!matches(cmd, "replace"))
+		return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_CREATE|NLM_F_REPLACE, nrest, rest);
+	if (!matches(cmd, "link"))
+		return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_REPLACE, nrest, rest);
+	if (!matches(cmd, "delete"))
+		return tc_qdisc_modify(RTM_DELQDISC, 0, nrest, rest);
+#if 0
+	if (!matches(cmd, "get"))
+		return tc_qdisc_get(RTM_GETQDISC, 0, nrest, rest);
+#endif
+	if (!matches(cmd, "list") || !matches(cmd, "show") || !matches(cmd, "lst"))
+		return tc_qdisc_list(nrest, rest);
+	if (!matches(cmd, "help"))
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"tc qdisc help\".\n", cmd);
+	return -1;
+}
diff --git a/tc/tc_red.c b/tc/tc_red.c
index e69de29b..385e7af1 100644
--- a/tc/tc_red.c
+++ b/tc/tc_red.c
@@ -0,0 +1,97 @@
+/*
+ * tc_red.c RED maintenance routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <math.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "tc_core.h"
+#include "tc_red.h"
+
+/*
+ * Plog = log(prob/(qmax - qmin))
+ *
+ * Returns the number of times prob/(qmax - qmin) has to be doubled before
+ * it exceeds 1.0.  Returns -1 when qmax - qmin is not positive or when
+ * fewer than 32 doublings are not enough.
+ */
+int tc_red_eval_P(unsigned qmin, unsigned qmax, double prob)
+{
+	int span = qmax - qmin;
+	int plog = 0;
+
+	if (span <= 0)
+		return -1;
+
+	prob /= span;
+
+	while (plog < 32 && !(prob > 1.0)) {
+		prob *= 2;
+		plog++;
+	}
+
+	return plog < 32 ? plog : -1;
+}
+
+/*
+ * burst + 1 - qmin/avpkt < (1-(1-W)^burst)/W
+ *
+ * Tries W = 2^-wlog for wlog = 1..31 and returns the first wlog for which
+ * burst + 1 - qmin/avpkt <= (1-(1-W)^burst)/W holds.  Returns -1 when the
+ * left-hand side is below 1.0 or when no wlog in that range qualifies.
+ */
+
+int tc_red_eval_ewma(unsigned qmin, unsigned burst, unsigned avpkt)
+{
+	double target = (double)burst + 1 - (double)qmin/avpkt;
+	double W = 0.5;
+	int wlog = 1;
+
+	if (target < 1.0)
+		return -1;
+
+	while (wlog < 32) {
+		if (target <= (1 - pow(1-W, burst))/W)
+			return wlog;
+		wlog++;
+		W /= 2;
+	}
+	return -1;
+}
+
+/*
+ * Stab[t>>Scell_log] = -log(1-W) * t/xmit_time
+ *
+ * Fills the 256-entry table sbuf and returns the cell log (the shift applied
+ * to the table index), or -1 when no shift below 32 brings maxtime/2^clog
+ * under 512.  W is 2^-Wlog and xmit_time is the tick count that
+ * tc_core_usec2tick() returns for sending avpkt bytes at bps bytes per
+ * second.  Entry 0 is 0, entry 255 is 31, and every entry in between is
+ * capped at 31.
+ */
+
+int tc_red_eval_idle_damping(int Wlog, unsigned avpkt, unsigned bps, __u8 *sbuf)
+{
+	double xmit_time = tc_core_usec2tick(1000000*(double)avpkt/bps);
+	double lW = -log(1.0 - 1.0/(1<<Wlog))/xmit_time;
+	double maxtime = 31/lW;
+	double cell;
+	int clog;
+	int i;
+
+	/* Unsigned shift: 1<<31 would overflow a signed int. */
+	for (clog=0; clog<32; clog++) {
+		if (maxtime/(1U<<clog) < 512)
+			break;
+	}
+	if (clog >= 32)
+		return -1;
+
+	cell = (double)(1U<<clog);
+
+	sbuf[0] = 0;
+	for (i=1; i<255; i++) {
+		double v = i*cell*lW;
+
+		/*
+		 * Clamp before narrowing: converting a double that does not
+		 * fit into __u8 is undefined, so the cap cannot be applied
+		 * to the stored byte afterwards.
+		 */
+		sbuf[i] = (v < 31) ? (__u8)v : 31;
+	}
+	sbuf[255] = 31;
+	return clog;
+}
diff --git a/tc/tc_red.h b/tc/tc_red.h
index e69de29b..6f6b09e3 100644
--- a/tc/tc_red.h
+++ b/tc/tc_red.h
@@ -0,0 +1,8 @@
+#ifndef _TC_RED_H_
+#define _TC_RED_H_ 1
+
+/*
+ * Helpers implemented in tc_red.c.  Each returns -1 when no value in its
+ * search range (below 32) satisfies the condition it evaluates.
+ */
+
+/* Number of doublings of prob/(qmax - qmin) needed to exceed 1.0. */
+extern int tc_red_eval_P(unsigned qmin, unsigned qmax, double prob);
+/* First wlog in 1..31 with burst + 1 - qmin/avpkt <= (1-(1-W)^burst)/W, W = 2^-wlog. */
+extern int tc_red_eval_ewma(unsigned qmin, unsigned burst, unsigned avpkt);
+/* Fills sbuf[0..255] (entries capped at 31) and returns the cell log used for it. */
+extern int tc_red_eval_idle_damping(int wlog, unsigned avpkt, unsigned bandwidth, __u8 *sbuf);
+
+#endif
diff --git a/tc/tc_util.c b/tc/tc_util.c
index e69de29b..d1355391 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -0,0 +1,313 @@
+/*
+ * tc_util.c Misc TC utility functions.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <math.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+/*
+ * Parse a qdisc handle: "none" or a hexadecimal major number, optionally
+ * followed by ':'.  Anything after the colon is not examined.
+ *
+ * On success stores the major number shifted into the upper 16 bits of *h
+ * (TC_H_UNSPEC for "none") and returns 0.  Returns -1, leaving *h
+ * untouched, when no hex digits are found, when the number is followed by
+ * something other than ':' or the end of the string, or when it does not
+ * fit into 16 bits.
+ */
+int get_qdisc_handle(__u32 *h, char *str)
+{
+	unsigned long maj;
+	char *p;
+
+	if (strcmp(str, "none") == 0) {
+		*h = TC_H_UNSPEC;
+		return 0;
+	}
+
+	maj = strtoul(str, &p, 16);
+	if (p == str)
+		return -1;
+	if (*p != ':' && *p != 0)
+		return -1;
+	/* A larger major would be shifted out of the 32-bit handle. */
+	if (maj > 0xFFFF)
+		return -1;
+
+	*h = (__u32)maj << 16;
+	return 0;
+}
+
+/*
+ * Parse a class id.  Accepted forms are "root", "none", "MAJ:MIN", "MAJ:",
+ * ":MIN" and a bare hexadecimal number that is taken as the whole 32-bit
+ * id.  MAJ and MIN are hexadecimal.
+ *
+ * On success stores the id in *h and returns 0.  Returns -1, leaving *h
+ * untouched, on malformed input, when MAJ or MIN does not fit into 16 bits,
+ * or when a bare number does not fit into 32 bits.
+ */
+int get_tc_classid(__u32 *h, char *str)
+{
+	unsigned long maj, min;
+	char *p;
+
+	if (strcmp(str, "root") == 0) {
+		*h = TC_H_ROOT;
+		return 0;
+	}
+	if (strcmp(str, "none") == 0) {
+		*h = TC_H_UNSPEC;
+		return 0;
+	}
+
+	maj = strtoul(str, &p, 16);
+	if (p == str) {
+		/* No major given: only the ":MIN" form is acceptable. */
+		maj = 0;
+		if (*p != ':')
+			return -1;
+	}
+
+	if (*p == ':') {
+		char *minstr = p + 1;
+
+		/* Each half owns 16 bits; larger values would corrupt the other. */
+		if (maj > 0xFFFF)
+			return -1;
+		min = strtoul(minstr, &p, 16);
+		if (*p != 0)
+			return -1;
+		if (min > 0xFFFF)
+			return -1;
+		*h = ((__u32)maj << 16) | (__u32)min;
+		return 0;
+	}
+
+	if (*p != 0)
+		return -1;
+	if (maj > 0xFFFFFFFFUL)
+		return -1;
+
+	*h = (__u32)maj;
+	return 0;
+}
+
+/*
+ * Format class id h into buf, writing at most len bytes: "root", "none",
+ * ":MIN", "MAJ:" or "MAJ:MIN" with both parts in hexadecimal.  Always
+ * returns 0.
+ */
+int print_tc_classid(char *buf, int len, __u32 h)
+{
+	/* Every branch is bounded by len, including "root". */
+	if (h == TC_H_ROOT)
+		snprintf(buf, len, "root");
+	else if (h == TC_H_UNSPEC)
+		snprintf(buf, len, "none");
+	else if (TC_H_MAJ(h) == 0)
+		snprintf(buf, len, ":%x", TC_H_MIN(h));
+	else if (TC_H_MIN(h) == 0)
+		snprintf(buf, len, "%x:", TC_H_MAJ(h)>>16);
+	else
+		snprintf(buf, len, "%x:%x", TC_H_MAJ(h)>>16, TC_H_MIN(h));
+	return 0;
+}
+
+/*
+ * Format class id h into buf with print_tc_classid(), limited to
+ * SPRINT_BSIZE-1 bytes, and return buf.  "???" is stored instead when
+ * print_tc_classid() reports failure.
+ */
+char * sprint_tc_classid(__u32 h, char *buf)
+{
+	int failed = print_tc_classid(buf, SPRINT_BSIZE-1, h);
+
+	if (failed)
+		strcpy(buf, "???");
+	return buf;
+}
+
+
+/*
+ * Parse a rate into bytes per second.
+ *
+ * A bare number is divided by 8.  Otherwise the number must be followed by
+ * one of these suffixes, compared without regard to case: "bps" (value used
+ * as is), "kbps" (x1024), "mbps" (x1024*1024), "kbit" (x1024/8) or "mbit"
+ * (x1024*1024/8).
+ *
+ * Returns 0 and stores the truncated result in *rate.  Returns -1, leaving
+ * *rate untouched, when there is no leading number, the suffix is unknown,
+ * or the result is negative, not a number, or too large for an unsigned.
+ */
+int get_rate(unsigned *rate, char *str)
+{
+	char *p;
+	double bps = strtod(str, &p);
+
+	if (p == str)
+		return -1;
+
+	if (*p == 0)
+		bps /= 8;
+	else if (strcasecmp(p, "kbps") == 0)
+		bps *= 1024;
+	else if (strcasecmp(p, "mbps") == 0)
+		bps *= 1024*1024;
+	else if (strcasecmp(p, "mbit") == 0)
+		bps *= 1024*1024/8;
+	else if (strcasecmp(p, "kbit") == 0)
+		bps *= 1024/8;
+	else if (strcasecmp(p, "bps") != 0)
+		return -1;
+
+	/*
+	 * Converting an out-of-range double to unsigned is undefined, so
+	 * refuse such values.  Written this way a NaN is refused as well.
+	 */
+	if (!(bps >= 0 && bps <= (double)~0U))
+		return -1;
+
+	*rate = bps;
+	return 0;
+}
+
+/*
+ * Parse "RATE" or "RATE/CELL".
+ *
+ * RATE is parsed by get_rate() into *rate.  CELL, when present, is parsed
+ * by get_integer() and must be a positive power of two; its base-2
+ * logarithm is stored in *cell_log.  Without "/CELL", *cell_log is left
+ * untouched.
+ *
+ * str is cut at the '/' while RATE is parsed and is restored before
+ * returning, whether parsing succeeds or not.  Returns 0 on success and -1
+ * on any parse error.
+ */
+int get_rate_and_cell(unsigned *rate, int *cell_log, char *str)
+{
+	char * slash = strchr(str, '/');
+	int cell;
+	int err;
+	int i;
+
+	if (slash == NULL)
+		return get_rate(rate, str) ? -1 : 0;
+
+	*slash = 0;
+	err = get_rate(rate, str);
+	/* Give the caller its string back on the error path too. */
+	*slash = '/';
+	if (err)
+		return -1;
+
+	if (get_integer(&cell, slash+1, 0))
+		return -1;
+	if (cell <= 0)
+		return -1;
+
+	/* Stop at bit 30: 1<<31 does not fit into an int. */
+	for (i=0; i<31; i++) {
+		if ((1<<i) == cell) {
+			*cell_log = i;
+			return 0;
+		}
+	}
+	return -1;
+}
+
+
+/*
+ * Format rate (bytes per second) into buf, writing at most len bytes.
+ *
+ * The value is converted to bits per second.  It is printed as "<n>Mbit"
+ * when it is at least 1024*1023 and within 1024 of a whole multiple of
+ * 1024*1024, else as "<n>Kbit" when it is at least 1024-16 and within 16 of
+ * a whole multiple of 1024, else as the original byte count followed by
+ * "bps".  Always returns 0.
+ */
+int print_rate(char *buf, int len, __u32 rate)
+{
+	double bits = (double)rate*8;
+	double mbits = rint(bits/(1024*1024));
+	double kbits = rint(bits/1024);
+
+	if (bits >= 1024*1023 && fabs(1024*1024*mbits - bits) < 1024) {
+		snprintf(buf, len, "%gMbit", mbits);
+		return 0;
+	}
+	if (bits >= 1024-16 && fabs(1024*kbits - bits) < 16) {
+		snprintf(buf, len, "%gKbit", kbits);
+		return 0;
+	}
+	snprintf(buf, len, "%ubps", rate);
+	return 0;
+}
+
+/*
+ * Format rate into buf with print_rate(), limited to SPRINT_BSIZE-1 bytes,
+ * and return buf.  "???" is stored instead when print_rate() reports
+ * failure.
+ */
+char * sprint_rate(__u32 rate, char *buf)
+{
+	int failed = print_rate(buf, SPRINT_BSIZE-1, rate);
+
+	if (failed)
+		strcpy(buf, "???");
+	return buf;
+}
+
+/*
+ * Parse a time value into microseconds.
+ *
+ * A bare number is taken as microseconds.  Otherwise the number must be
+ * followed by one of these suffixes, compared without regard to case:
+ * "s"/"sec"/"secs" (x1000000), "ms"/"msec"/"msecs" (x1000) or
+ * "us"/"usec"/"usecs" (x1).
+ *
+ * Returns 0 and stores the truncated result in *usecs.  Returns -1, leaving
+ * *usecs untouched, when there is no leading number, the suffix is unknown,
+ * or the result is negative, not a number, or too large for an unsigned.
+ */
+int get_usecs(unsigned *usecs, char *str)
+{
+	double t;
+	char *p;
+
+	t = strtod(str, &p);
+	if (p == str)
+		return -1;
+
+	if (*p) {
+		if (strcasecmp(p, "s") == 0 || strcasecmp(p, "sec")==0 ||
+		    strcasecmp(p, "secs")==0)
+			t *= 1000000;
+		else if (strcasecmp(p, "ms") == 0 || strcasecmp(p, "msec")==0 ||
+			 strcasecmp(p, "msecs") == 0)
+			t *= 1000;
+		else if (strcasecmp(p, "us") == 0 || strcasecmp(p, "usec")==0 ||
+			 strcasecmp(p, "usecs") == 0)
+			t *= 1;
+		else
+			return -1;
+	}
+
+	/*
+	 * Converting an out-of-range double to unsigned is undefined, so
+	 * refuse such values.  Written this way a NaN is refused as well.
+	 */
+	if (!(t >= 0 && t <= (double)~0U))
+		return -1;
+
+	*usecs = t;
+	return 0;
+}
+
+
+/*
+ * Format usec (microseconds) into buf, writing at most len bytes: seconds
+ * with one decimal from 1000000 up, milliseconds with one decimal from
+ * 1000 up, whole microseconds below that.  Always returns 0.
+ */
+int print_usecs(char *buf, int len, __u32 usec)
+{
+	double t = usec;
+
+	if (t >= 1000000) {
+		snprintf(buf, len, "%.1fs", t/1000000);
+		return 0;
+	}
+	if (t >= 1000) {
+		snprintf(buf, len, "%.1fms", t/1000);
+		return 0;
+	}
+	snprintf(buf, len, "%uus", usec);
+	return 0;
+}
+
+/*
+ * Format usecs into buf with print_usecs(), limited to SPRINT_BSIZE-1
+ * bytes, and return buf.  "???" is stored instead when print_usecs()
+ * reports failure.
+ */
+char * sprint_usecs(__u32 usecs, char *buf)
+{
+	int failed = print_usecs(buf, SPRINT_BSIZE-1, usecs);
+
+	if (failed)
+		strcpy(buf, "???");
+	return buf;
+}
+
+/*
+ * Parse a size into bytes.
+ *
+ * A bare number, or a number followed by "b", is used as is.  Other
+ * suffixes, compared without regard to case: "k"/"kb" (x1024), "m"/"mb"
+ * (x1024*1024), "kbit" (x1024/8) and "mbit" (x1024*1024/8).
+ *
+ * Returns 0 and stores the truncated result in *size.  Returns -1, leaving
+ * *size untouched, when there is no leading number, the suffix is unknown,
+ * or the result is negative, not a number, or too large for an unsigned.
+ */
+int get_size(unsigned *size, char *str)
+{
+	double sz;
+	char *p;
+
+	sz = strtod(str, &p);
+	if (p == str)
+		return -1;
+
+	if (*p) {
+		if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k")==0)
+			sz *= 1024;
+		else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m")==0)
+			sz *= 1024*1024;
+		else if (strcasecmp(p, "mbit") == 0)
+			sz *= 1024*1024/8;
+		else if (strcasecmp(p, "kbit") == 0)
+			sz *= 1024/8;
+		else if (strcasecmp(p, "b") != 0)
+			return -1;
+	}
+
+	/*
+	 * Converting an out-of-range double to unsigned is undefined, so
+	 * refuse such values.  Written this way a NaN is refused as well.
+	 */
+	if (!(sz >= 0 && sz <= (double)~0U))
+		return -1;
+
+	*size = sz;
+	return 0;
+}
+
+/*
+ * Parse "SIZE" or "SIZE/CELL".
+ *
+ * SIZE is parsed by get_size() into *size.  CELL, when present, is parsed
+ * by get_integer() and must be a positive power of two; its base-2
+ * logarithm is stored in *cell_log.  Without "/CELL", *cell_log is left
+ * untouched.
+ *
+ * str is cut at the '/' while SIZE is parsed and is restored before
+ * returning, whether parsing succeeds or not.  Returns 0 on success and -1
+ * on any parse error.
+ */
+int get_size_and_cell(unsigned *size, int *cell_log, char *str)
+{
+	char * slash = strchr(str, '/');
+	int cell;
+	int err;
+	int i;
+
+	if (slash == NULL)
+		return get_size(size, str) ? -1 : 0;
+
+	*slash = 0;
+	err = get_size(size, str);
+	/* Give the caller its string back on the error path too. */
+	*slash = '/';
+	if (err)
+		return -1;
+
+	if (get_integer(&cell, slash+1, 0))
+		return -1;
+	if (cell <= 0)
+		return -1;
+
+	/* Stop at bit 30: 1<<31 does not fit into an int. */
+	for (i=0; i<31; i++) {
+		if ((1<<i) == cell) {
+			*cell_log = i;
+			return 0;
+		}
+	}
+	return -1;
+}
+
+/*
+ * Format sz (bytes) into buf, writing at most len bytes.
+ *
+ * Printed as "<n>Mb" when sz is at least 1024*1024 and within 1024 of a
+ * whole multiple of 1024*1024, else as "<n>Kb" when it is at least 1024 and
+ * within 16 of a whole multiple of 1024, else as the byte count followed by
+ * "b".  Always returns 0.
+ */
+int print_size(char *buf, int len, __u32 sz)
+{
+	double bytes = sz;
+	double mb = rint(bytes/(1024*1024));
+	double kb = rint(bytes/1024);
+
+	if (sz >= 1024*1024 && fabs(1024*1024*mb - sz) < 1024) {
+		snprintf(buf, len, "%gMb", mb);
+		return 0;
+	}
+	if (sz >= 1024 && fabs(1024*kb - sz) < 16) {
+		snprintf(buf, len, "%gKb", kb);
+		return 0;
+	}
+	snprintf(buf, len, "%ub", sz);
+	return 0;
+}
+
+/*
+ * Format size into buf with print_size(), limited to SPRINT_BSIZE-1 bytes,
+ * and return buf.  "???" is stored instead when print_size() reports
+ * failure.
+ */
+char * sprint_size(__u32 size, char *buf)
+{
+	int failed = print_size(buf, SPRINT_BSIZE-1, size);
+
+	if (failed)
+		strcpy(buf, "???");
+	return buf;
+}
+
+/*
+ * Format the major part of handle h into buf as "MAJ:" (hexadecimal),
+ * writing at most len bytes.  Always returns 0.
+ */
+int print_qdisc_handle(char *buf, int len, __u32 h)
+{
+	__u32 major = TC_H_MAJ(h)>>16;
+
+	snprintf(buf, len, "%x:", major);
+	return 0;
+}
+
+/*
+ * Format handle h into buf with print_qdisc_handle(), limited to
+ * SPRINT_BSIZE-1 bytes, and return buf.  "???" is stored instead when
+ * print_qdisc_handle() reports failure.
+ */
+char * sprint_qdisc_handle(__u32 h, char *buf)
+{
+	int failed = print_qdisc_handle(buf, SPRINT_BSIZE-1, h);
+
+	if (failed)
+		strcpy(buf, "???");
+	return buf;
+}
+
+
diff --git a/tc/tc_util.h b/tc/tc_util.h
index e69de29b..bdc88d1f 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -0,0 +1,57 @@
+#ifndef _TC_UTIL_H_
+#define _TC_UTIL_H_ 1
+
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include "tc_core.h"
+
+/*
+ * Per-kind qdisc support, as returned by get_qdisc_kind().
+ * The callbacks receive the qdisc_util they belong to as first argument.
+ */
+struct qdisc_util
+{
+	/* presumably links the registered kinds into a list - TODO confirm */
+ struct qdisc_util *next;
+	/* presumably the kind name get_qdisc_kind() looks up - TODO confirm */
+ char id[16];
+	/* Parses the kind-specific arguments into request n; non-zero means failure. */
+ int (*parse_qopt)(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n);
+	/* Prints a TCA_OPTIONS attribute to f. */
+ int (*print_qopt)(struct qdisc_util *qu, FILE *f, struct rtattr *opt);
+	/* Prints a TCA_XSTATS attribute to f. */
+ int (*print_xstats)(struct qdisc_util *qu, FILE *f, struct rtattr *xstats);
+
+	/* presumably the class-level counterparts of parse_qopt/print_qopt - TODO confirm */
+ int (*parse_copt)(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n);
+ int (*print_copt)(struct qdisc_util *qu, FILE *f, struct rtattr *opt);
+};
+
+/*
+ * Per-kind filter support, as returned by get_filter_kind().
+ * The callbacks receive the filter_util they belong to as first argument.
+ */
+struct filter_util
+{
+	/* presumably links the registered kinds into a list - TODO confirm */
+ struct filter_util *next;
+	/* presumably the kind name get_filter_kind() looks up - TODO confirm */
+ char id[16];
+	/* NOTE(review): fhandle looks like the textual filter handle from the command line - confirm against tc_filter.c */
+ int (*parse_fopt)(struct filter_util *qu, char *fhandle, int argc, char **argv, struct nlmsghdr *n);
+	/* NOTE(review): looks like the printing counterpart, given the numeric handle - confirm */
+ int (*print_fopt)(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 fhandle);
+};
+
+
+/* Look up the support routines for a qdisc / filter kind name. */
+extern struct qdisc_util *get_qdisc_kind(char *str);
+extern struct filter_util *get_filter_kind(char *str);
+
+/* Command-line parsers from tc_util.c: 0 on success, -1 on a parse error. */
+extern int get_qdisc_handle(__u32 *h, char *str);
+extern int get_rate(unsigned *rate, char *str);
+extern int get_rate_and_cell(unsigned *rate, int *cell_log, char *str);
+extern int get_size(unsigned *size, char *str);
+extern int get_size_and_cell(unsigned *size, int *cell_log, char *str);
+extern int get_usecs(unsigned *usecs, char *str);
+
+/* Formatters from tc_util.c that write at most len bytes into buf. */
+extern int print_rate(char *buf, int len, __u32 rate);
+extern int print_size(char *buf, int len, __u32 size);
+extern int print_qdisc_handle(char *buf, int len, __u32 h);
+extern int print_usecs(char *buf, int len, __u32 usecs);
+
+/* Formatters from tc_util.c that fill buf (SPRINT_BSIZE bytes) and return it. */
+extern char * sprint_rate(__u32 rate, char *buf);
+extern char * sprint_size(__u32 size, char *buf);
+extern char * sprint_qdisc_handle(__u32 h, char *buf);
+extern char * sprint_tc_classid(__u32 h, char *buf);
+extern char * sprint_usecs(__u32 usecs, char *buf);
+
+extern void print_tcstats(FILE *fp, struct tc_stats *st);
+
+extern int get_tc_classid(__u32 *h, char *str);
+extern int print_tc_classid(char *buf, int len, __u32 h);
+
+extern int tc_print_police(FILE *f, struct rtattr *tb);
+extern int parse_police(int *, char ***, int, struct nlmsghdr *);
+
+
+#endif