diff options
author | Stephen Hemminger <sthemmin@microsoft.com> | 2017-10-18 17:11:50 -0700 |
---|---|---|
committer | Stephen Hemminger <sthemmin@microsoft.com> | 2017-10-18 17:11:50 -0700 |
commit | 4b4dde0ae6bed45192a1056c57ef87dc2a31d2c1 (patch) | |
tree | dbf39684e8d0fb942dc7350b172568778f1ed097 | |
parent | 70556c1632e6fdbc8489bf3c59e0588ece2e2f37 (diff) | |
parent | 4b73d52f8a81919f511cd47d39251f74f6a37c7d (diff) | |
download | platform_external_iproute2-4b4dde0ae6bed45192a1056c57ef87dc2a31d2c1.tar.gz platform_external_iproute2-4b4dde0ae6bed45192a1056c57ef87dc2a31d2c1.tar.bz2 platform_external_iproute2-4b4dde0ae6bed45192a1056c57ef87dc2a31d2c1.zip |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Makefile | 6 | ||||
-rwxr-xr-x | configure | 24 | ||||
-rw-r--r-- | doc/Makefile | 73 | ||||
-rw-r--r-- | doc/Plan | 16 | ||||
-rw-r--r-- | doc/SNAPSHOT.tex | 1 | ||||
-rw-r--r-- | doc/api-ip6-flowlabels.tex | 429 | ||||
-rw-r--r-- | doc/arpd.sgml | 130 | ||||
-rw-r--r-- | doc/do-psnup | 16 | ||||
-rw-r--r-- | doc/ip-cref.tex | 3453 | ||||
-rw-r--r-- | doc/ip-tunnels.tex | 469 | ||||
-rw-r--r-- | doc/nstat.sgml | 110 | ||||
-rw-r--r-- | doc/preamble.tex | 26 | ||||
-rw-r--r-- | doc/rtstat.sgml | 52 | ||||
-rw-r--r-- | doc/ss.sgml | 525 | ||||
-rw-r--r-- | doc/tc-filters.tex | 514 | ||||
-rw-r--r-- | include/color.h | 3 | ||||
-rw-r--r-- | include/json_print.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/atm.h (renamed from include/linux/atm.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/atmapi.h (renamed from include/linux/atmapi.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/atmarp.h (renamed from include/linux/atmarp.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/atmdev.h (renamed from include/linux/atmdev.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/atmioc.h (renamed from include/linux/atmioc.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/atmsap.h (renamed from include/linux/atmsap.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h (renamed from include/linux/bpf.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/bpf_common.h (renamed from include/linux/bpf_common.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/can.h (renamed from include/linux/can.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/can/netlink.h (renamed from include/linux/can/netlink.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/can/vxcan.h (renamed from include/linux/can/vxcan.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/devlink.h (renamed from include/linux/devlink.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/elf-em.h (renamed from include/linux/elf-em.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/fib_rules.h (renamed from include/linux/fib_rules.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/filter.h (renamed from include/linux/filter.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/fou.h (renamed from include/linux/fou.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/gen_stats.h (renamed from include/linux/gen_stats.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/genetlink.h (renamed from include/linux/genetlink.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/hdlc/ioctl.h (renamed from include/linux/hdlc/ioctl.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/icmpv6.h (renamed from include/linux/icmpv6.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if.h (renamed from include/linux/if.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_addr.h (renamed from include/linux/if_addr.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_addrlabel.h (renamed from include/linux/if_addrlabel.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_alg.h (renamed from include/linux/if_alg.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_arp.h (renamed from include/linux/if_arp.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_bonding.h (renamed from include/linux/if_bonding.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_bridge.h (renamed from include/linux/if_bridge.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_ether.h (renamed from include/linux/if_ether.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_link.h (renamed from include/linux/if_link.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_macsec.h (renamed from include/linux/if_macsec.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_packet.h (renamed from include/linux/if_packet.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_tun.h (renamed from include/linux/if_tun.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_tunnel.h (renamed from include/linux/if_tunnel.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/if_vlan.h (renamed from include/linux/if_vlan.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/ife.h (renamed from include/linux/ife.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/ila.h (renamed from include/linux/ila.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/in.h (renamed from include/linux/in.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/in6.h (renamed from include/linux/in6.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/in_route.h (renamed from include/linux/in_route.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/inet_diag.h (renamed from include/linux/inet_diag.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/ip.h (renamed from include/linux/ip.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/ip6_tunnel.h (renamed from include/linux/ip6_tunnel.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/ipsec.h (renamed from include/linux/ipsec.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/kernel.h (renamed from include/linux/kernel.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/l2tp.h (renamed from include/linux/l2tp.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/libc-compat.h (renamed from include/linux/libc-compat.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/limits.h (renamed from include/linux/limits.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/lwtunnel.h (renamed from include/linux/lwtunnel.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/magic.h (renamed from include/linux/magic.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/mpls.h (renamed from include/linux/mpls.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/mpls_iptunnel.h (renamed from include/linux/mpls_iptunnel.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/neighbour.h (renamed from include/linux/neighbour.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/net_namespace.h (renamed from include/linux/net_namespace.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netconf.h (renamed from include/linux/netconf.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netdevice.h (renamed from include/linux/netdevice.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter.h (renamed from include/linux/netfilter.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter/ipset/ip_set.h (renamed from include/linux/netfilter/ipset/ip_set.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter/x_tables.h (renamed from include/linux/netfilter/x_tables.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter/xt_set.h (renamed from include/linux/netfilter/xt_set.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter/xt_tcpudp.h (renamed from include/linux/netfilter/xt_tcpudp.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter_ipv4.h (renamed from include/linux/netfilter_ipv4.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter_ipv4/ip_tables.h (renamed from include/linux/netfilter_ipv4/ip_tables.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter_ipv6.h (renamed from include/linux/netfilter_ipv6.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netfilter_ipv6/ip6_tables.h (renamed from include/linux/netfilter_ipv6/ip6_tables.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netlink.h (renamed from include/linux/netlink.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/netlink_diag.h (renamed from include/linux/netlink_diag.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/packet_diag.h (renamed from include/linux/packet_diag.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/param.h (renamed from include/linux/param.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/pfkeyv2.h (renamed from include/linux/pfkeyv2.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/pkt_cls.h (renamed from include/linux/pkt_cls.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/pkt_sched.h (renamed from include/linux/pkt_sched.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/posix_types.h (renamed from include/linux/posix_types.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/rtnetlink.h (renamed from include/linux/rtnetlink.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/sctp.h (renamed from include/linux/sctp.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/seg6.h (renamed from include/linux/seg6.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/seg6_genl.h (renamed from include/linux/seg6_genl.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/seg6_hmac.h (renamed from include/linux/seg6_hmac.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/seg6_iptunnel.h (renamed from include/linux/seg6_iptunnel.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/seg6_local.h (renamed from include/linux/seg6_local.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/sock_diag.h (renamed from include/linux/sock_diag.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/socket.h (renamed from include/linux/socket.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/sockios.h (renamed from include/linux/sockios.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/stddef.h (renamed from include/linux/stddef.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/sysinfo.h (renamed from include/linux/sysinfo.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_bpf.h (renamed from include/linux/tc_act/tc_bpf.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_connmark.h (renamed from include/linux/tc_act/tc_connmark.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_csum.h (renamed from include/linux/tc_act/tc_csum.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_defact.h (renamed from include/linux/tc_act/tc_defact.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_gact.h (renamed from include/linux/tc_act/tc_gact.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_ife.h (renamed from include/linux/tc_act/tc_ife.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_ipt.h (renamed from include/linux/tc_act/tc_ipt.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_mirred.h (renamed from include/linux/tc_act/tc_mirred.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_nat.h (renamed from include/linux/tc_act/tc_nat.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_pedit.h (renamed from include/linux/tc_act/tc_pedit.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_sample.h (renamed from include/linux/tc_act/tc_sample.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_skbedit.h (renamed from include/linux/tc_act/tc_skbedit.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_skbmod.h (renamed from include/linux/tc_act/tc_skbmod.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_tunnel_key.h (renamed from include/linux/tc_act/tc_tunnel_key.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_act/tc_vlan.h (renamed from include/linux/tc_act/tc_vlan.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_ematch/tc_em_cmp.h (renamed from include/linux/tc_ematch/tc_em_cmp.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_ematch/tc_em_meta.h (renamed from include/linux/tc_ematch/tc_em_meta.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tc_ematch/tc_em_nbyte.h (renamed from include/linux/tc_ematch/tc_em_nbyte.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tcp.h (renamed from include/linux/tcp.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tcp_metrics.h (renamed from include/linux/tcp_metrics.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tipc.h (renamed from include/linux/tipc.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/tipc_netlink.h (renamed from include/linux/tipc_netlink.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/types.h (renamed from include/linux/types.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/unix_diag.h (renamed from include/linux/unix_diag.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/veth.h (renamed from include/linux/veth.h) | 0 | ||||
-rw-r--r-- | include/uapi/linux/xfrm.h (renamed from include/linux/xfrm.h) | 0 | ||||
-rw-r--r-- | include/uapi/rdma/rdma_netlink.h (renamed from include/rdma/rdma_netlink.h) | 6 | ||||
-rw-r--r-- | include/utils.h | 4 | ||||
-rw-r--r-- | ip/ip6tunnel.c | 9 | ||||
-rw-r--r-- | ip/ipl2tp.c | 4 | ||||
-rw-r--r-- | ip/iplink.c | 31 | ||||
-rw-r--r-- | ip/ipmaddr.c | 3 | ||||
-rw-r--r-- | ip/iproute.c | 4 | ||||
-rw-r--r-- | ip/iprule.c | 10 | ||||
-rw-r--r-- | ip/iptunnel.c | 29 | ||||
-rw-r--r-- | ip/iptuntap.c | 6 | ||||
-rw-r--r-- | ip/xfrm_state.c | 2 | ||||
-rw-r--r-- | lib/color.c | 17 | ||||
-rw-r--r-- | lib/utils.c | 46 | ||||
-rw-r--r-- | misc/arpd.c | 3 | ||||
-rw-r--r-- | misc/ss.c | 21 | ||||
-rw-r--r-- | tc/f_flower.c | 7 | ||||
-rw-r--r-- | tc/f_u32.c | 3 | ||||
-rw-r--r-- | tc/q_netem.c | 12 | ||||
-rwxr-xr-x | testsuite/tests/ip/link/new_link.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/ip/link/show_dev_wo_vf_rate.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/ip/netns/set_nsid.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/ip/netns/set_nsid_batch.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/ip/route/add_default_route.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/ip/tunnel/add_tunnel.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/tc/cls-testbed.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/tc/dsmark.t | 2 | ||||
-rwxr-xr-x | testsuite/tests/tc/pedit.t | 2 | ||||
-rw-r--r-- | tipc/Makefile | 2 |
156 files changed, 188 insertions, 5900 deletions
@@ -1,3 +1,5 @@ +# locally generated +Config static-syms.h config.* *.o @@ -46,7 +46,7 @@ CCOPTS = -O2 WFLAGS := -Wall -Wstrict-prototypes -Wmissing-prototypes WFLAGS += -Wmissing-declarations -Wold-style-definition -Wformat=2 -CFLAGS := $(WFLAGS) $(CCOPTS) -I../include $(DEFINES) $(CFLAGS) +CFLAGS := $(WFLAGS) $(CCOPTS) -I../include -I../include/uapi $(DEFINES) $(CFLAGS) YACCFLAGS = -d -t -v SUBDIRS=lib ip tc bridge misc netem genl tipc devlink rdma man @@ -73,7 +73,7 @@ install: all $(DESTDIR)$(DOCDIR)/examples install -m 0644 $(shell find examples/diffserv -maxdepth 1 -type f) \ $(DESTDIR)$(DOCDIR)/examples/diffserv - @for i in $(SUBDIRS) doc; do $(MAKE) -C $$i install; done + @for i in $(SUBDIRS); do $(MAKE) -C $$i install; done install -m 0644 $(shell find etc/iproute2 -maxdepth 1 -type f) $(DESTDIR)$(CONFDIR) install -m 0755 -d $(DESTDIR)$(BASH_COMPDIR) install -m 0644 bash-completion/tc $(DESTDIR)$(BASH_COMPDIR) @@ -84,7 +84,7 @@ snapshot: > include/SNAPSHOT.h clean: - @for i in $(SUBDIRS) doc; \ + @for i in $(SUBDIRS); \ do $(MAKE) $(MFLAGS) -C $$i clean; done clobber: @@ -326,6 +326,27 @@ EOF rm -f $TMPDIR/dbtest.c $TMPDIR/dbtest } +check_strlcpy() +{ + cat >$TMPDIR/strtest.c <<EOF +#include <string.h> +int main(int argc, char **argv) { + char dst[10]; + strlcpy(dst, "test", sizeof(dst)); + return 0; +} +EOF + $CC -I$INCLUDE -o $TMPDIR/strtest $TMPDIR/strtest.c >/dev/null 2>&1 + if [ $? 
-eq 0 ] + then + echo "no" + else + echo 'CFLAGS += -DNEED_STRLCPY' >>$CONFIG + echo "yes" + fi + rm -f $TMPDIR/strtest.c $TMPDIR/strtest +} + quiet_config() { cat <<EOF @@ -397,6 +418,9 @@ check_mnl echo -n "Berkeley DB: " check_berkeley_db +echo -n "need for strlcpy: " +check_strlcpy + echo echo -n "docs:" check_docs diff --git a/doc/Makefile b/doc/Makefile deleted file mode 100644 index 0c51872a..00000000 --- a/doc/Makefile +++ /dev/null @@ -1,73 +0,0 @@ -PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps tc-filters.ps -# tc-cref.ps -# api-rtnl.tex api-pmtudisc.tex api-news.tex -# iki-netdev.ps iki-neighdst.ps - - -LATEX=latex -DVIPS=dvips -SGML2DVI=sgml2latex -SGML2HTML=sgml2html -s 0 -LPR=lpr -Zsduplex -SHELL=bash -PAGESIZE=a4 -PAGESPERPAGE=2 - -HTMLFILES=$(subst .sgml,.html,$(shell echo *.sgml)) -DVIFILES=$(subst .ps,.dvi,$(PSFILES)) -PDFFILES=$(subst .ps,.pdf,$(PSFILES)) - - -all: pstwocol - -pstwocol: $(PSFILES) - -html: $(HTMLFILES) - -dvi: $(DVIFILES) - -pdf: $(PDFFILES) - -print: $(PSFILES) - $(LPR) $(PSFILES) - -%.tex: %.sgml - $(SGML2DVI) --output=tex $< - -%.dvi: %.sgml - $(SGML2DVI) --output=dvi $< - -%.dvi: %.tex - @set -e; pass=2; echo "Running LaTeX $<"; \ - while [ `$(LATEX) $< </dev/null 2>&1 | \ - grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \ - if [ $$pass -gt 3 ]; then \ - echo "Seems, something is wrong. Try by hands." ; exit 1 ; \ - fi; \ - echo "Re-running LaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \ - done - -%.pdf: %.tex - @set -e; pass=2; echo "Running pdfLaTeX $<"; \ - while [ `pdflatex $< </dev/null 2>&1 | \ - grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \ - if [ $$pass -gt 3 ]; then \ - echo "Seems, something is wrong. Try by hands." 
; exit 1 ; \ - fi; \ - echo "Re-running pdfLaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \ - done -#%.pdf: %.ps -# ps2pdf $< - -%.ps: %.dvi - $(DVIPS) $< -o $@ - -%.html: %.sgml - $(SGML2HTML) $< - -install: - install -m 0644 $(shell echo *.tex) $(DESTDIR)$(DOCDIR) - install -m 0644 $(shell echo *.sgml) $(DESTDIR)$(DOCDIR) - -clean: - rm -f *.aux *.log *.toc $(PSFILES) $(DVIFILES) *.html *.pdf diff --git a/doc/Plan b/doc/Plan deleted file mode 100644 index 55f478ea..00000000 --- a/doc/Plan +++ /dev/null @@ -1,16 +0,0 @@ -Partially finished work. - -1. User Reference manuals. -1.1 IP Command reference (ip-cref.tex, published) -1.2 TC Command reference (tc-cref.tex) -1.3 IP tunnels (ip-tunnels.tex, published) - -2. Linux-2.2 Networking API -2.1 RTNETLINK (api-rtnl.tex) -2.2 Path MTU Discovery (api-pmtudisc.tex) -2.3 IPv6 Flow Labels (api-ip6-flowlabels.tex, published) -2.4 Miscellaneous extensions (api-misc.tex) - -3. Linux-2.2 Networking Intra-Kernel Interfaces -3.1 NetDev --- Networking Devices and netdev... (iki-netdev.tex) -3.2 Neighbour cache and destination cache. (iki-neighdst.tex) diff --git a/doc/SNAPSHOT.tex b/doc/SNAPSHOT.tex deleted file mode 100644 index 7ed02984..00000000 --- a/doc/SNAPSHOT.tex +++ /dev/null @@ -1 +0,0 @@ -\def\Draft{020116} diff --git a/doc/api-ip6-flowlabels.tex b/doc/api-ip6-flowlabels.tex deleted file mode 100644 index aa34e947..00000000 --- a/doc/api-ip6-flowlabels.tex +++ /dev/null @@ -1,429 +0,0 @@ -\documentstyle[12pt,twoside]{article} -\def\TITLE{IPv6 Flow Labels} -\input preamble -\begin{center} -\Large\bf IPv6 Flow Labels in Linux-2.2. -\end{center} - - -\begin{center} -{ \large Alexey~N.~Kuznetsov } \\ -\em Institute for Nuclear Research, Moscow \\ -\verb|kuznet@ms2.inr.ac.ru| \\ -\rm April 11, 1999 -\end{center} - -\vspace{5mm} - -\tableofcontents - -\section{Introduction.} - -Every IPv6 packet carries 28 bits of flow information. 
RFC2460 splits -these bits to two fields: 8 bits of traffic class (or DS field, if you -prefer this term) and 20 bits of flow label. Currently there exist -no well-defined API to manage IPv6 flow information. In this document -I describe an attempt to design the API for Linux-2.2 IPv6 stack. - -\vskip 1mm - -The API must solve the following tasks: - -\begin{enumerate} - -\item To allow user to set traffic class bits. - -\item To allow user to read traffic class bits of received packets. -This feature is not so useful as the first one, however it will be -necessary f.e.\ to implement ECN [RFC2481] for datagram oriented services -or to implement receiver side of SRP or another end-to-end protocol -using traffic class bits. - -\item To assign flow labels to packets sent by user. - -\item To get flow labels of received packets. I do not know -any applications of this feature, but it is possible that receiver will -want to use flow labels to distinguish sub-flows. - -\item To allocate flow labels in the way, compliant to RFC2460. Namely: - -\begin{itemize} -\item -Flow labels must be uniformly distributed (pseudo-)random numbers, -so that any subset of 20 bits can be used as hash key. - -\item -Flows with coinciding source address and flow label must have identical -destination address and not-fragmentable extensions headers (i.e.\ -hop by hop options and all the headers up to and including routing header, -if it is present.) - -\begin{NB} -There is a hole in specs: some hop-by-hop options can be -defined only on per-packet base (f.e.\ jumbo payload option). -Essentially, it means that such options cannot present in packets -with flow labels. -\end{NB} -\begin{NB} -NB notes here and below reflect only my personal opinion, -they should be read with smile or should not be read at all :-). 
-\end{NB} - - -\item -Flow labels have finite lifetime and source is not allowed to reuse -flow label for another flow within the maximal lifetime has expired, -so that intermediate nodes will be able to invalidate flow state before -the label is taken over by another flow. -Flow state, including lifetime, is propagated along datagram path -by some application specific methods -(f.e.\ in RSVP PATH messages or in some hop-by-hop option). - - -\end{itemize} - -\end{enumerate} - -\section{Sending/receiving flow information.} - -\paragraph{Discussion.} -\addcontentsline{toc}{subsection}{Discussion} -It was proposed (Where? I do not remember any explicit statement) -to solve the first four tasks using -\verb|sin6_flowinfo| field added to \verb|struct| \verb|sockaddr_in6| -(see RFC2553). - -\begin{NB} - This method is difficult to consider as reasonable, because it - puts additional overhead to all the services, despite of only - very small subset of them (none, to be more exact) really use it. - It contradicts both to IETF spirit and the letter. Before RFC2553 - one justification existed, IPv6 address alignment left 4 byte - hole in \verb|sockaddr_in6| in any case. Now it has no justification. -\end{NB} - -We have two problems with this method. The first one is common for all OSes: -if \verb|recvmsg()| initializes \verb|sin6_flowinfo| to flow info -of received packet, we loose one very important property of BSD socket API, -namely, we are not allowed to use received address for reply directly -and have to mangle it, even if we are not interested in flowinfo subtleties. - -\begin{NB} - RFC2553 adds new requirement: to clear \verb|sin6_flowinfo|. - Certainly, it is not solution but rather attempt to force applications - to make unnecessary work. Well, as usually, one mistake in design - is followed by attempts to patch the hole and more mistakes... -\end{NB} - -Another problem is Linux specific. 
Historically Linux IPv6 did not -initialize \verb|sin6_flowinfo| at all, so that, if kernel does not -support flow labels, this field is not zero, but a random number. -Some applications also did not take care about it. - -\begin{NB} -Following RFC2553 such applications can be considered as broken, -but I still think that they are right: clearing all the address -before filling known fields is robust but stupid solution. -Useless wasting CPU cycles and -memory bandwidth is not a good idea. Such patches are acceptable -as temporary hacks, but not as standard of the future. -\end{NB} - - -\paragraph{Implementation.} -\addcontentsline{toc}{subsection}{Implementation} -By default Linux IPv6 does not read \verb|sin6_flowinfo| field -assuming that common applications are not obliged to initialize it -and are permitted to consider it as pure alignment padding. -In order to tell kernel that application -is aware of this field, it is necessary to set socket option -\verb|IPV6_FLOWINFO_SEND|. - -\begin{verbatim} - int on = 1; - setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO_SEND, - (void*)&on, sizeof(on)); -\end{verbatim} - -Linux kernel never fills \verb|sin6_flowinfo| field, when passing -message to user space, though the kernels which support flow labels -initialize it to zero. If user wants to get received flowinfo, he -will set option \verb|IPV6_FLOWINFO| and after this he will receive -flowinfo as ancillary data object of type \verb|IPV6_FLOWINFO| -(cf.\ RFC2292). - -\begin{verbatim} - int on = 1; - setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO, (void*)&on, sizeof(on)); -\end{verbatim} - -Flowinfo received and latched by a connected TCP socket also may be fetched -with \verb|getsockopt()| \verb|IPV6_PKTOPTIONS| together with -another optional information. - -Besides that, in the spirit of RFC2292 the option \verb|IPV6_FLOWINFO| -may be used as alternative way to send flowinfo with \verb|sendmsg()| or -to latch it with \verb|IPV6_PKTOPTIONS|. 
- -\paragraph{Note about IPv6 options and destination address.} -\addcontentsline{toc}{subsection}{IPv6 options and destination address} -If \verb|sin6_flowinfo| does contain not zero flow label, -destination address in \verb|sin6_addr| and non-fragmentable -extension headers are ignored. Instead, kernel uses the values -cached at flow setup (see below). However, for connected sockets -kernel prefers the values set at connection time. - -\paragraph{Example.} -\addcontentsline{toc}{subsection}{Example} -After setting socket option \verb|IPV6_FLOWINFO| -flowlabel and DS field are received as ancillary data object -of type \verb|IPV6_FLOWINFO| and level \verb|SOL_IPV6|. -In the cases when it is convenient to use \verb|recvfrom(2)|, -it is possible to replace library variant with your own one, -sort of: - -\begin{verbatim} -#include <sys/socket.h> -#include <netinet/in6.h> - -size_t recvfrom(int fd, char *buf, size_t len, int flags, - struct sockaddr *addr, int *addrlen) -{ - size_t cc; - char cbuf[128]; - struct cmsghdr *c; - struct iovec iov = { buf, len }; - struct msghdr msg = { addr, *addrlen, - &iov, 1, - cbuf, sizeof(cbuf), - 0 }; - - cc = recvmsg(fd, &msg, flags); - if (cc < 0) - return cc; - ((struct sockaddr_in6*)addr)->sin6_flowinfo = 0; - *addrlen = msg.msg_namelen; - for (c=CMSG_FIRSTHDR(&msg); c; c = CMSG_NEXTHDR(&msg, c)) { - if (c->cmsg_level != SOL_IPV6 || - c->cmsg_type != IPV6_FLOWINFO) - continue; - ((struct sockaddr_in6*)addr)->sin6_flowinfo = *(__u32*)CMSG_DATA(c); - } - return cc; -} -\end{verbatim} - - - -\section{Flow label management.} - -\paragraph{Discussion.} -\addcontentsline{toc}{subsection}{Discussion} -Requirements of RFC2460 are pretty tough. Particularly, lifetimes -longer than boot time require to store allocated labels at stable -storage, so that the full implementation necessarily includes user space flow -label manager. There are at least three different approaches: - -\begin{enumerate} -\item {\bf ``Cooperative''. 
} We could leave flow label allocation wholly -to user space. When user needs label he requests manager directly. The approach -is valid, but as any ``cooperative'' approach it suffers of security problems. - -\begin{NB} -One idea is to disallow not privileged user to allocate flow -labels, but instead to pass the socket to manager via \verb|SCM_RIGHTS| -control message, so that it will allocate label and assign it to socket -itself. Hmm... the idea is interesting. -\end{NB} - -\item {\bf ``Indirect''.} Kernel redirects requests to user level daemon -and does not install label until the daemon acknowledged the request. -The approach is the most promising, it is especially pleasant to recognize -parallel with IPsec API [RFC2367,Craig]. Actually, it may share API with -IPsec. - -\item {\bf ``Stupid''.} To allocate labels in kernel space. It is the simplest -method, but it suffers of two serious flaws: the first, -we cannot lease labels with lifetimes longer than boot time, the second, -it is sensitive to DoS attacks. Kernel have to remember all the obsolete -labels until their expiration and malicious user may fastly eat all the -flow label space. - -\end{enumerate} - -Certainly, I choose the most ``stupid'' method. It is the cheapest one -for implementor (i.e.\ me), and taking into account that flow labels -still have no serious applications it is not useful to work on more -advanced API, especially, taking into account that eventually we -will get it for no fee together with IPsec. - - -\paragraph{Implementation.} -\addcontentsline{toc}{subsection}{Implementation} -Socket option \verb|IPV6_FLOWLABEL_MGR| allows to -request flow label manager to allocate new flow label, to reuse -already allocated one or to delete old flow label. 
-Its argument is \verb|struct| \verb|in6_flowlabel_req|: - -\begin{verbatim} -struct in6_flowlabel_req -{ - struct in6_addr flr_dst; - __u32 flr_label; - __u8 flr_action; - __u8 flr_share; - __u16 flr_flags; - __u16 flr_expires; - __u16 flr_linger; - __u32 __flr_reserved; - /* Options in format of IPV6_PKTOPTIONS */ -}; -\end{verbatim} - -\begin{itemize} - -\item \verb|dst| is IPv6 destination address associated with the label. - -\item \verb|label| is flow label value in network byte order. If it is zero, -kernel will allocate new pseudo-random number. Otherwise, kernel will try -to lease flow label ordered by user. In this case, it is user task to provide -necessary flow label randomness. - -\item \verb|action| is requested operation. Currently, only three operations -are defined: - -\begin{verbatim} -#define IPV6_FL_A_GET 0 /* Get flow label */ -#define IPV6_FL_A_PUT 1 /* Release flow label */ -#define IPV6_FL_A_RENEW 2 /* Update expire time */ -\end{verbatim} - -\item \verb|flags| are optional modifiers. Currently -only \verb|IPV6_FL_A_GET| has modifiers: - -\begin{verbatim} -#define IPV6_FL_F_CREATE 1 /* Allowed to create new label */ -#define IPV6_FL_F_EXCL 2 /* Do not create new label */ -\end{verbatim} - - -\item \verb|share| defines who is allowed to reuse the same flow label. - -\begin{verbatim} -#define IPV6_FL_S_NONE 0 /* Not defined */ -#define IPV6_FL_S_EXCL 1 /* Label is private */ -#define IPV6_FL_S_PROCESS 2 /* May be reused by this process */ -#define IPV6_FL_S_USER 3 /* May be reused by this user */ -#define IPV6_FL_S_ANY 255 /* Anyone may reuse it */ -\end{verbatim} - -\item \verb|linger| is time in seconds. After the last user releases flow -label, it will not be reused with different destination and options at least -during this time. If \verb|share| is not \verb|IPV6_FL_S_EXCL| the label -still can be shared by another sockets. Current implementation does not allow -unprivileged user to set linger longer than 60 sec. 
- -\item \verb|expires| is time in seconds. Flow label will be kept at least -for this time, but it will not be destroyed before user released it explicitly -or closed all the sockets using it. Current implementation does not allow -unprivileged user to set timeout longer than 60 sec. Proviledged applications -MAY set longer lifetimes, but in this case they MUST save allocated -labels at stable storage and restore them back after reboot before the first -application allocates new flow. - -\end{itemize} - -This structure is followed by optional extension headers associated -with this flow label in format of \verb|IPV6_PKTOPTIONS|. Only -\verb|IPV6_HOPOPTS|, \verb|IPV6_RTHDR| and, if \verb|IPV6_RTHDR| presents, -\verb|IPV6_DSTOPTS| are allowed. - -\paragraph{Example.} -\addcontentsline{toc}{subsection}{Example} - The function \verb|get_flow_label| allocates -private flow label. - -\begin{verbatim} -int get_flow_label(int fd, struct sockaddr_in6 *dst, __u32 fl) -{ - int on = 1; - struct in6_flowlabel_req freq; - - memset(&freq, 0, sizeof(freq)); - freq.flr_label = htonl(fl); - freq.flr_action = IPV6_FL_A_GET; - freq.flr_flags = IPV6_FL_F_CREATE | IPV6_FL_F_EXCL; - freq.flr_share = IPV6_FL_S_EXCL; - memcpy(&freq.flr_dst, &dst->sin6_addr, 16); - if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, - &freq, sizeof(freq)) == -1) { - perror ("can't lease flowlabel"); - return -1; - } - dst->sin6_flowinfo |= freq.flr_label; - - if (setsockopt(fd, SOL_IPV6, IPV6_FLOWINFO_SEND, - &on, sizeof(on)) == -1) { - perror ("can't send flowinfo"); - - freq.flr_action = IPV6_FL_A_PUT; - setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, - &freq, sizeof(freq)); - return -1; - } - return 0; -} -\end{verbatim} - -A bit more complicated example using routing header can be found -in \verb|ping6| utility (\verb|iputils| package). Linux rsvpd backend -contains an example of using operation \verb|IPV6_FL_A_RENEW|. 
- -\paragraph{Listing flow labels.} -\addcontentsline{toc}{subsection}{Listing flow labels} -List of currently allocated -flow labels may be read from \verb|/proc/net/ip6_flowlabel|. - -\begin{verbatim} -Label S Owner Users Linger Expires Dst Opt -A1BE5 1 0 0 6 3 3ffe2400000000010a0020fffe71fb30 0 -\end{verbatim} - -\begin{itemize} -\item \verb|Label| is hexadecimal flow label value. -\item \verb|S| is sharing style. -\item \verb|Owner| is ID of creator, it is zero, pid or uid, depending on - sharing style. -\item \verb|Users| is number of applications using the label now. -\item \verb|Linger| is \verb|linger| of this label in seconds. -\item \verb|Expires| is time until expiration of the label in seconds. It may - be negative, if the label is in use. -\item \verb|Dst| is IPv6 destination address. -\item \verb|Opt| is length of options, associated with the label. Option - data are not accessible. -\end{itemize} - - -\paragraph{Flow labels and RSVP.} -\addcontentsline{toc}{subsection}{Flow labels and RSVP} -RSVP daemon supports IPv6 flow labels -without any modifications to standard ISI RAPI. Sender must allocate -flow label, fill corresponding sender template and submit it to local rsvp -daemon. rsvpd will check the label and start to announce it in PATH -messages. Rsvpd on sender node will renew the flow label, so that it will not -be reused before path state expires and all the intermediate -routers and receiver purge flow state. - -\verb|rtap| utility is modified to parse flow labels. 
F.e.\ if user allocated -flow label \verb|0xA1234|, he may write: - -\begin{verbatim} -RTAP> sender 3ffe:2400::1/FL0xA1234 <Tspec> -\end{verbatim} - -Receiver makes reservation with command: -\begin{verbatim} -RTAP> reserve ff 3ffe:2400::1/FL0xA1234 <Flowspec> -\end{verbatim} - -\end{document} diff --git a/doc/arpd.sgml b/doc/arpd.sgml deleted file mode 100644 index 0ab79c60..00000000 --- a/doc/arpd.sgml +++ /dev/null @@ -1,130 +0,0 @@ -<!doctype linuxdoc system> - -<article> - -<title>ARPD Daemon -<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/ -<date>some_negative_number, 20 Sep 2001 -<abstract> -<tt/arpd/ is daemon collecting gratuitous ARP information, saving -it on local disk and feeding it to kernel on demand to avoid -redundant broadcasting due to limited size of kernel ARP cache. -</abstract> - - -<p><bf/Description/ - -<p>The format of the command is: - -<tscreen><verb> - arpd OPTIONS [ INTERFACE [ INTERFACE ... ] ] -</verb></tscreen> - -<p> <tt/OPTIONS/ are: - -<itemize> - -<item><tt/-l/ - dump <tt/arpd/ database to stdout and exit. Output consists -of three columns: interface index, IP address and MAC address. -Negative entries for dead hosts are also shown, in this case MAC address -is replaced by word <tt/FAILED/ followed by colon and time when the fact -that host is dead was proven the last time. - -<item><tt/-f FILE/ - read and load <tt/arpd/ database from <tt/FILE/ -in text format similar to that dumped by option <tt/-l/. Exit after load, -probably listing resulting database, if option <tt/-l/ is also given. -If <tt/FILE/ is <tt/-/, <tt/stdin/ is read to get ARP table. - -<item><tt/-b DATABASE/ - location of database file. Default location is -<tt>/var/lib/arpd/arpd.db</tt>. - -<item><tt/-a NUMBER/ - <tt/arpd/ not only passively listens ARP on wire, but -also sends broadcast queries itself. <tt/NUMBER/ is number of such queries -to make before destination is considered as dead. When <tt/arpd/ is started -as kernel helper (i.e.
with <tt/app_solicit/ enabled in <tt/sysctl/ -or even with option <tt/-k/) without this option and still did not learn enough -information, you can observe 1 second gaps in service. Not fatal, but -not good. - -<item><tt/-k/ - suppress sending broadcast queries by kernel. It makes -sense together with option <tt/-a/. - -<item><tt/-n TIME/ - timeout of negative cache. When resolution fails <tt/arpd/ -suppresses further attempts to resolve for this period. It makes sense -only together with option <tt/-k/. This timeout should not be too much -longer than boot time of a typical host not supporting gratuitous ARP. -Default value is 60 seconds. - -<item><tt/-R RATE/ - maximal steady rate of broadcasts sent by <tt/arpd/ -in packets per second. Default value is 1. - -<item><tt/-B NUMBER/ - number of broadcasts sent by <tt/arpd/ back to back. -Default value is 3. Together with option <tt/-R/ this option allows -to police broadcasting not to exceed <tt/B+R*T/ over any interval -of time <tt/T/. - -</itemize> - -<p><tt/INTERFACE/ is name of networking interface to watch. -If no interfaces given, <tt/arpd/ monitors all the interfaces. -In this case <tt/arpd/ does not adjust <tt/sysctl/ parameters, -it is supposed user does this himself after <tt/arpd/ is started. - - -<p> Signals - -<p> <tt/arpd/ exits gracefully syncing database and restoring adjusted -<tt/sysctl/ parameters, when receives <tt/SIGINT/ or <tt/SIGTERM/. -<tt/SIGHUP/ syncs database to disk. <tt/SIGUSR1/ sends some statistics -to <tt/syslog/. Effect of other signals is undefined, they may corrupt -database and leave <tt/sysctl/ parameters in an unpredictable state. - -<p> Note - -<p> In order for <tt/arpd/ to be able to serve as ARP resolver, kernel must be -compiled with the option <tt/CONFIG_ARPD/ and, in the case when interface list -is not given on command line, variable <tt/app_solicit/ -on interfaces of interest should be set in <tt>/proc/sys/net/ipv4/neigh/*</tt>.
-If this is not made <tt/arpd/ still collects gratuitous ARP information -in its database. - -<p> Examples - -<enum> -<item> Start <tt/arpd/ to collect gratuitous ARP, but not messing -with kernel functionality: - -<tscreen><verb> - arpd -b /var/tmp/arpd.db -</verb></tscreen> - -<item> Look at result after some time: - -<tscreen><verb> - killall arpd - arpd -l -b /var/tmp/arpd.db -</verb></tscreen> - -<item> To enable kernel helper, leaving leading role to kernel: - -<tscreen><verb> - arpd -b /var/tmp/arpd.db -a 1 eth0 eth1 -</verb></tscreen> - -<item> Completely replace kernel resolution on interfaces <tt/eth0/ -and <tt/eth1/. In this case kernel still does unicast probing to -validate entries, but all the broadcast activity is suppressed -and made under authority of <tt/arpd/: - -<tscreen><verb> - arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1 -</verb></tscreen> - -This is mode which <tt/arpd/ is supposed to work normally. -It is not default just to prevent occasional enabling of too aggressive -mode occasionally. - -</enum> - -</article> - diff --git a/doc/do-psnup b/doc/do-psnup deleted file mode 100644 index 2dce848e..00000000 --- a/doc/do-psnup +++ /dev/null @@ -1,16 +0,0 @@ -#! /bin/bash -# $1 = Temporary file . "string" -# $2 = File to process . "string" -# $3 = Page size . ie: a4 , letter ... "string" -# $4 = Number of pages to fit on a single sheet . "numeric" - -if type psnup >&/dev/null; then - echo "psnup -$4 -p$3 $1 $2" - psnup -$4 -p$3 $1 $2 -elif type psmulti >&/dev/null; then - echo "psmulti $1 > $2" - psmulti $1 > $2 -else - echo "cp $1 $2" - cp $1 $2 -fi diff --git a/doc/ip-cref.tex b/doc/ip-cref.tex deleted file mode 100644 index 179baa2f..00000000 --- a/doc/ip-cref.tex +++ /dev/null @@ -1,3453 +0,0 @@ -\documentstyle[12pt,twoside]{article} -\def\TITLE{IP Command Reference} -\input preamble -\begin{center} -\Large\bf IP Command Reference. 
-\end{center} - - -\begin{center} -{ \large Alexey~N.~Kuznetsov } \\ -\em Institute for Nuclear Research, Moscow \\ -\verb|kuznet@ms2.inr.ac.ru| \\ -\rm April 14, 1999 -\end{center} - -\vspace{5mm} - -\tableofcontents - -\newpage - -\section{About this document} - -This document presents a comprehensive description of the \verb|ip| utility -from the \verb|iproute2| package. It is not a tutorial or user's guide. -It is a {\em dictionary\/}, not explaining terms, -but translating them into other terms, which may also be unknown to the reader. -However, the document is self-contained and the reader, provided they have a -basic networking background, will find enough information -and examples to understand and configure Linux-2.2 IP and IPv6 -networking. - -This document is split into sections explaining \verb|ip| commands -and options, decrypting \verb|ip| output and containing a few examples. -More voluminous examples and some topics, which require more elaborate -discussion, are in the appendix. - -The paragraphs beginning with NB contain side notes, warnings about -bugs and design drawbacks. They may be skipped at the first reading. - -\section{{\tt ip} --- command syntax} - -The generic form of an \verb|ip| command is: -\begin{verbatim} -ip [ OPTIONS ] OBJECT [ COMMAND [ ARGUMENTS ]] -\end{verbatim} -where \verb|OPTIONS| is a set of optional modifiers affecting the -general behaviour of the \verb|ip| utility or changing its output. All options -begin with the character \verb|'-'| and may be used in either long or abbreviated -forms. Currently, the following options are available: - -\begin{itemize} -\item \verb|-V|, \verb|-Version| - ---- print the version of the \verb|ip| utility and exit. - - -\item \verb|-s|, \verb|-stats|, \verb|-statistics| - ---- output more information. If the option -appears twice or more, the amount of information increases. -As a rule, the information is statistics or some time values. 
- -\item \verb|-d|, \verb|-details| - ---- output more detailed information. - -\item \verb|-f|, \verb|-family| followed by a protocol family -identifier: \verb|inet|, \verb|inet6| or \verb|link|. - ---- enforce the protocol family to use. If the option is not present, -the protocol family is guessed from other arguments. If the rest of the command -line does not give enough information to guess the family, \verb|ip| falls back to the default -one, usually \verb|inet| or \verb|any|. \verb|link| is a special family -identifier meaning that no networking protocol is involved. - -\item \verb|-4| - ---- shortcut for \verb|-family inet|. - -\item \verb|-6| - ---- shortcut for \verb|-family inet6|. - -\item \verb|-0| - ---- shortcut for \verb|-family link|. - - -\item \verb|-o|, \verb|-oneline| - ---- output each record on a single line, replacing line feeds -with the \verb|'\'| character. This is convenient when you want to -count records with \verb|wc| or to \verb|grep| the output. The trivial -script \verb|rtpr| converts the output back into readable form. - -\item \verb|-r|, \verb|-resolve| - ---- use the system's name resolver to print DNS names instead of -host addresses. - -\begin{NB} - Do not use this option when reporting bugs or asking for advice. -\end{NB} -\begin{NB} - \verb|ip| never uses DNS to resolve names to addresses. -\end{NB} - -\item \verb|-b|, \verb|-batch FILE| - ---- read commands from provided file or standard input and invoke them. -First failure will cause termination of \verb|ip|. -In batch \verb|FILE| everything which begins with \verb|#| symbol is -ignored and can be used for comments.
-\paragraph{Example:} -\begin{verbatim} -kuznet@kaiser $ cat /tmp/ip_batch.ip -# This is a comment -tuntap add mode tap tap1 # This is an another comment -link set up dev tap1 -addr add 10.0.0.1/24 dev tap1 -kuznet@kaiser $ sudo ip -b /tmp/ip_batch.ip -\end{verbatim} -or from standard input: -\begin{verbatim} -kuznet@kaiser $ cat /tmp/ip_batch.ip | sudo ip -b - -\end{verbatim} - -\item \verb|-force| - ---- don't terminate ip on errors in batch mode. -If there were any errors during execution of the commands, -the application return code will be non zero. - -\item \verb|-l|, \verb|-loops COUNT| - ---- specify maximum number of loops the 'ip addr flush' logic will attempt -before giving up. The default is 10. Zero (0) means loop until all -addresses are removed. - -\end{itemize} - -\verb|OBJECT| is the object to manage or to get information about. -The object types currently understood by \verb|ip| are: - -\begin{itemize} -\item \verb|link| --- network device -\item \verb|address| --- protocol (IP or IPv6) address on a device -\item \verb|neighbour| --- ARP or NDISC cache entry -\item \verb|route| --- routing table entry -\item \verb|rule| --- rule in routing policy database -\item \verb|maddress| --- multicast address -\item \verb|mroute| --- multicast routing cache entry -\item \verb|tunnel| --- tunnel over IP -\end{itemize} - -Again, the names of all objects may be written in full or -abbreviated form, f.e.\ \verb|address| is abbreviated as \verb|addr| -or just \verb|a|. - -\verb|COMMAND| specifies the action to perform on the object. -The set of possible actions depends on the object type. -As a rule, it is possible to \verb|add|, \verb|delete| and -\verb|show| (or \verb|list|) objects, but some objects -do not allow all of these operations or have some additional commands. -The \verb|help| command is available for all objects. It prints -out a list of available commands and argument syntax conventions. - -If no command is given, some default command is assumed.
-Usually it is \verb|list| or, if the objects of this class -cannot be listed, \verb|help|. - -\verb|ARGUMENTS| is a list of arguments to the command. -The arguments depend on the command and object. There are two types of arguments: -{\em flags\/}, consisting of a single keyword, and {\em parameters\/}, -consisting of a keyword followed by a value. For convenience, -each command has some {\em default parameter\/} -which may be omitted. F.e.\ parameter \verb|dev| is the default -for the {\tt ip link} command, so {\tt ip link ls eth0} is equivalent -to {\tt ip link ls dev eth0}. -In the command descriptions below such parameters -are distinguished with the marker: ``(default)''. - -Almost all keywords may be abbreviated with several first (or even single) -letters. The shortcuts are convenient when \verb|ip| is used interactively, -but they are not recommended in scripts or when reporting bugs -or asking for advice. ``Officially'' allowed abbreviations are listed -in the document body. - - - -\section{{\tt ip} --- error messages} - -\verb|ip| may fail for one of the following reasons: - -\begin{itemize} -\item -A syntax error on the command line: an unknown keyword, incorrectly formatted -IP address {\em et al\/}. In this case \verb|ip| prints an error message -and exits. As a rule, the error message will contain information -about the reason for the failure. Sometimes it also prints a help page. - -\item -The arguments did not pass verification for self-consistency. - -\item -\verb|ip| failed to compile a kernel request from the arguments -because the user didn't give enough information. - -\item -The kernel returned an error to some syscall. In this case \verb|ip| -prints the error message, as it is output with \verb|perror(3)|, -prefixed with a comment and a syscall identifier. - -\item -The kernel returned an error to some RTNETLINK request. 
-In this case \verb|ip| prints the error message, as it is output -with \verb|perror(3)| prefixed with ``RTNETLINK answers:''. - -\end{itemize} - -All the operations are atomic, i.e.\ -if the \verb|ip| utility fails, it does not change anything -in the system. One harmful exception is \verb|ip link| command -(Sec.\ref{IP-LINK}, p.\pageref{IP-LINK}), -which may change only some of the device parameters given -on command line. - -It is difficult to list all the error messages (especially -syntax errors). However, as a rule, their meaning is clear -from the context of the command. - -The most common mistakes are: - -\begin{enumerate} -\item Netlink is not configured in the kernel. The message is: -\begin{verbatim} -Cannot open netlink socket: Invalid value -\end{verbatim} - -\item RTNETLINK is not configured in the kernel. In this case -one of the following messages may be printed, depending on the command: -\begin{verbatim} -Cannot talk to rtnetlink: Connection refused -Cannot send dump request: Connection refused -\end{verbatim} - -\item The \verb|CONFIG_IP_MULTIPLE_TABLES| option was not selected -when configuring the kernel. In this case any attempt to use the -\verb|ip| \verb|rule| command will fail, f.e. -\begin{verbatim} -kuznet@kaiser $ ip rule list -RTNETLINK error: Invalid argument -dump terminated -\end{verbatim} - -\end{enumerate} - - -\section{{\tt ip link} --- network device configuration} -\label{IP-LINK} - -\paragraph{Object:} A \verb|link| is a network device and the corresponding -commands display and change the state of devices. - -\paragraph{Commands:} \verb|set| and \verb|show| (or \verb|list|). - -\subsection{{\tt ip link set} --- change device attributes} - -\paragraph{Abbreviations:} \verb|set|, \verb|s|. - -\paragraph{Arguments:} - -\begin{itemize} -\item \verb|dev NAME| (default) - ---- \verb|NAME| specifies the network device on which to operate. 
- -\item \verb|up| and \verb|down| - ---- change the state of the device to \verb|UP| or \verb|DOWN|. - -\item \verb|arp on| or \verb|arp off| - ---- change the \verb|NOARP| flag on the device. - -\begin{NB} -This operation is {\em not allowed\/} if the device is in state \verb|UP|. -Though neither the \verb|ip| utility nor the kernel check for this condition. -You can get unpredictable results changing this flag while the -device is running. -\end{NB} - -\item \verb|multicast on| or \verb|multicast off| - ---- change the \verb|MULTICAST| flag on the device. - -\item \verb|dynamic on| or \verb|dynamic off| - ---- change the \verb|DYNAMIC| flag on the device. - -\item \verb|name NAME| - ---- change the name of the device. This operation is not -recommended if the device is running or has some addresses -already configured. - -\item \verb|txqueuelen NUMBER| or \verb|txqlen NUMBER| - ---- change the transmit queue length of the device. - -\item \verb|mtu NUMBER| - ---- change the MTU of the device. - -\item \verb|address LLADDRESS| - ---- change the station address of the interface. - -\item \verb|broadcast LLADDRESS|, \verb|brd LLADDRESS| or \verb|peer LLADDRESS| - ---- change the link layer broadcast address or the peer address when -the interface is \verb|POINTOPOINT|. - -\vskip 1mm -\begin{NB} -For most devices (f.e.\ for Ethernet) changing the link layer -broadcast address will break networking. -Do not use it, if you do not understand what this operation really does. -\end{NB} - -\item \verb|netns PID| - ---- move the device to the network namespace associated with the process PID. - -\end{itemize} - -\vskip 1mm -\begin{NB} -The \verb|PROMISC| and \verb|ALLMULTI| flags are considered -obsolete and should not be changed administratively, though -the {\tt ip} utility will allow that. -\end{NB} - -\paragraph{Warning:} If multiple parameter changes are requested, -\verb|ip| aborts immediately after any of the changes have failed. 
-This is the only case when \verb|ip| can move the system to -an unpredictable state. The solution is to avoid changing -several parameters with one {\tt ip link set} call. - -\paragraph{Examples:} -\begin{itemize} -\item \verb|ip link set dummy address 00:00:00:00:00:01| - ---- change the station address of the interface \verb|dummy|. - -\item \verb|ip link set dummy up| - ---- start the interface \verb|dummy|. - -\end{itemize} - - -\subsection{{\tt ip link show} --- display device attributes} -\label{IP-LINK-SHOW} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|, -\verb|l|. - -\paragraph{Arguments:} -\begin{itemize} -\item \verb|dev NAME| (default) - ---- \verb|NAME| specifies the network device to show. -If this argument is omitted all devices are listed. - -\item \verb|up| - ---- only display running interfaces. - -\end{itemize} - - -\paragraph{Output format:} - -\begin{verbatim} -kuznet@alisa:~ $ ip link ls eth0 -3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 - link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff -kuznet@alisa:~ $ ip link ls sit0 -5: sit0@NONE: <NOARP,UP> mtu 1480 qdisc noqueue - link/sit 0.0.0.0 brd 0.0.0.0 -kuznet@alisa:~ $ ip link ls dummy -2: dummy: <BROADCAST,NOARP> mtu 1500 qdisc noop - link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff -kuznet@alisa:~ $ -\end{verbatim} - - -The number before each colon is an {\em interface index\/} or {\em ifindex\/}. -This number uniquely identifies the interface. This is followed by the {\em interface name\/} -(\verb|eth0|, \verb|sit0| etc.). The interface name is also -unique at every given moment. However, the interface may disappear from the -list (f.e.\ when the corresponding driver module is unloaded) and another -one with the same name may be created later. Besides that, -the administrator may change the name of any device with -\verb|ip| \verb|link| \verb|set| \verb|name| -to make it more intelligible. 
- -The interface name may have another name or \verb|NONE| appended -after the \verb|@| sign. This means that this device is bound to some other -device, -i.e.\ packets sent through it are encapsulated and sent via the ``master'' -device. If the name is \verb|NONE|, the master is unknown. - -Then we see the interface {\em mtu\/} (``maximum transmission unit''). This determines -the maximal size of data which can be sent as a single packet over this interface. - -{\em qdisc\/} (``queuing discipline'') shows the queuing algorithm used -on the interface. Particularly, \verb|noqueue| means that this interface -does not queue anything and \verb|noop| means that the interface is in blackhole -mode i.e.\ all packets sent to it are immediately discarded. -{\em qlen\/} is the default transmit queue length of the device measured -in packets. - -The interface flags are summarized in the angle brackets. - -\begin{itemize} -\item \verb|UP| --- the device is turned on. It is ready to accept -packets for transmission and it may inject into the kernel packets received -from other nodes on the network. - -\item \verb|LOOPBACK| --- the interface does not communicate with other -hosts. All packets sent through it will be returned -and nothing but bounced packets can be received. - -\item \verb|BROADCAST| --- the device has the facility to send packets -to all hosts sharing the same link. A typical example is an Ethernet link. - -\item \verb|POINTOPOINT| --- the link has only two ends with one node -attached to each end. All packets sent to this link will reach the peer -and all packets received by us came from this single peer. - -If neither \verb|LOOPBACK| nor \verb|BROADCAST| nor \verb|POINTOPOINT| -are set, the interface is assumed to be NBMA (Non-Broadcast Multi-Access). -This is the most generic type of device and the most complicated one, because -the host attached to a NBMA link has no means to send to anyone -without additionally configured information.
- -\item \verb|MULTICAST| --- is an advisory flag indicating that the interface -is aware of multicasting i.e.\ sending packets to some subset of neighbouring -nodes. Broadcasting is a particular case of multicasting, where the multicast -group consists of all nodes on the link. It is important to emphasize -that software {\em must not\/} interpret the absence of this flag as the inability -to use multicasting on this interface. Any \verb|POINTOPOINT| and -\verb|BROADCAST| link is multicasting by definition, because we have -direct access to all the neighbours and, hence, to any part of them. -Certainly, the use of high bandwidth multicast transfers is not recommended -on broadcast-only links because of high expense, but it is not strictly -prohibited. - -\item \verb|PROMISC| --- the device listens to and feeds to the kernel all -traffic on the link even if it is not destined for us, not broadcast -and not destined for a multicast group of which we are a member. Usually -this mode exists only on broadcast links and is used by bridges and for network -monitoring. - -\item \verb|ALLMULTI| --- the device receives all multicast packets -wandering on the link. This mode is used by multicast routers. - -\item \verb|NOARP| --- this flag is different from the other ones. It has -no invariant value and its interpretation depends on the network protocols -involved. As a rule, it indicates that the device needs no address -resolution and that the software or hardware knows how to deliver packets -without any help from the protocol stacks. - -\item \verb|DYNAMIC| --- is an advisory flag indicating that the interface is -dynamically created and destroyed. - -\item \verb|SLAVE| --- this interface is bonded to some other interfaces -to share link capacities.
- -\end{itemize} - -\vskip 1mm -\begin{NB} -There are other flags but they are either obsolete (\verb|NOTRAILERS|) -or not implemented (\verb|DEBUG|) or specific to some devices -(\verb|MASTER|, \verb|AUTOMEDIA| and \verb|PORTSEL|). We do not discuss -them here. -\end{NB} - - -The second line contains information on the link layer addresses -associated with the device. The first word (\verb|ether|, \verb|sit|) -defines the interface hardware type. This type determines the format and semantics -of the addresses and is logically part of the address. -The default format of the station address and the broadcast address -(or the peer address for pointopoint links) is a -sequence of hexadecimal bytes separated by colons, but some link -types may have their natural address format, f.e.\ addresses -of tunnels over IP are printed as dotted-quad IP addresses. - -\vskip 1mm -\begin{NB} - NBMA links have no well-defined broadcast or peer address, - however this field may contain useful information, f.e.\ - about the address of broadcast relay or about the address of the ARP server. -\end{NB} -\begin{NB} -Multicast addresses are not shown by this command, see -\verb|ip maddr ls| in~Sec.\ref{IP-MADDR} (p.\pageref{IP-MADDR} of this -document). -\end{NB} - - -\paragraph{Statistics:} With the \verb|-statistics| option, \verb|ip| also -prints interface statistics: - -\begin{verbatim} -kuznet@alisa:~ $ ip -s link ls eth0 -3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 - link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff - RX: bytes packets errors dropped overrun mcast - 2449949362 2786187 0 0 0 0 - TX: bytes packets errors dropped carrier collsns - 178558497 1783945 332 0 332 35172 -kuznet@alisa:~ $ -\end{verbatim} -\verb|RX:| and \verb|TX:| lines summarize receiver and transmitter -statistics. They contain: -\begin{itemize} -\item \verb|bytes| --- the total number of bytes received or transmitted -on the interface. 
This number wraps when the maximal length of the data type -natural for the architecture is exceeded, so continuous monitoring requires -a user level daemon snapping it periodically. -\item \verb|packets| --- the total number of packets received or transmitted -on the interface. -\item \verb|errors| --- the total number of receiver or transmitter errors. -\item \verb|dropped| --- the total number of packets dropped due to lack -of resources. -\item \verb|overrun| --- the total number of receiver overruns resulting -in dropped packets. As a rule, if the interface is overrun, it means -serious problems in the kernel or that your machine is too slow -for this interface. -\item \verb|mcast| --- the total number of received multicast packets. This option -is only supported by a few devices. -\item \verb|carrier| --- total number of link media failures f.e.\ because -of lost carrier. -\item \verb|collsns| --- the total number of collision events -on Ethernet-like media. This number may have a different sense on other -link types. -\item \verb|compressed| --- the total number of compressed packets. This is -available only for links using VJ header compression. -\end{itemize} - - -If the \verb|-s| option is entered twice or more, -\verb|ip| prints more detailed statistics on receiver -and transmitter errors. - -\begin{verbatim} -kuznet@alisa:~ $ ip -s -s link ls eth0 -3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 - link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff - RX: bytes packets errors dropped overrun mcast - 2449949362 2786187 0 0 0 0 - RX errors: length crc frame fifo missed - 0 0 0 0 0 - TX: bytes packets errors dropped carrier collsns - 178558497 1783945 332 0 332 35172 - TX errors: aborted fifo window heartbeat - 0 0 0 332 -kuznet@alisa:~ $ -\end{verbatim} -These error names are pure Ethernetisms. Other devices -may have non zero values in these fields but they may be -interpreted differently. 
- - -\section{{\tt ip address} --- protocol address management} - -\paragraph{Abbreviations:} \verb|address|, \verb|addr|, \verb|a|. - -\paragraph{Object:} The \verb|address| is a protocol (IP or IPv6) address attached -to a network device. Each device must have at least one address -to use the corresponding protocol. It is possible to have several -different addresses attached to one device. These addresses are not -discriminated, so that the term {\em alias\/} is not quite appropriate -for them and we do not use it in this document. - -The \verb|ip addr| command displays addresses and their properties, -adds new addresses and deletes old ones. - -\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|flush| and \verb|show| -(or \verb|list|). - - -\subsection{{\tt ip address add} --- add a new protocol address} -\label{IP-ADDR-ADD} - -\paragraph{Abbreviations:} \verb|add|, \verb|a|. - -\paragraph{Arguments:} - -\begin{itemize} -\item \verb|dev NAME| - -\noindent--- the name of the device to add the address to. - -\item \verb|local ADDRESS| (default) - ---- the address of the interface. The format of the address depends -on the protocol. It is a dotted quad for IP and a sequence of hexadecimal halfwords -separated by colons for IPv6. The \verb|ADDRESS| may be followed by -a slash and a decimal number which encodes the network prefix length. - - -\item \verb|peer ADDRESS| - ---- the address of the remote endpoint for pointopoint interfaces. -Again, the \verb|ADDRESS| may be followed by a slash and a decimal number, -encoding the network prefix length. If a peer address is specified, -the local address {\em cannot\/} have a prefix length. The network prefix is associated -with the peer rather than with the local address. - - -\item \verb|broadcast ADDRESS| - ---- the broadcast address on the interface. - -It is possible to use the special symbols \verb|'+'| and \verb|'-'| -instead of the broadcast address. 
In this case, the broadcast address -is derived by setting/resetting the host bits of the interface prefix. - -\vskip 1mm -\begin{NB} -Unlike \verb|ifconfig|, the \verb|ip| utility {\em does not\/} set any broadcast -address unless explicitly requested. -\end{NB} - - -\item \verb|label NAME| - ---- Each address may be tagged with a label string. -In order to preserve compatibility with Linux-2.0 net aliases, -this string must coincide with the name of the device or must be prefixed -with the device name followed by colon. - - -\item \verb|scope SCOPE_VALUE| - ---- the scope of the area where this address is valid. -The available scopes are listed in file \verb|/etc/iproute2/rt_scopes|. -Predefined scope values are: - - \begin{itemize} - \item \verb|global| --- the address is globally valid. - \item \verb|site| --- (IPv6 only) the address is site local, - i.e.\ it is valid inside this site. - \item \verb|link| --- the address is link local, i.e.\ - it is valid only on this device. - \item \verb|host| --- the address is valid only inside this host. - \end{itemize} - -Appendix~\ref{ADDR-SEL} (p.\pageref{ADDR-SEL} of this document) -contains more details on address scopes. - -\end{itemize} - -\paragraph{Examples:} -\begin{itemize} -\item \verb|ip addr add 127.0.0.1/8 dev lo brd + scope host| - ---- add the usual loopback address to the loopback device. - -\item \verb|ip addr add 10.0.0.1/24 brd + dev eth0 label eth0:Alias| - ---- add the address 10.0.0.1 with prefix length 24 (i.e.\ netmask -\verb|255.255.255.0|), standard broadcast and label \verb|eth0:Alias| -to the interface \verb|eth0|. -\end{itemize} - - -\subsection{{\tt ip address delete} --- delete a protocol address} - -\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|. - -\paragraph{Arguments:} coincide with the arguments of \verb|ip addr add|. -The device name is a required argument. The rest are optional. -If no arguments are given, the first address is deleted. 
- -\paragraph{Examples:} -\begin{itemize} -\item \verb|ip addr del 127.0.0.1/8 dev lo| - ---- deletes the loopback address from the loopback device. -It would be best not to repeat this experiment. - -\item Disable IP on the interface \verb|eth0|: -\begin{verbatim} - while ip -f inet addr del dev eth0; do - : nothing - done -\end{verbatim} -Another method to disable IP on an interface using {\tt ip addr flush} -may be found in sec.\ref{IP-ADDR-FLUSH}, p.\pageref{IP-ADDR-FLUSH}. - -\end{itemize} - - -\subsection{{\tt ip address show} --- display protocol addresses} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|, -\verb|l|. - -\paragraph{Arguments:} - -\begin{itemize} -\item \verb|dev NAME| (default) - ---- the name of the device. - -\item \verb|scope SCOPE_VAL| - ---- only list addresses with this scope. - -\item \verb|to PREFIX| - ---- only list addresses matching this prefix. - -\item \verb|label PATTERN| - ---- only list addresses with labels matching the \verb|PATTERN|. -\verb|PATTERN| is a usual shell style pattern. - - -\item \verb|dynamic| and \verb|permanent| - ---- (IPv6 only) only list addresses installed due to stateless -address configuration or only list permanent (not dynamic) addresses. - -\item \verb|tentative| - ---- (IPv6 only) only list addresses which did not pass duplicate -address detection. - -\item \verb|deprecated| - ---- (IPv6 only) only list deprecated addresses. - - -\item \verb|primary| and \verb|secondary| - ---- only list primary (or secondary) addresses. 
- -\end{itemize} - - -\paragraph{Output format:} - -\begin{verbatim} -kuznet@alisa:~ $ ip addr ls eth0 -3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 - link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff - inet 193.233.7.90/24 brd 193.233.7.255 scope global eth0 - inet6 3ffe:2400:0:1:2a0:ccff:fe66:1878/64 scope global dynamic - valid_lft forever preferred_lft 604746sec - inet6 fe80::2a0:ccff:fe66:1878/10 scope link -kuznet@alisa:~ $ -\end{verbatim} - -The first two lines coincide with the output of \verb|ip link ls|. -It is natural to interpret link layer addresses -as addresses of the protocol family \verb|AF_PACKET|. - -Then the list of IP and IPv6 addresses follows, accompanied by -additional address attributes: scope value (see Sec.\ref{IP-ADDR-ADD}, -p.\pageref{IP-ADDR-ADD} above), flags and the address label. - -Address flags are set by the kernel and cannot be changed -administratively. Currently, the following flags are defined: - -\begin{enumerate} -\item \verb|secondary| - ---- the address is not used when selecting the default source address -of outgoing packets (Cf.\ Appendix~\ref{ADDR-SEL}, p.\pageref{ADDR-SEL}.). -An IP address becomes secondary if another address with the same -prefix bits already exists. The first address is primary. -It is the leader of the group of all secondary addresses. When the leader -is deleted, all secondaries are purged too. -There is a tweak in \verb|/proc/sys/net/ipv4/conf/<dev>/promote_secondaries| -which activates promotion of secondaries when a primary is deleted. -To permanently enable this feature on all devices add -\verb|net.ipv4.conf.all.promote_secondaries=1| to \verb|/etc/sysctl.conf|. -This tweak is available in linux 2.6.15 and later. - - -\item \verb|dynamic| - ---- the address was created due to stateless autoconfiguration~\cite{RFC-ADDRCONF}. -In this case the output also contains information on times, when -the address is still valid. 
After \verb|preferred_lft| expires the address is -moved to the deprecated state. After \verb|valid_lft| expires the address -is finally invalidated. - -\item \verb|deprecated| - ---- the address is deprecated, i.e.\ it is still valid, but cannot -be used by newly created connections. - -\item \verb|tentative| - ---- the address is not used because duplicate address detection~\cite{RFC-ADDRCONF} -is still not complete or failed. - -\end{enumerate} - - -\subsection{{\tt ip address flush} --- flush protocol addresses} -\label{IP-ADDR-FLUSH} - -\paragraph{Abbreviations:} \verb|flush|, \verb|f|. - -\paragraph{Description:}This command flushes the protocol addresses -selected by some criteria. - -\paragraph{Arguments:} This command has the same arguments as \verb|show|. -The difference is that it does not run when no arguments are given. - -\paragraph{Warning:} This command (and other \verb|flush| commands -described below) is pretty dangerous. If you make a mistake, it will -not forgive it, but will cruelly purge all the addresses. - -\paragraph{Statistics:} With the \verb|-statistics| option, the command -becomes verbose. It prints out the number of deleted addresses and the number -of rounds made to flush the address list. If this option is given -twice, \verb|ip addr flush| also dumps all the deleted addresses -in the format described in the previous subsection. 
- -\paragraph{Example:} Delete all the addresses from the private network -10.0.0.0/8: -\begin{verbatim} -netadm@amber:~ # ip -s -s a f to 10/8 -2: dummy inet 10.7.7.7/16 brd 10.7.255.255 scope global dummy -3: eth0 inet 10.10.7.7/16 brd 10.10.255.255 scope global eth0 -4: eth1 inet 10.8.7.7/16 brd 10.8.255.255 scope global eth1 - -*** Round 1, deleting 3 addresses *** -*** Flush is complete after 1 round *** -netadm@amber:~ # -\end{verbatim} -Another instructive example is disabling IP on all the Ethernets: -\begin{verbatim} -netadm@amber:~ # ip -4 addr flush label "eth*" -\end{verbatim} -And the last example shows how to flush all the IPv6 addresses -acquired by the host from stateless address autoconfiguration -after you enabled forwarding or disabled autoconfiguration. -\begin{verbatim} -netadm@amber:~ # ip -6 addr flush dynamic -\end{verbatim} - - - -\section{{\tt ip neighbour} --- neighbour/arp tables management} - -\paragraph{Abbreviations:} \verb|neighbour|, \verb|neighbor|, \verb|neigh|, -\verb|n|. - -\paragraph{Object:} \verb|neighbour| objects establish bindings between protocol -addresses and link layer addresses for hosts sharing the same link. -Neighbour entries are organized into tables. The IPv4 neighbour table -is known by another name --- the ARP table. - -The corresponding commands display neighbour bindings -and their properties, add new neighbour entries and delete old ones. - -\paragraph{Commands:} \verb|add|, \verb|change|, \verb|replace|, -\verb|delete|, \verb|flush| and \verb|show| (or \verb|list|). - -\paragraph{See also:} Appendix~\ref{PROXY-NEIGH}, p.\pageref{PROXY-NEIGH} -describes how to manage proxy ARP/NDISC with the \verb|ip| utility. 
- - -\subsection{{\tt ip neighbour add} --- add a new neighbour entry\\ - {\tt ip neighbour change} --- change an existing entry\\ - {\tt ip neighbour replace} --- add a new entry or change an existing one} - -\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|; -\verb|replace|, \verb|repl|. - -\paragraph{Description:} These commands create new neighbour records -or update existing ones. - -\paragraph{Arguments:} - -\begin{itemize} -\item \verb|to ADDRESS| (default) - ---- the protocol address of the neighbour. It is either an IPv4 or IPv6 address. - -\item \verb|dev NAME| - ---- the interface to which this neighbour is attached. - - -\item \verb|lladdr LLADDRESS| - ---- the link layer address of the neighbour. \verb|LLADDRESS| can also be -\verb|null|. - -\item \verb|nud NUD_STATE| - ---- the state of the neighbour entry. \verb|nud| is an abbreviation for ``Neighbour -Unreachability Detection''. The state can take one of the following values: - -\begin{enumerate} -\item \verb|permanent| --- the neighbour entry is valid forever and can only be removed -administratively. -\item \verb|noarp| --- the neighbour entry is valid. No attempts to validate -this entry will be made but it can be removed when its lifetime expires. -\item \verb|reachable| --- the neighbour entry is valid until the reachability -timeout expires. -\item \verb|stale| --- the neighbour entry is valid but suspicious. -This option to \verb|ip neigh| does not change the neighbour state if -it was valid and the address is not changed by this command. -\end{enumerate} - -\end{itemize} - -\paragraph{Examples:} -\begin{itemize} -\item \verb|ip neigh add 10.0.0.3 lladdr 0:0:0:0:0:1 dev eth0 nud perm| - ---- add a permanent ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|. - -\item \verb|ip neigh chg 10.0.0.3 dev eth0 nud reachable| - ---- change its state to \verb|reachable|. 
-\end{itemize} - - -\subsection{{\tt ip neighbour delete} --- delete a neighbour entry} - -\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|. - -\paragraph{Description:} This command invalidates a neighbour entry. - -\paragraph{Arguments:} The arguments are the same as with \verb|ip neigh add|, -except that \verb|lladdr| and \verb|nud| are ignored. - - -\paragraph{Example:} -\begin{itemize} -\item \verb|ip neigh del 10.0.0.3 dev eth0| - ---- invalidate an ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|. - -\end{itemize} - -\begin{NB} - The deleted neighbour entry will not disappear from the tables - immediately. If it is in use it cannot be deleted until the last - client releases it. Otherwise it will be destroyed during - the next garbage collection. -\end{NB} - - -\paragraph{Warning:} Attempts to delete or manually change -a \verb|noarp| entry created by the kernel may result in unpredictable behaviour. -Particularly, the kernel may try to resolve this address even -on a \verb|NOARP| interface or if the address is multicast or broadcast. - - -\subsection{{\tt ip neighbour show} --- list neighbour entries} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|. - -\paragraph{Description:}This command displays neighbour tables. - -\paragraph{Arguments:} - -\begin{itemize} - -\item \verb|to ADDRESS| (default) - ---- the prefix selecting the neighbours to list. - -\item \verb|dev NAME| - ---- only list the neighbours attached to this device. - -\item \verb|unused| - ---- only list neighbours which are not currently in use. - -\item \verb|nud NUD_STATE| - ---- only list neighbour entries in this state. \verb|NUD_STATE| takes -values listed below or the special value \verb|all| which means all states. -This option may occur more than once. If this option is absent, \verb|ip| -lists all entries except for \verb|none| and \verb|noarp|. 
- -\end{itemize} - - -\paragraph{Output format:} - -\begin{verbatim} -kuznet@alisa:~ $ ip neigh ls -:: dev lo lladdr 00:00:00:00:00:00 nud noarp -fe80::200:cff:fe76:3f85 dev eth0 lladdr 00:00:0c:76:3f:85 router \ - nud stale -0.0.0.0 dev lo lladdr 00:00:00:00:00:00 nud noarp -193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 nud reachable -193.233.7.85 dev eth0 lladdr 00:e0:1e:63:39:00 nud stale -kuznet@alisa:~ $ -\end{verbatim} - -The first word of each line is the protocol address of the neighbour. -Then the device name follows. The rest of the line describes the contents of -the neighbour entry identified by the pair (device, address). - -\verb|lladdr| is the link layer address of the neighbour. - -\verb|nud| is the state of the ``neighbour unreachability detection'' machine -for this entry. The detailed description of the neighbour -state machine can be found in~\cite{RFC-NDISC}. Here is the full list -of the states with short descriptions: - -\begin{enumerate} -\item\verb|none| --- the state of the neighbour is void. -\item\verb|incomplete| --- the neighbour is in the process of resolution. -\item\verb|reachable| --- the neighbour is valid and apparently reachable. -\item\verb|stale| --- the neighbour is valid, but is probably already -unreachable, so the kernel will try to check it at the first transmission. -\item\verb|delay| --- a packet has been sent to the stale neighbour and the kernel is waiting -for confirmation. -\item\verb|probe| --- the delay timer expired but no confirmation was received. -The kernel has started to probe the neighbour with ARP/NDISC messages. -\item\verb|failed| --- resolution has failed. -\item\verb|noarp| --- the neighbour is valid. No attempts to check the entry -will be made. -\item\verb|permanent| --- it is a \verb|noarp| entry, but only the administrator -may remove the entry from the neighbour table. -\end{enumerate} - -The link layer address is valid in all states except for \verb|none|, -\verb|failed| and \verb|incomplete|. 
- -IPv6 neighbours can be marked with the additional flag \verb|router| -which means that the neighbour introduced itself as an IPv6 router~\cite{RFC-NDISC}. - -\paragraph{Statistics:} The \verb|-statistics| option displays some usage -statistics, f.e.\ - -\begin{verbatim} -kuznet@alisa:~ $ ip -s n ls 193.233.7.254 -193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \ - nud reachable -kuznet@alisa:~ $ -\end{verbatim} - -Here \verb|ref| is the number of users of this entry -and \verb|used| is a triplet of time intervals in seconds -separated by slashes. In this case they show that: - -\begin{enumerate} -\item the entry was used 12 seconds ago. -\item the entry was confirmed 13 seconds ago. -\item the entry was updated 20 seconds ago. -\end{enumerate} - -\subsection{{\tt ip neighbour flush} --- flush neighbour entries} - -\paragraph{Abbreviations:} \verb|flush|, \verb|f|. - -\paragraph{Description:}This command flushes neighbour tables, selecting -entries to flush by some criteria. - -\paragraph{Arguments:} This command has the same arguments as \verb|show|. -The differences are that it does not run when no arguments are given, -and that the default neighbour states to be flushed do not include -\verb|permanent| and \verb|noarp|. - - -\paragraph{Statistics:} With the \verb|-statistics| option, the command -becomes verbose. It prints out the number of deleted neighbours and the number -of rounds made to flush the neighbour table. If the option is given -twice, \verb|ip neigh flush| also dumps all the deleted neighbours -in the format described in the previous subsection. 
- -\paragraph{Example:} -\begin{verbatim} -netadm@alisa:~ # ip -s -s n f 193.233.7.254 -193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \ - nud reachable - -*** Round 1, deleting 1 entries *** -*** Flush is complete after 1 round *** -netadm@alisa:~ # -\end{verbatim} - - -\section{{\tt ip route} --- routing table management} -\label{IP-ROUTE} - -\paragraph{Abbreviations:} \verb|route|, \verb|ro|, \verb|r|. - -\paragraph{Object:} \verb|route| entries in the kernel routing tables keep -information about paths to other networked nodes. - -Each route entry has a {\em key\/} consisting of a {\em prefix\/} -(i.e.\ a pair containing a network address and the length of its mask) and, -optionally, the TOS value. An IP packet matches the route if the highest -bits of its destination address are equal to the route prefix at least -up to the prefix length and if the TOS of the route is zero or equal to -the TOS of the packet. - -If several routes match the packet, the following pruning rules -are used to select the best one (see~\cite{RFC1812}): -\begin{enumerate} -\item The longest matching prefix is selected. All shorter ones -are dropped. - -\item If the TOS of some route with the longest prefix is equal to the TOS -of the packet, the routes with different TOS are dropped. - -If no exact TOS match was found and routes with TOS=0 exist, -the rest of routes are pruned. - -Otherwise, the route lookup fails. - -\item If several routes remain after the previous steps, then -the routes with the best preference values are selected. - -\item If we still have several routes, then the {\em first\/} of them -is selected. - -\begin{NB} - Note the ambiguity of the last step. Unfortunately, Linux - historically allows such a bizarre situation. The sense of the -word ``first'' depends on the order of route additions and it is practically -impossible to maintain a bundle of such routes in this order. 
-\end{NB} - -For simplicity we will limit ourselves to the case where such a situation -is impossible and routes are uniquely identified by the triplet -\{prefix, tos, preference\}. Actually, it is impossible to create -non-unique routes with \verb|ip| commands described in this section. - -One useful exception to this rule is the default route on non-forwarding -hosts. It is ``officially'' allowed to have several fallback routes -when several routers are present on directly connected networks. -In this case, Linux-2.2 makes ``dead gateway detection''~\cite{RFC1122} -controlled by neighbour unreachability detection and by advice -from transport protocols to select a working router, so the order -of the routes is not essential. However, in this case, -fiddling with default routes manually is not recommended. Use the Router Discovery -protocol (see Appendix~\ref{EXAMPLE-SETUP}, p.\pageref{EXAMPLE-SETUP}) -instead. Actually, Linux-2.2 IPv6 does not give user level applications -any access to default routes. -\end{enumerate} - -Certainly, the steps above are not performed exactly -in this sequence. Instead, the routing table in the kernel is kept -in some data structure to achieve the final result -with minimal cost. However, not depending on a particular -routing algorithm implemented in the kernel, we can summarize -the statements above as: a route is identified by the triplet -\{prefix, tos, preference\}. This {\em key\/} lets us locate -the route in the routing table. - -\paragraph{Route attributes:} Each route key refers to a routing -information record containing -the data required to deliver IP packets (f.e.\ output device and -next hop router) and some optional attributes (f.e. the path MTU or -the preferred source address when communicating with this destination). -These attributes are described in the following subsection. 
- -\paragraph{Route types:} \label{IP-ROUTE-TYPES} -It is important that the set -of required and optional attributes depends on the route {\em type\/}. -The most important route type -is \verb|unicast|. It describes real paths to other hosts. -As a rule, common routing tables contain only such routes. However, -there are other types of routes with different semantics. The -full list of types understood by Linux-2.2 is: -\begin{itemize} -\item \verb|unicast| --- the route entry describes real paths to the -destinations covered by the route prefix. -\item \verb|unreachable| --- these destinations are unreachable. Packets -are discarded and the ICMP message {\em host unreachable\/} is generated. -The local senders get an \verb|EHOSTUNREACH| error. -\item \verb|blackhole| --- these destinations are unreachable. Packets -are discarded silently. The local senders get an \verb|EINVAL| error. -\item \verb|prohibit| --- these destinations are unreachable. Packets -are discarded and the ICMP message {\em communication administratively -prohibited\/} is generated. The local senders get an \verb|EACCES| error. -\item \verb|local| --- the destinations are assigned to this -host. The packets are looped back and delivered locally. -\item \verb|broadcast| --- the destinations are broadcast addresses. -The packets are sent as link broadcasts. -\item \verb|throw| --- a special control route used together with policy -rules (see sec.\ref{IP-RULE}, p.\pageref{IP-RULE}). If such a route is selected, lookup -in this table is terminated pretending that no route was found. -Without policy routing it is equivalent to the absence of the route in the routing -table. The packets are dropped and the ICMP message {\em net unreachable\/} -is generated. The local senders get an \verb|ENETUNREACH| error. -\item \verb|nat| --- a special NAT route. 
Destinations covered by the prefix -are considered to be dummy (or external) addresses which require translation -to real (or internal) ones before forwarding. The addresses to translate to -are selected with the attribute \verb|via|. More about NAT is -in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}. -\item \verb|anycast| --- ({\em not implemented\/}) the destinations are -{\em anycast\/} addresses assigned to this host. They are mainly equivalent -to \verb|local| with one difference: such addresses are invalid when used -as the source address of any packet. -\item \verb|multicast| --- a special type used for multicast routing. -It is not present in normal routing tables. -\end{itemize} - -\paragraph{Route tables:} Linux-2.2 can pack routes into several routing -tables identified by a number in the range from 1 to 255 or by -name from the file \verb|/etc/iproute2/rt_tables|. By default all normal -routes are inserted into the \verb|main| table (ID 254) and the kernel only uses -this table when calculating routes. - -Actually, one other table always exists, which is invisible but -even more important. It is the \verb|local| table (ID 255). This table -consists of routes for local and broadcast addresses. The kernel maintains -this table automatically and the administrator usually need not modify it -or even look at it. - -The multiple routing tables enter the game when {\em policy routing\/} -is used. See sec.\ref{IP-RULE}, p.\pageref{IP-RULE}. -In this case, the table identifier effectively becomes -one more parameter, which should be added to the triplet -\{prefix, tos, preference\} to uniquely identify the route. - - -\subsection{{\tt ip route add} --- add a new route\\ - {\tt ip route change} --- change a route\\ - {\tt ip route replace} --- change a route or add a new one} -\label{IP-ROUTE-ADD} - -\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|; - \verb|replace|, \verb|repl|. 
- - -\paragraph{Arguments:} -\begin{itemize} -\item \verb|to PREFIX| or \verb|to TYPE PREFIX| (default) - ---- the destination prefix of the route. If \verb|TYPE| is omitted, -\verb|ip| assumes type \verb|unicast|. Other values of \verb|TYPE| -are listed above. \verb|PREFIX| is an IP or IPv6 address optionally followed -by a slash and the prefix length. If the length of the prefix is missing, -\verb|ip| assumes a full-length host route. There is also a special -\verb|PREFIX| --- \verb|default| --- which is equivalent to IP \verb|0/0| or -to IPv6 \verb|::/0|. - -\item \verb|tos TOS| or \verb|dsfield TOS| - ---- the Type Of Service (TOS) key. This key has no associated mask and -the longest match is understood as: First, compare the TOS -of the route and of the packet. If they are not equal, then the packet -may still match a route with a zero TOS. \verb|TOS| is either an 8 bit hexadecimal -number or an identifier from {\tt /etc/iproute2/rt\_dsfield}. - - -\item \verb|metric NUMBER| or \verb|preference NUMBER| - ---- the preference value of the route. \verb|NUMBER| is an arbitrary 32bit number. - -\item \verb|table TABLEID| - ---- the table to add this route to. -\verb|TABLEID| may be a number or a string from the file -\verb|/etc/iproute2/rt_tables|. If this parameter is omitted, -\verb|ip| assumes the \verb|main| table, with the exception of -\verb|local|, \verb|broadcast| and \verb|nat| routes, which are -put into the \verb|local| table by default. - -\item \verb|dev NAME| - ---- the output device name. - -\item \verb|via ADDRESS| - ---- the address of the nexthop router. Actually, the sense of this field depends -on the route type. For normal \verb|unicast| routes it is either the true nexthop -router or, if it is a direct route installed in BSD compatibility mode, -it can be a local address of the interface. -For NAT routes it is the first address of the block of translated IP destinations. 
- -\item \verb|src ADDRESS| - ---- the source address to prefer when sending to the destinations -covered by the route prefix. - -\item \verb|realm REALMID| - ---- the realm to which this route is assigned. -\verb|REALMID| may be a number or a string from the file -\verb|/etc/iproute2/rt_realms|. Sec.\ref{RT-REALMS} (p.\pageref{RT-REALMS}) -contains more information on realms. - -\item \verb|mtu MTU| or \verb|mtu lock MTU| - ---- the MTU along the path to the destination. If the modifier \verb|lock| is -not used, the MTU may be updated by the kernel due to Path MTU Discovery. -If the modifier \verb|lock| is used, no path MTU discovery will be tried, -all packets will be sent without the DF bit in IPv4 case -or fragmented to MTU for IPv6. - -\item \verb|window NUMBER| - ---- the maximal window for TCP to advertise to these destinations, -measured in bytes. It limits maximal data bursts that our TCP -peers are allowed to send to us. - -\item \verb|rtt NUMBER| - ---- the initial RTT (``Round Trip Time'') estimate. - - -\item \verb|rttvar NUMBER| - ---- \threeonly the initial RTT variance estimate. - - -\item \verb|ssthresh NUMBER| - ---- \threeonly an estimate for the initial slow start threshold. - - -\item \verb|cwnd NUMBER| - ---- \threeonly the clamp for congestion window. It is ignored if the \verb|lock| - flag is not used. - - -\item \verb|advmss NUMBER| - ---- \threeonly the MSS (``Maximal Segment Size'') to advertise to these - destinations when establishing TCP connections. If it is not given, - Linux uses a default value calculated from the first hop device MTU. - -\begin{NB} - If the path to these destinations is asymmetric, this guess may be wrong. -\end{NB} - -\item \verb|reordering NUMBER| - ---- \threeonly Maximal reordering on the path to this destination. - If it is not given, Linux uses the value selected with \verb|sysctl| - variable \verb|net/ipv4/tcp_reordering|. 
- -\item \verb|hoplimit NUMBER| - ---- [2.5.74+ only] Maximum number of hops on the path to this destination. - The default is the value selected with the \verb|sysctl| variable - \verb|net/ipv4/ip_default_ttl|. - -\item \verb|initcwnd NUMBER| ---- [2.5.70+ only] Initial congestion window size for connections to - this destination. Actual window size is this value multiplied by the - MSS (``Maximal Segment Size'') for the same connection. The default is - zero, meaning to use the values specified in~\cite{RFC2414}. - -\item \verb|initrwnd NUMBER| - ---- [2.6.33+ only] Initial receive window size for connections to - this destination. The actual window size is this value multiplied - by the MSS (``Maximal Segment Size'') of the connection. The default - value is zero, meaning to use Slow Start value. - -\item \verb|nexthop NEXTHOP| - ---- the nexthop of a multipath route. \verb|NEXTHOP| is a complex value -with its own syntax similar to the top level argument lists: -\begin{itemize} -\item \verb|via ADDRESS| is the nexthop router. -\item \verb|dev NAME| is the output device. -\item \verb|weight NUMBER| is a weight for this element of a multipath -route reflecting its relative bandwidth or quality. -\end{itemize} - -\item \verb|scope SCOPE_VAL| - ---- the scope of the destinations covered by the route prefix. -\verb|SCOPE_VAL| may be a number or a string from the file -\verb|/etc/iproute2/rt_scopes|. -If this parameter is omitted, -\verb|ip| assumes scope \verb|global| for all gatewayed \verb|unicast| -routes, scope \verb|link| for direct \verb|unicast| and \verb|broadcast| routes -and scope \verb|host| for \verb|local| routes. - -\item \verb|protocol RTPROTO| - ---- the routing protocol identifier of this route. -\verb|RTPROTO| may be a number or a string from the file -\verb|/etc/iproute2/rt_protos|. 
If the routing protocol ID is -not given, \verb|ip| assumes protocol \verb|boot| (i.e.\ -it assumes the route was added by someone who doesn't -understand what they are doing). Several protocol values have a fixed interpretation. -Namely: -\begin{itemize} -\item \verb|redirect| --- the route was installed due to an ICMP redirect. -\item \verb|kernel| --- the route was installed by the kernel during -autoconfiguration. -\item \verb|boot| --- the route was installed during the bootup sequence. -If a routing daemon starts, it will purge all of them. -\item \verb|static| --- the route was installed by the administrator -to override dynamic routing. Routing daemon will respect them -and, probably, even advertise them to its peers. -\item \verb|ra| --- the route was installed by Router Discovery protocol. -\end{itemize} -The rest of the values are not reserved and the administrator is free -to assign (or not to assign) protocol tags. At least, routing -daemons should take care of setting some unique protocol values, -f.e.\ as they are assigned in \verb|rtnetlink.h| or in \verb|rt_protos| -database. - - -\item \verb|onlink| - ---- pretend that the nexthop is directly attached to this link, -even if it does not match any interface prefix. One application of this -option may be found in~\cite{IP-TUNNELS}. - -\item \verb|pref PREF| - ---- the IPv6 route preference. -\verb|PREF| is a string specifying the route preference as defined in -RFC4191 for Router Discovery messages. Namely: -\begin{itemize} -\item \verb|low| --- the route has the lowest priority. -\item \verb|medium| --- the route has the default priority. -\item \verb|high| --- the route has the highest priority. -\end{itemize} - -\end{itemize} - - -\begin{NB} - Actually there are more commands: \verb|prepend| does the same - thing as classic \verb|route add|, i.e.\ adds a route, even if another - route to the same destination exists. Its opposite case is \verb|append|, - which adds the route to the end of the list. 
Avoid these - features. -\end{NB} -\begin{NB} - More sad news, IPv6 only understands the \verb|append| command correctly. - All the others are translated into \verb|append| commands. Certainly, - this will change in the future. -\end{NB} - -\paragraph{Examples:} -\begin{itemize} -\item add a plain route to network 10.0.0/24 via gateway 193.233.7.65 -\begin{verbatim} - ip route add 10.0.0/24 via 193.233.7.65 -\end{verbatim} -\item change it to a direct route via the \verb|dummy| device -\begin{verbatim} - ip ro chg 10.0.0/24 dev dummy -\end{verbatim} -\item add a default multipath route splitting the load between \verb|ppp0| -and \verb|ppp1| -\begin{verbatim} - ip route add default scope global nexthop dev ppp0 \ - nexthop dev ppp1 -\end{verbatim} -Note the scope value. It is not necessary but it informs the kernel -that this route is gatewayed rather than direct. Actually, if you -know the addresses of remote endpoints it would be better to use the -\verb|via| parameter. -\item announce that the address 192.203.80.144 is not a real one, but -should be translated to 193.233.7.83 before forwarding -\begin{verbatim} - ip route add nat 192.203.80.144 via 193.233.7.83 -\end{verbatim} -Backward translation is setup with policy rules described -in the following section (sec.\ref{IP-RULE}, p.\pageref{IP-RULE}). -\end{itemize} - -\subsection{{\tt ip route delete} --- delete a route} - -\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|. - -\paragraph{Arguments:} \verb|ip route del| has the same arguments as -\verb|ip route add|, but their semantics are a bit different. - -Key values (\verb|to|, \verb|tos|, \verb|preference| and \verb|table|) -select the route to delete. If optional attributes are present, \verb|ip| -verifies that they coincide with the attributes of the route to delete. -If no route with the given key and attributes was found, \verb|ip route del| -fails. 
-\begin{NB} -Linux-2.0 had the option to delete a route selected only by prefix address, -ignoring its length (i.e.\ netmask). This option no longer exists -because it was ambiguous. However, look at {\tt ip route flush} -(sec.\ref{IP-ROUTE-FLUSH}, p.\pageref{IP-ROUTE-FLUSH}) which -provides similar and even richer functionality. -\end{NB} - -\paragraph{Example:} -\begin{itemize} -\item delete the multipath route created by the command in previous subsection -\begin{verbatim} - ip route del default scope global nexthop dev ppp0 \ - nexthop dev ppp1 -\end{verbatim} -\end{itemize} - - - -\subsection{{\tt ip route show} --- list routes} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. - -\paragraph{Description:} the command displays the contents of the routing tables -or the route(s) selected by some criteria. - - -\paragraph{Arguments:} -\begin{itemize} -\item \verb|to SELECTOR| (default) - ---- only select routes from the given range of destinations. \verb|SELECTOR| -consists of an optional modifier (\verb|root|, \verb|match| or \verb|exact|) -and a prefix. \verb|root PREFIX| selects routes with prefixes not shorter -than \verb|PREFIX|. F.e.\ \verb|root 0/0| selects the entire routing table. -\verb|match PREFIX| selects routes with prefixes not longer than -\verb|PREFIX|. F.e.\ \verb|match 10.0/16| selects \verb|10.0/16|, -\verb|10/8| and \verb|0/0|, but it does not select \verb|10.1/16| and -\verb|10.0.0/24|. And \verb|exact PREFIX| (or just \verb|PREFIX|) -selects routes with this exact prefix. If neither of these options -are present, \verb|ip| assumes \verb|root 0/0| i.e.\ it lists the entire table. - - -\item \verb|tos TOS| or \verb|dsfield TOS| - - --- only select routes with the given TOS. - - -\item \verb|table TABLEID| - - --- show the routes from this table(s). The default setting is to show -\verb|table| \verb|main|. 
\verb|TABLEID| may either be the ID of a real table -or one of the special values: - \begin{itemize} - \item \verb|all| --- list all of the tables. - \item \verb|cache| --- dump the routing cache. - \end{itemize} -\begin{NB} - IPv6 has a single table. However, splitting it into \verb|main|, \verb|local| - and \verb|cache| is emulated by the \verb|ip| utility. -\end{NB} - -\item \verb|cloned| or \verb|cached| - ---- list cloned routes i.e.\ routes which were dynamically forked from -other routes because some route attribute (f.e.\ MTU) was updated. -Actually, it is equivalent to \verb|table cache|. - -\item \verb|from SELECTOR| - ---- the same syntax as for \verb|to|, but it binds the source address range -rather than destinations. Note that the \verb|from| option only works with -cloned routes. - -\item \verb|protocol RTPROTO| - ---- only list routes of this protocol. - - -\item \verb|scope SCOPE_VAL| - ---- only list routes with this scope. - -\item \verb|type TYPE| - ---- only list routes of this type. - -\item \verb|dev NAME| - ---- only list routes going via this device. - -\item \verb|via PREFIX| - ---- only list routes going via the nexthop routers selected by \verb|PREFIX|. - -\item \verb|src PREFIX| - ---- only list routes with preferred source addresses selected -by \verb|PREFIX|. - -\item \verb|realm REALMID| or \verb|realms FROMREALM/TOREALM| - ---- only list routes with these realms. 
- -\end{itemize} - -\paragraph{Examples:} Let us count routes of protocol \verb|gated/bgp| -on a router: -\begin{verbatim} -kuznet@amber:~ $ ip ro ls proto gated/bgp | wc - 1413 9891 79010 -kuznet@amber:~ $ -\end{verbatim} -To count the size of the routing cache, we have to use the \verb|-o| option -because cached attributes can take more than one line of output: -\begin{verbatim} -kuznet@amber:~ $ ip -o ro ls cloned | wc - 159 2543 18707 -kuznet@amber:~ $ -\end{verbatim} - - -\paragraph{Output format:} The output of this command consists -of per route records separated by line feeds. -However, some records may consist -of more than one line: particularly, this is the case when the route -is cloned or you requested additional statistics. If the -\verb|-o| option was given, then line feeds separating lines inside -records are replaced with the backslash sign. - -The output has the same syntax as arguments given to {\tt ip route add}, -so that it can be understood easily. F.e.\ -\begin{verbatim} -kuznet@amber:~ $ ip ro ls 193.233.7/24 -193.233.7.0/24 dev eth0 proto gated/conn scope link \ - src 193.233.7.65 realms inr.ac -kuznet@amber:~ $ -\end{verbatim} - -If you list cloned entries, the output contains other attributes which -are evaluated during route calculation and updated during route -lifetime. An example of the output is: -\begin{verbatim} -kuznet@amber:~ $ ip ro ls 193.233.7.82 tab cache -193.233.7.82 from 193.233.7.82 dev eth0 src 193.233.7.65 \ - realms inr.ac/inr.ac - cache <src-direct,redirect> mtu 1500 rtt 300 iif eth0 -193.233.7.82 dev eth0 src 193.233.7.65 realms inr.ac - cache mtu 1500 rtt 300 -kuznet@amber:~ $ -\end{verbatim} -\begin{NB} - \label{NB-strange-route} - The route looks a bit strange, doesn't it? Did you notice that - it is a path from 193.233.7.82 back to 193.233.7.82? Well, you will - see in the section on \verb|ip route get| (p.\pageref{NB-nature-of-strangeness}) - how it appeared. 
-\end{NB} -The second line, starting with the word \verb|cache|, shows -additional attributes which normal routes do not possess. -Cached flags are summarized in angle brackets: -\begin{itemize} -\item \verb|local| --- packets are delivered locally. -It stands for loopback unicast routes, for broadcast routes -and for multicast routes, if this host is a member of the corresponding -group. - -\item \verb|reject| --- the path is bad. Any attempt to use it results -in an error. See attribute \verb|error| below (p.\pageref{IP-ROUTE-GET-error}). - -\item \verb|mc| --- the destination is multicast. - -\item \verb|brd| --- the destination is broadcast. - -\item \verb|src-direct| --- the source is on a directly connected -interface. - -\item \verb|redirected| --- the route was created by an ICMP Redirect. - -\item \verb|redirect| --- packets going via this route will -trigger an ICMP redirect. - -\item \verb|fastroute| --- the route is eligible to be used for fastroute. - -\item \verb|equalize| --- make packet by packet randomization -along this path. - -\item \verb|dst-nat| --- the destination address requires translation. - -\item \verb|src-nat| --- the source address requires translation. - -\item \verb|masq| --- the source address requires masquerading. -This feature disappeared in linux-2.4. - -\item \verb|notify| --- ({\em not implemented}) change/deletion -of this route will trigger RTNETLINK notification. -\end{itemize} - -Then some optional attributes follow: -\begin{itemize} -\item \verb|error| --- on \verb|reject| routes it is error code -returned to local senders when they try to use this route. -These error codes are translated into ICMP error codes, sent to remote -senders, according to the rules described above in the subsection -devoted to route types (p.\pageref{IP-ROUTE-TYPES}). -\label{IP-ROUTE-GET-error} - -\item \verb|expires| --- this entry will expire after this timeout. 
- -\item \verb|iif| --- the packets for this path are expected to arrive -on this interface. -\end{itemize} - -\paragraph{Statistics:} With the \verb|-statistics| option, more -information about this route is shown: -\begin{itemize} -\item \verb|users| --- the number of users of this entry. -\item \verb|age| --- shows when this route was last used. -\item \verb|used| --- the number of lookups of this route since its creation. -\end{itemize} - -\subsection{{\tt ip route save} -- save routing tables} -\label{IP-ROUTE-SAVE} - -\paragraph{Description:} this command saves the contents of the routing -tables or the route(s) selected by some criteria to standard output. - -\paragraph{Arguments:} \verb|ip route save| has the same arguments as -\verb|ip route show|. - -\paragraph{Example:} This saves all the routes to the {\tt saved\_routes} -file: -\begin{verbatim} -dan@caffeine:~ # ip route save > saved_routes -\end{verbatim} - -\paragraph{Output format:} The format of the data stream provided by -\verb|ip route save| is that of \verb|rtnetlink|. See -\verb|rtnetlink(7)| for more information. - -\subsection{{\tt ip route restore} -- restore routing tables} -\label{IP-ROUTE-RESTORE} - -\paragraph{Description:} this command restores the contents of the routing -tables according to a data stream as provided by \verb|ip route save| via -standard input. Note that any routes already in the table are left unchanged. -Any routes in the input stream that already exist in the tables are ignored. - -\paragraph{Arguments:} This command takes no arguments. - -\paragraph{Example:} This restores all routes that were saved to the -{\tt saved\_routes} file: - -\begin{verbatim} -dan@caffeine:~ # ip route restore < saved_routes -\end{verbatim} - -\subsection{{\tt ip route flush} --- flush routing tables} -\label{IP-ROUTE-FLUSH} - -\paragraph{Abbreviations:} \verb|flush|, \verb|f|. - -\paragraph{Description:} this command flushes routes selected -by some criteria. 
- -\paragraph{Arguments:} the arguments have the same syntax and semantics -as the arguments of \verb|ip route show|, but routing tables are not -listed but purged. The only difference is the default action: \verb|show| -dumps all the IP main routing table but \verb|flush| prints the helper page. -The reason for this difference does not require any explanation, does it? - - -\paragraph{Statistics:} With the \verb|-statistics| option, the command -becomes verbose. It prints out the number of deleted routes and the number -of rounds made to flush the routing table. If the option is given -twice, \verb|ip route flush| also dumps all the deleted routes -in the format described in the previous subsection. - -\paragraph{Examples:} The first example flushes all the -gatewayed routes from the main table (f.e.\ after a routing daemon crash). -\begin{verbatim} -netadm@amber:~ # ip -4 ro flush scope global type unicast -\end{verbatim} -This option deserves to be put into a scriptlet \verb|routef|. -\begin{NB} -This option was described in the \verb|route(8)| man page borrowed -from BSD, but was never implemented in Linux. 
-\end{NB} - -The second example flushes all IPv6 cloned routes: -\begin{verbatim} -netadm@amber:~ # ip -6 -s -s ro flush cache -3ffe:2400::220:afff:fef4:c5d1 via 3ffe:2400::220:afff:fef4:c5d1 \ - dev eth0 metric 0 - cache used 2 age 12sec mtu 1500 rtt 300 -3ffe:2400::280:adff:feb7:8034 via 3ffe:2400::280:adff:feb7:8034 \ - dev eth0 metric 0 - cache used 2 age 15sec mtu 1500 rtt 300 -3ffe:2400::280:c8ff:fe59:5bcc via 3ffe:2400::280:c8ff:fe59:5bcc \ - dev eth0 metric 0 - cache users 1 used 1 age 23sec mtu 1500 rtt 300 -3ffe:2400:0:1:2a0:ccff:fe66:1878 via 3ffe:2400:0:1:2a0:ccff:fe66:1878 \ - dev eth1 metric 0 - cache used 2 age 20sec mtu 1500 rtt 300 -3ffe:2400:0:1:a00:20ff:fe71:fb30 via 3ffe:2400:0:1:a00:20ff:fe71:fb30 \ - dev eth1 metric 0 - cache used 2 age 33sec mtu 1500 rtt 300 -ff02::1 via ff02::1 dev eth1 metric 0 - cache users 1 used 1 age 45sec mtu 1500 rtt 300 - -*** Round 1, deleting 6 entries *** -*** Flush is complete after 1 round *** -netadm@amber:~ # ip -6 -s -s ro flush cache -Nothing to flush. -netadm@amber:~ # -\end{verbatim} - -The third example flushes BGP routing tables after a \verb|gated| -death. -\begin{verbatim} -netadm@amber:~ # ip ro ls proto gated/bgp | wc - 1408 9856 78730 -netadm@amber:~ # ip -s ro f proto gated/bgp - -*** Round 1, deleting 1408 entries *** -*** Flush is complete after 1 round *** -netadm@amber:~ # ip ro f proto gated/bgp -Nothing to flush. -netadm@amber:~ # ip ro ls proto gated/bgp -netadm@amber:~ # -\end{verbatim} - - -\subsection{{\tt ip route get} --- get a single route} -\label{IP-ROUTE-GET} - -\paragraph{Abbreviations:} \verb|get|, \verb|g|. - -\paragraph{Description:} this command gets a single route to a destination -and prints its contents exactly as the kernel sees it. - -\paragraph{Arguments:} -\begin{itemize} -\item \verb|to ADDRESS| (default) - ---- the destination address. - -\item \verb|from ADDRESS| - ---- the source address. - -\item \verb|tos TOS| or \verb|dsfield TOS| - ---- the Type Of Service. 
- -\item \verb|iif NAME| - ---- the device from which this packet is expected to arrive. - -\item \verb|oif NAME| - ---- force the output device on which this packet will be routed. - -\item \verb|connected| - ---- if no source address (option \verb|from|) was given, relookup -the route with the source set to the preferred address received from the first lookup. -If policy routing is used, it may be a different route. - -\end{itemize} - -Note that this operation is not equivalent to \verb|ip route show|. -\verb|show| shows existing routes. \verb|get| resolves them and -creates new clones if necessary. Essentially, \verb|get| -is equivalent to sending a packet along this path. -If the \verb|iif| argument is not given, the kernel creates a route -to output packets towards the requested destination. -This is equivalent to pinging the destination -with a subsequent {\tt ip route ls cache}, however, no packets are -actually sent. With the \verb|iif| argument, the kernel pretends -that a packet arrived from this interface and searches for -a path to forward the packet. - -\paragraph{Output format:} This command outputs routes in the same -format as \verb|ip route ls|. 
- -\paragraph{Examples:} -\begin{itemize} -\item Find a route to output packets to 193.233.7.82: -\begin{verbatim} -kuznet@amber:~ $ ip route get 193.233.7.82 -193.233.7.82 dev eth0 src 193.233.7.65 realms inr.ac - cache mtu 1500 rtt 300 -kuznet@amber:~ $ -\end{verbatim} - -\item Find a route to forward packets arriving on \verb|eth0| -from 193.233.7.82 and destined for 193.233.7.82: -\begin{verbatim} -kuznet@amber:~ $ ip r g 193.233.7.82 from 193.233.7.82 iif eth0 -193.233.7.82 from 193.233.7.82 dev eth0 src 193.233.7.65 \ - realms inr.ac/inr.ac - cache <src-direct,redirect> mtu 1500 rtt 300 iif eth0 -kuznet@amber:~ $ -\end{verbatim} -\begin{NB} - \label{NB-nature-of-strangeness} - This is the command that created the funny route from 193.233.7.82 - looped back to 193.233.7.82 (cf.\ NB on~p.\pageref{NB-strange-route}). - Note the \verb|redirect| flag on it. -\end{NB} - -\item Find a multicast route for packets arriving on \verb|eth0| -from host 193.233.7.82 and destined for multicast group 224.2.127.254 -(it is assumed that a multicast routing daemon is running. -In this case, it is \verb|pimd|) -\begin{verbatim} -kuznet@amber:~ $ ip r g 224.2.127.254 from 193.233.7.82 iif eth0 -multicast 224.2.127.254 from 193.233.7.82 dev lo \ - src 193.233.7.65 realms inr.ac/cosmos - cache <mc> iif eth0 Oifs: eth1 pimreg -kuznet@amber:~ $ -\end{verbatim} -This route differs from the ones seen before. It contains a ``normal'' part -and a ``multicast'' part. The normal part is used to deliver (or not to -deliver) the packet to local IP listeners. In this case the router -is not a member -of this group, so that route has no \verb|local| flag and only -forwards packets. The output device for such entries is always loopback. -The multicast part consists of an additional \verb|Oifs:| list showing -the output interfaces. -\end{itemize} - - -It is time for a more complicated example. 
Let us add an invalid -gatewayed route for a destination which is really directly connected: -\begin{verbatim} -netadm@alisa:~ # ip route add 193.233.7.98 via 193.233.7.254 -netadm@alisa:~ # ip route get 193.233.7.98 -193.233.7.98 via 193.233.7.254 dev eth0 src 193.233.7.90 - cache mtu 1500 rtt 3072 -netadm@alisa:~ # -\end{verbatim} -and probe it with ping: -\begin{verbatim} -netadm@alisa:~ # ping -n 193.233.7.98 -PING 193.233.7.98 (193.233.7.98) from 193.233.7.90 : 56 data bytes -From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98) -64 bytes from 193.233.7.98: icmp_seq=0 ttl=255 time=3.5 ms -From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98) -64 bytes from 193.233.7.98: icmp_seq=1 ttl=255 time=2.2 ms -64 bytes from 193.233.7.98: icmp_seq=2 ttl=255 time=0.4 ms -64 bytes from 193.233.7.98: icmp_seq=3 ttl=255 time=0.4 ms -64 bytes from 193.233.7.98: icmp_seq=4 ttl=255 time=0.4 ms -^C ---- 193.233.7.98 ping statistics --- -5 packets transmitted, 5 packets received, 0% packet loss -round-trip min/avg/max = 0.4/1.3/3.5 ms -netadm@alisa:~ # -\end{verbatim} -What happened? Router 193.233.7.254 understood that we have a much -better path to the destination and sent us an ICMP redirect message. -We may retry \verb|ip route get| to see what we have in the routing -tables now: -\begin{verbatim} -netadm@alisa:~ # ip route get 193.233.7.98 -193.233.7.98 dev eth0 src 193.233.7.90 - cache <redirected> mtu 1500 rtt 3072 -netadm@alisa:~ # -\end{verbatim} - - - -\section{{\tt ip rule} --- routing policy database management} -\label{IP-RULE} - -\paragraph{Abbreviations:} \verb|rule|, \verb|ru|. - -\paragraph{Object:} \verb|rule|s in the routing policy database control -the route selection algorithm. - -Classic routing algorithms used in the Internet make routing decisions -based only on the destination address of packets (and in theory, -but not in practice, on the TOS field). 
The seminal review of classic -routing algorithms and their modifications can be found in~\cite{RFC1812}. - -In some circumstances we want to route packets differently depending not only -on destination addresses, but also on other packet fields: source address, -IP protocol, transport protocol ports or even packet payload. -This task is called ``policy routing''. - -\begin{NB} - ``policy routing'' $\neq$ ``routing policy''. - -\noindent ``policy routing'' $=$ ``cunning routing''. - -\noindent ``routing policy'' $=$ ``routing tactics'' or ``routing plan''. -\end{NB} - -To solve this task, the conventional destination based routing table, ordered -according to the longest match rule, is replaced with a ``routing policy -database'' (or RPDB), which selects routes -by executing some set of rules. The rules may have lots of keys of different -natures and therefore they have no natural ordering, but one imposed -by the administrator. Linux-2.2 RPDB is a linear list of rules -ordered by numeric priority value. -RPDB explicitly allows matching a few packet fields: - -\begin{itemize} -\item packet source address. -\item packet destination address. -\item TOS. -\item incoming interface (which is packet metadata, rather than a packet field). -\end{itemize} - -Matching IP protocols and transport ports is also possible, -indirectly, via \verb|ipchains|, by exploiting their ability -to mark some classes of packets with \verb|fwmark|. Therefore, -\verb|fwmark| is also included in the set of keys checked by rules. - -Each policy routing rule consists of a {\em selector\/} and an {\em action\/} -predicate. The RPDB is scanned in the order of increasing priority. The selector -of each rule is applied to \{source address, destination address, incoming -interface, tos, fwmark\} and, if the selector matches the packet, -the action is performed. The action predicate may return with success. 
-In this case, it will either give a route or failure indication -and the RPDB lookup is terminated. Otherwise, the RPDB program -continues on the next rule. - -What is the action, semantically? The natural action is to select the -nexthop and the output device. This is what -Cisco IOS~\cite{IOS} does. Let us call it ``match \& set''. -The Linux-2.2 approach is more flexible. The action includes -lookups in destination-based routing tables and selecting -a route from these tables according to the classic longest match algorithm. -The ``match \& set'' approach is the simplest case of the Linux one. It is realized -when a second level routing table contains a single default route. -Recall that Linux-2.2 supports multiple tables -managed with the \verb|ip route| command, described in the previous section. - -At startup time the kernel configures the default RPDB consisting of three -rules: - -\begin{enumerate} -\item Priority: 0, Selector: match anything, Action: lookup routing -table \verb|local| (ID 255). -The \verb|local| table is a special routing table containing -high priority control routes for local and broadcast addresses. - -\item Priority: 32766, Selector: match anything, Action: lookup routing -table \verb|main| (ID 254). -The \verb|main| table is the normal routing table containing all non-policy -routes. This rule may be deleted and/or overridden with other -ones by the administrator. - -\item Priority: 32767, Selector: match anything, Action: lookup routing -table \verb|default| (ID 253). -The \verb|default| table is empty. It is reserved for some -post-processing if no previous default rules selected the packet. -This rule may also be deleted. - -\end{enumerate} - -Do not confuse routing tables with rules: rules point to routing tables, -several rules may refer to one routing table and some routing tables -may have no rules pointing to them. 
If the administrator deletes all the rules -referring to a table, the table is not used, but it still exists -and will disappear only after all the routes contained in it are deleted. - - -\paragraph{Rule attributes:} Each RPDB entry has additional -attributes. F.e.\ each rule has a pointer to some routing -table. NAT and masquerading rules have an attribute to select new IP -address to translate/masquerade. Besides that, rules have some -optional attributes, which routes have, namely \verb|realms|. -These values do not override those contained in the routing tables. They -are only used if the route did not select any attributes. - - -\paragraph{Rule types:} The RPDB may contain rules of the following -types: -\begin{itemize} -\item \verb|unicast| --- the rule prescribes to return the route found -in the routing table referenced by the rule. -\item \verb|blackhole| --- the rule prescribes to silently drop the packet. -\item \verb|unreachable| --- the rule prescribes to generate a ``Network -is unreachable'' error. -\item \verb|prohibit| --- the rule prescribes to generate -``Communication is administratively prohibited'' error. -\item \verb|nat| --- the rule prescribes to translate the source address -of the IP packet into some other value. More about NAT is -in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}. -\end{itemize} - - -\paragraph{Commands:} \verb|add|, \verb|delete| and \verb|show| -(or \verb|list|). - -\subsection{{\tt ip rule add} --- insert a new rule\\ - {\tt ip rule delete} --- delete a rule} -\label{IP-RULE-ADD} - -\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|, - \verb|d|. - -\paragraph{Arguments:} - -\begin{itemize} -\item \verb|type TYPE| (default) - ---- the type of this rule. The list of valid types was given in the previous -subsection. - -\item \verb|from PREFIX| - ---- select the source prefix to match. - -\item \verb|to PREFIX| - ---- select the destination prefix to match. 
- -\item \verb|iif NAME| - ---- select the incoming device to match. If the interface is loopback, -the rule only matches packets originating from this host. This means that you -may create separate routing tables for forwarded and local packets and, -hence, completely segregate them. - -\item \verb|tos TOS| or \verb|dsfield TOS| - ---- select the TOS value to match. - -\item \verb|fwmark MARK| - ---- select the \verb|fwmark| value to match. - -\item \verb|priority PREFERENCE| - ---- the priority of this rule. Each rule should have an explicitly -set {\em unique\/} priority value. -\begin{NB} - Really, for historical reasons \verb|ip rule add| does not require a - priority value and allows them to be non-unique. - If the user does not supply a priority, it is selected by the kernel. - If the user creates a rule with a priority value that - already exists, the kernel does not reject the request. It adds - the new rule before all old rules of the same priority. - - It is a mistake in design, no more. And it will be fixed one day, - so do not rely on this feature. Use explicit priorities. -\end{NB} - - -\item \verb|table TABLEID| - ---- the routing table identifier to lookup if the rule selector matches. - -\item \verb|realms FROM/TO| - ---- Realms to select if the rule matched and the routing table lookup -succeeded. Realm \verb|TO| is only used if the route did not select -any realm. - -\item \verb|nat ADDRESS| - ---- The base of the IP address block to translate (for source addresses). -The \verb|ADDRESS| may be either the start of the block of NAT addresses -(selected by NAT routes) or in linux-2.2 a local host address (or even zero). -In the last case the router does not translate the packets, -but masquerades them to this address; this feature disappeared in 2.4. -More about NAT is in Appendix~\ref{ROUTE-NAT}, -p.\pageref{ROUTE-NAT}. - -\end{itemize} - -\paragraph{Warning:} Changes to the RPDB made with these commands -do not become active immediately. 
It is assumed that after -a script finishes a batch of updates, it flushes the routing cache -with \verb|ip route flush cache|. - -\paragraph{Examples:} -\begin{itemize} -\item Route packets with source addresses from 192.203.80/24 -according to routing table \verb|inr.ruhep|: -\begin{verbatim} -ip ru add from 192.203.80.0/24 table inr.ruhep prio 220 -\end{verbatim} - -\item Translate packet source address 193.233.7.83 into 192.203.80.144 -and route it according to table \#1 (actually, it is \verb|inr.ruhep|): -\begin{verbatim} -ip ru add from 193.233.7.83 nat 192.203.80.144 table 1 prio 320 -\end{verbatim} - -\item Delete the unused default rule: -\begin{verbatim} -ip ru del prio 32767 -\end{verbatim} - -\end{itemize} - - - -\subsection{{\tt ip rule show} --- list rules} -\label{IP-RULE-SHOW} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. - - -\paragraph{Arguments:} Good news, this is one command that has no arguments. - -\paragraph{Output format:} - -\begin{verbatim} -kuznet@amber:~ $ ip ru ls -0: from all lookup local -200: from 192.203.80.0/24 to 193.233.7.0/24 lookup main -210: from 192.203.80.0/24 to 192.203.80.0/24 lookup main -220: from 192.203.80.0/24 lookup inr.ruhep realms inr.ruhep/radio-msu -300: from 193.233.7.83 to 193.233.7.0/24 lookup main -310: from 193.233.7.83 to 192.203.80.0/24 lookup main -320: from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144 -32766: from all lookup main -kuznet@amber:~ $ -\end{verbatim} - -In the first column is the rule priority value followed -by a colon. Then the selectors follow. Each key is prefixed -with the same keyword that was used to create the rule. - -The keyword \verb|lookup| is followed by a routing table identifier, -as it is recorded in the file \verb|/etc/iproute2/rt_tables|. - -If the rule does NAT (f.e.\ rule \#320), it is shown by the keyword -\verb|map-to| followed by the start of the block of addresses to map. - -The sense of this example is pretty simple. 
The prefixes -192.203.80.0/24 and 193.233.7.0/24 form the internal network, but -they are routed differently when the packets leave it. -Besides that, the host 193.233.7.83 is translated into -another prefix to look like 192.203.80.144 when talking -to the outer world. - -\subsection{{\tt ip rule save} -- save rules tables} -\label{IP-RULE-SAVE} - -\paragraph{Description:} this command saves the contents of the rules -tables or the rule(s) selected by some criteria to standard output. - -\paragraph{Arguments:} \verb|ip rule save| has the same arguments as -\verb|ip rule show|. - -\paragraph{Example:} This saves all the rules to the {\tt saved\_rules} -file: -\begin{verbatim} -dan@caffeine:~ # ip rule save > saved_rules -\end{verbatim} - -\paragraph{Output format:} The format of the data stream provided by -\verb|ip rule save| is that of \verb|rtnetlink|. See -\verb|rtnetlink(7)| for more information. - -\subsection{{\tt ip rule restore} -- restore rules tables} -\label{IP-RULE-RESTORE} - -\paragraph{Description:} this command restores the contents of the rules -tables according to a data stream as provided by \verb|ip rule save| via -standard input. Note that any rules already in the table are left unchanged, -and duplicates are not ignored. - -\paragraph{Arguments:} This command takes no arguments. - -\paragraph{Example:} This restores all rules that were saved to the -{\tt saved\_rules} file: - -\begin{verbatim} -dan@caffeine:~ # ip rule restore < saved_rules -\end{verbatim} - - - -\section{{\tt ip maddress} --- multicast addresses management} -\label{IP-MADDR} - -\paragraph{Object:} \verb|maddress| objects are multicast addresses. - -\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|show| (or \verb|list|). - -\subsection{{\tt ip maddress show} --- list multicast addresses} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. 
- -\paragraph{Arguments:} - -\begin{itemize} - -\item \verb|dev NAME| (default) - ---- the device name. - -\end{itemize} - -\paragraph{Output format:} - -\begin{verbatim} -kuznet@alisa:~ $ ip maddr ls dummy -2: dummy - link 33:33:00:00:00:01 - link 01:00:5e:00:00:01 - inet 224.0.0.1 users 2 - inet6 ff02::1 -kuznet@alisa:~ $ -\end{verbatim} - -The first line of the output shows the interface index and its name. -Then the multicast address list follows. Each line starts with the -protocol identifier. The word \verb|link| denotes a link layer -multicast address. - -If a multicast address has more than one user, the number -of users is shown after the \verb|users| keyword. - -One additional feature not present in the example above -is the \verb|static| flag, which indicates that the address was joined -with \verb|ip maddr add|. See the following subsection. - - - -\subsection{{\tt ip maddress add} --- add a multicast address\\ - {\tt ip maddress delete} --- delete a multicast address} - -\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|, \verb|d|. - -\paragraph{Description:} these commands attach/detach -a static link layer multicast address to listen on the interface. -Note that it is impossible to join protocol multicast groups -statically. This command only manages link layer addresses. - - -\paragraph{Arguments:} - -\begin{itemize} -\item \verb|address LLADDRESS| (default) - ---- the link layer multicast address. - -\item \verb|dev NAME| - ---- the device to join/leave this multicast address. - -\end{itemize} - - -\paragraph{Example:} Let us continue with the example from the previous subsection. 
- -\begin{verbatim} -netadm@alisa:~ # ip maddr add 33:33:00:00:00:01 dev dummy -netadm@alisa:~ # ip -0 maddr ls dummy -2: dummy - link 33:33:00:00:00:01 users 2 static - link 01:00:5e:00:00:01 -netadm@alisa:~ # ip maddr del 33:33:00:00:00:01 dev dummy -\end{verbatim} - -\begin{NB} - Neither \verb|ip| nor the kernel check for multicast address validity. - Particularly, this means that you can try to load a unicast address - instead of a multicast address. Most drivers will ignore such addresses, - but several (f.e.\ Tulip) will intern it to their on-board filter. - The effects may be strange. Namely, the addresses become additional - local link addresses and, if you loaded the address of another host - to the router, wait for duplicated packets on the wire. - It is not a bug, but rather a hole in the API and intra-kernel interfaces. - This feature is really more useful for traffic monitoring, but using it - with Linux-2.2 you {\em have to\/} be sure that the host is not - a router and, especially, that it is not a transparent proxy or masquerading - agent. -\end{NB} - - - -\section{{\tt ip mroute} --- multicast routing cache management} -\label{IP-MROUTE} - -\paragraph{Abbreviations:} \verb|mroute|, \verb|mr|. - -\paragraph{Object:} \verb|mroute| objects are multicast routing cache -entries created by a user level mrouting daemon -(f.e.\ \verb|pimd| or \verb|mrouted|). - -Due to the limitations of the current interface to the multicast routing -engine, it is impossible to change \verb|mroute| objects administratively, -so we may only display them. This limitation will be removed -in the future. - -\paragraph{Commands:} \verb|show| (or \verb|list|). - - -\subsection{{\tt ip mroute show} --- list mroute cache entries} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. - -\paragraph{Arguments:} - -\begin{itemize} -\item \verb|to PREFIX| (default) - ---- the prefix selecting the destination multicast addresses to list. 
- - -\item \verb|iif NAME| - ---- the interface on which multicast packets are received. - - -\item \verb|from PREFIX| - ---- the prefix selecting the IP source addresses of the multicast route. - - -\end{itemize} - -\paragraph{Output format:} - -\begin{verbatim} -kuznet@amber:~ $ ip mroute ls -(193.232.127.6, 224.0.1.39) Iif: unresolved -(193.232.244.34, 224.0.1.40) Iif: unresolved -(193.233.7.65, 224.66.66.66) Iif: eth0 Oifs: pimreg -kuznet@amber:~ $ -\end{verbatim} - -Each line shows one (S,G) entry in the multicast routing cache, -where S is the source address and G is the multicast group. \verb|Iif| is -the interface on which multicast packets are expected to arrive. -If the word \verb|unresolved| is there instead of the interface name, -it means that the routing daemon still hasn't resolved this entry. -The keyword \verb|oifs| is followed by a list of output interfaces, separated -by spaces. If a multicast routing entry is created with non-trivial -TTL scope, administrative distances are appended to the device names -in the \verb|oifs| list. - -\paragraph{Statistics:} The \verb|-statistics| option also prints the -number of packets and bytes forwarded along this route and -the number of packets that arrived on the wrong interface, if this number is not zero. - -\begin{verbatim} -kuznet@amber:~ $ ip -s mr ls 224.66/16 -(193.233.7.65, 224.66.66.66) Iif: eth0 Oifs: pimreg - 9383 packets, 300256 bytes -kuznet@amber:~ $ -\end{verbatim} - - -\section{{\tt ip tunnel} --- tunnel configuration} -\label{IP-TUNNEL} - -\paragraph{Abbreviations:} \verb|tunnel|, \verb|tunl|. - -\paragraph{Object:} \verb|tunnel| objects are tunnels, encapsulating -packets in IPv4 packets and then sending them over the IP infrastructure. - -\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|change|, \verb|show| -(or \verb|list|). - -\paragraph{See also:} A more informal discussion of tunneling -over IP and the \verb|ip tunnel| command can be found in~\cite{IP-TUNNELS}. 
- -\subsection{{\tt ip tunnel add} --- add a new tunnel\\ - {\tt ip tunnel change} --- change an existing tunnel\\ - {\tt ip tunnel delete} --- destroy a tunnel} - -\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|; -\verb|delete|, \verb|del|, \verb|d|. - - -\paragraph{Arguments:} - -\begin{itemize} - -\item \verb|name NAME| (default) - ---- select the tunnel device name. - -\item \verb|mode MODE| - ---- set the tunnel mode. Three modes are currently available: - \verb|ipip|, \verb|sit| and \verb|gre|. - -\item \verb|remote ADDRESS| - ---- set the remote endpoint of the tunnel. - -\item \verb|local ADDRESS| - ---- set the fixed local address for tunneled packets. -It must be an address on another interface of this host. - -\item \verb|ttl N| - ---- set a fixed TTL \verb|N| on tunneled packets. - \verb|N| is a number in the range 1--255. 0 is a special value - meaning that packets inherit the TTL value. - The default value is: \verb|inherit|. - -\item \verb|tos T| or \verb|dsfield T| - ---- set a fixed TOS \verb|T| on tunneled packets. - The default value is: \verb|inherit|. - - - -\item \verb|dev NAME| - ---- bind the tunnel to the device \verb|NAME| so that - tunneled packets will only be routed via this device and will - not be able to escape to another device when the route to endpoint changes. - -\item \verb|nopmtudisc| - ---- disable Path MTU Discovery on this tunnel. - It is enabled by default. Note that a fixed ttl is incompatible - with this option: tunnelling with a fixed ttl always makes pmtu discovery. - -\item \verb|ignore-df| - ---- (only GRE tunnels) enable IPv4 DF flag suppression on this tunnel. - It is disabled by default. Enabling this option will cause IPv4 - payloads to be handled like any other GRE payload, - regardless of the DF flag. - -\item \verb|key K|, \verb|ikey K|, \verb|okey K| - ---- (only GRE tunnels) use keyed GRE with key \verb|K|. \verb|K| is - either a number or an IP address-like dotted quad. 
- The \verb|key| parameter sets the key to use in both directions. - The \verb|ikey| and \verb|okey| parameters set different keys for input and output. - - -\item \verb|csum|, \verb|icsum|, \verb|ocsum| - ---- (only GRE tunnels) generate/require checksums for tunneled packets. - The \verb|ocsum| flag calculates checksums for outgoing packets. - The \verb|icsum| flag requires that all input packets have the correct - checksum. The \verb|csum| flag is equivalent to the combination - ``\verb|icsum| \verb|ocsum|''. - -\item \verb|seq|, \verb|iseq|, \verb|oseq| - ---- (only GRE tunnels) serialize packets. - The \verb|oseq| flag enables sequencing of outgoing packets. - The \verb|iseq| flag requires that all input packets are serialized. - The \verb|seq| flag is equivalent to the combination ``\verb|iseq| \verb|oseq|''. - -\begin{NB} - I think this option does not - work. At least, I did not test it, did not debug it and - do not even understand how it is supposed to work or for what - purpose Cisco planned to use it. Do not use it. -\end{NB} - - -\end{itemize} - -\paragraph{Example:} Create a pointopoint IPv6 tunnel with maximal TTL of 32. -\begin{verbatim} -netadm@amber:~ # ip tunl add Cisco mode sit remote 192.31.7.104 \ - local 192.203.80.142 ttl 32 -\end{verbatim} - -\subsection{{\tt ip tunnel show} --- list tunnels} - -\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. - - -\paragraph{Arguments:} None. - -\paragraph{Output format:} -\begin{verbatim} -kuznet@amber:~ $ ip tunl ls Cisco -Cisco: ipv6/ip remote 192.31.7.104 local 192.203.80.142 ttl 32 -kuznet@amber:~ $ -\end{verbatim} -The line starts with the tunnel device name followed by a colon. -Then the tunnel mode follows. The parameters of the tunnel are listed -with the same keywords that were used when creating the tunnel. 
- -\paragraph{Statistics:} - -\begin{verbatim} -kuznet@amber:~ $ ip -s tunl ls Cisco -Cisco: ipv6/ip remote 192.31.7.104 local 192.203.80.142 ttl 32 -RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts - 12566 1707516 0 0 0 0 -TX: Packets Bytes Errors DeadLoop NoRoute NoBufs - 13445 1879677 0 0 0 0 -kuznet@amber:~ $ -\end{verbatim} -Essentially, these numbers are the same as the numbers -printed with {\tt ip -s link show} -(sec.\ref{IP-LINK-SHOW}, p.\pageref{IP-LINK-SHOW}) but the tags are different -to reflect that they are tunnel specific. -\begin{itemize} -\item \verb|CsumErrs| --- the total number of packets dropped -because of checksum failures for a GRE tunnel with checksumming enabled. -\item \verb|OutOfSeq| --- the total number of packets dropped -because they arrived out of sequence for a GRE tunnel with -serialization enabled. -\item \verb|Mcasts| --- the total number of multicast packets -received on a broadcast GRE tunnel. -\item \verb|DeadLoop| --- the total number of packets which were not -transmitted because the tunnel is looped back to itself. -\item \verb|NoRoute| --- the total number of packets which were not -transmitted because there is no IP route to the remote endpoint. -\item \verb|NoBufs| --- the total number of packets which were not -transmitted because the kernel failed to allocate a buffer. -\end{itemize} - - -\section{{\tt ip monitor} and {\tt rtmon} --- state monitoring} -\label{IP-MONITOR} - -The \verb|ip| utility can monitor the state of devices, addresses -and routes continuously. This option has a slightly different format. -Namely, -the \verb|monitor| command is the first in the command line and then -the object list follows: -\begin{verbatim} - ip monitor [ file FILE ] [ all | OBJECT-LIST ] [ label ] -\end{verbatim} -\verb|OBJECT-LIST| is the list of object types that we want to -monitor. It may contain \verb|link|, \verb|address| and \verb|route|. 
-Specifying \verb|label| indicates that output lines should be labelled -with the type of object being printed --- this happens by default if -\verb|all| is specified. If no \verb|file| argument is given, -\verb|ip| opens RTNETLINK, listens on it and dumps state changes in -the format described in previous sections. - -If a file name is given, it does not listen on RTNETLINK, -but opens the file containing RTNETLINK messages saved in binary format -and dumps them. Such a history file can be generated with the -\verb|rtmon| utility. This utility has a command line syntax similar to -\verb|ip monitor|. -Ideally, \verb|rtmon| should be started before -the first network configuration command is issued. F.e.\ if -you insert: -\begin{verbatim} - rtmon file /var/log/rtmon.log -\end{verbatim} -in a startup script, you will be able to view the full history -later. - -Certainly, it is possible to start \verb|rtmon| at any time. -It prepends the history with the state snapshot dumped at the moment -of starting. - - -\section{Route realms and policy propagation, {\tt rtacct}} -\label{RT-REALMS} - -On routers using OSPF ASE or, especially, the BGP protocol, routing -tables may be huge. If we want to classify or to account for the packets -per route, we will have to keep lots of information. Even worse, if we -want to distinguish the packets not only by their destination, but -also by their source, the task gets quadratic complexity and its solution -is physically impossible. - -One approach to propagating the policy from routing protocols -to the forwarding engine has been proposed in~\cite{IOS-BGP-PP}. -Essentially, Cisco Policy Propagation via BGP is based on the fact -that dedicated routers all have the RIB (Routing Information Base) -close to the forwarding engine, so policy routing rules can -check all the route attributes, including ASPATH information -and community strings. 
- -The Linux architecture, splitting the RIB (maintained by a user level -daemon) and the kernel based FIB (Forwarding Information Base), -does not allow such a simple approach. - -It is to our fortune because there is another solution -which allows even more flexible policy and richer semantics. - -Namely, routes can be clustered together in user space, based on their -attributes. F.e.\ a BGP router knows route ASPATH, its community; -an OSPF router knows the route tag or its area. The administrator, when adding -routes manually, also knows their nature. Providing that the number of such -aggregates (we call them {\em realms\/}) is low, the task of full -classification both by source and destination becomes quite manageable. - -So each route may be assigned to a realm. It is assumed that -this identification is made by a routing daemon, but static routes -can also be handled manually with \verb|ip route| (see sec.\ref{IP-ROUTE}, -p.\pageref{IP-ROUTE}). -\begin{NB} - There is a patch to \verb|gated|, allowing classification of routes - to realms with all the set of policy rules implemented in \verb|gated|: - by prefix, by ASPATH, by origin, by tag etc. -\end{NB} - -To facilitate the construction (f.e.\ in case the routing -daemon is not aware of realms), missing realms may be completed -with routing policy rules, see sec.~\ref{IP-RULE}, p.\pageref{IP-RULE}. - -For each packet the kernel calculates a tuple of realms: source realm -and destination realm, using the following algorithm: - -\begin{enumerate} -\item If the route has a realm, the destination realm of the packet is set to it. -\item If the rule has a source realm, the source realm of the packet is set to it. -If the destination realm was not inherited from the route and the rule has a destination realm, -it is also set. -\item If at least one of the realms is still unknown, the kernel finds -the reversed route to the source of the packet. 
-\item If the source realm is still unknown, get it from the reversed route. -\item If one of the realms is still unknown, swap the realms of reversed -routes and apply step 2 again. -\end{enumerate} - -After this procedure is completed we know what realm the packet -arrived from and the realm where it is going to propagate to. -If some of the realms are unknown, they are initialized to zero -(or realm \verb|unknown|). - -The main application of realms is the TC \verb|route| classifier~\cite{TC-CREF}, -where they are used to help assign packets to traffic classes, -to account, police and schedule them according to this -classification. - -A much simpler but still very useful application is incoming packet -accounting by realms. The kernel gathers a packet statistics summary -which can be viewed with the \verb|rtacct| utility. -\begin{verbatim} -kuznet@amber:~ $ rtacct russia -Realm BytesTo PktsTo BytesFrom PktsFrom -russia 20576778 169176 47080168 153805 -kuznet@amber:~ $ -\end{verbatim} -This shows that this router received 153805 packets from -the realm \verb|russia| and forwarded 169176 packets to \verb|russia|. -The realm \verb|russia| consists of routes with ASPATHs not leaving -Russia. - -Note that locally originating packets are not accounted here, -\verb|rtacct| shows incoming packets only. Using the \verb|route| -classifier (see~\cite{TC-CREF}) you can get even more detailed -accounting information about outgoing packets, optionally -summarizing traffic not only by source or destination, but -by any pair of source and destination realms. - - -\begin{thebibliography}{99} -\addcontentsline{toc}{section}{References} -\bibitem{RFC-NDISC} T.~Narten, E.~Nordmark, W.~Simpson. -``Neighbor Discovery for IP Version 6 (IPv6)'', RFC-2461. - -\bibitem{RFC-ADDRCONF} S.~Thomson, T.~Narten. -``IPv6 Stateless Address Autoconfiguration'', RFC-2462. - -\bibitem{RFC1812} F.~Baker. -``Requirements for IP Version 4 Routers'', RFC-1812. - -\bibitem{RFC1122} R.~T.~Braden. 
-``Requirements for Internet hosts --- communication layers'', RFC-1122. - -\bibitem{IOS} ``Cisco IOS Release 12.0 Network Protocols -Command Reference, Part 1'' and -``Cisco IOS Release 12.0 Quality of Service Solutions -Configuration Guide: Configuring Policy-Based Routing'',\\ -http://www.cisco.com/univercd/cc/td/doc/product/software/ios120. - -\bibitem{IP-TUNNELS} A.~N.~Kuznetsov. -``Tunnels over IP in Linux-2.2'', \\ -In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}. - -\bibitem{TC-CREF} A.~N.~Kuznetsov. ``TC Command Reference'',\\ -In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}. - -\bibitem{IOS-BGP-PP} ``Cisco IOS Release 12.0 Quality of Service Solutions -Configuration Guide: Configuring QoS Policy Propagation via -Border Gateway Protocol'',\\ -http://www.cisco.com/univercd/cc/td/doc/product/software/ios120. - -\bibitem{RFC-DHCP} R.~Droms. -``Dynamic Host Configuration Protocol.'', RFC-2131 - -\bibitem{RFC2414} M.~Allman, S.~Floyd, C.~Partridge. -``Increasing TCP's Initial Window'', RFC-2414. - -\end{thebibliography} - - - - -\appendix -\addcontentsline{toc}{section}{Appendix} - -\section{Source address selection} -\label{ADDR-SEL} - -When a host creates an IP packet, it must select some source -address. Correct source address selection is a critical procedure, -because it gives the receiver the information needed to deliver a -reply. If the source is selected incorrectly, in the best case, -the backward path may appear different to the forward one which -is harmful for performance. In the worst case, when the addresses -are administratively scoped, the reply may be lost entirely. - -Linux-2.2 selects source addresses using the following algorithm: - -\begin{itemize} -\item -The application may select a source address explicitly with \verb|bind(2)| -syscall or supplying it to \verb|sendmsg(2)| via the ancillary data object -\verb|IP_PKTINFO|. 
In this case the kernel only checks the validity -of the address and never tries to ``improve'' an incorrect user choice, -generating an error instead. -\begin{NB} - Never say ``Never''. The sysctl option \verb|ip_dynaddr| breaks - this axiom. It has been made deliberately with the purpose - of automatically reselecting the address on hosts with dynamic dial-out interfaces. - However, this hack {\em must not\/} be used on multihomed hosts - and especially on routers: it would break them. -\end{NB} - - -\item Otherwise, IP routing tables can contain an explicit source -address hint for this destination. The hint is set with the \verb|src| parameter -to the \verb|ip route| command, sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}. - - -\item Otherwise, the kernel searches through the list of addresses -attached to the interface through which the packets will be routed. -The search strategies are different for IP and IPv6. Namely: - -\begin{itemize} -\item IPv6 searches for the first valid, not deprecated address -with the same scope as the destination. - -\item IP searches for the first valid address with a scope wider -than the scope of the destination but it prefers addresses -which fall to the same subnet as the nexthop of the route -to the destination. Unlike IPv6, the scopes of IPv4 destinations -are not encoded in their addresses but are supplied -in routing tables instead (the \verb|scope| parameter to the \verb|ip route| command, -sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}). - -\end{itemize} - - -\item Otherwise, if the scope of the destination is \verb|link| or \verb|host|, -the algorithm fails and returns a zero source address. - -\item Otherwise, all interfaces are scanned to search for an address -with an appropriate scope. The loopback device \verb|lo| is always the first -in the search list, so that if an address with global scope (not 127.0.0.1!) -is configured on loopback, it is always preferred. 
- -\end{itemize} - - -\section{Proxy ARP/NDISC} -\label{PROXY-NEIGH} - -Routers may answer ARP/NDISC solicitations on behalf of other hosts. -In Linux-2.2 proxy ARP on an interface may be enabled -by setting the kernel \verb|sysctl| variable -\verb|/proc/sys/net/ipv4/conf/<dev>/proxy_arp| to 1. After this, the router -starts to answer ARP requests on the interface \verb|<dev>|, provided -the route to the requested destination does {\em not\/} go back via the same -device. - -The variable \verb|/proc/sys/net/ipv4/conf/all/proxy_arp| enables proxy -ARP on all the IP devices. - -However, this approach fails in the case of IPv6 because the router -must join the solicited node multicast address to listen for the corresponding -NDISC queries. It means that proxy NDISC is possible only on a per destination -basis. - -Logically, proxy ARP/NDISC is not a kernel task. It can easily be implemented -in user space. However, similar functionality was present in BSD kernels -and in Linux-2.0, so we have to preserve it at least to the extent that -is standardized in BSD. -\begin{NB} - Linux-2.0 ARP had a feature called {\em subnet\/} proxy ARP. - It is replaced with the sysctl flag in Linux-2.2. -\end{NB} - - -The \verb|ip| utility provides a way to manage proxy ARP/NDISC -with the \verb|ip neigh| command, namely: -\begin{verbatim} - ip neigh add proxy ADDRESS [ dev NAME ] -\end{verbatim} -adds a new proxy ARP/NDISC record and -\begin{verbatim} - ip neigh del proxy ADDRESS [ dev NAME ] -\end{verbatim} -deletes it. - -If the name of the device is not given, the router will answer solicitations -for address \verb|ADDRESS| on all devices, otherwise it will only serve -the device \verb|NAME|. Even if the proxy entry is created with -\verb|ip neigh|, the router {\em will not\/} answer a query if the route -to the destination goes back via the interface from which the solicitation -was received. 
- -It is important to emphasize that proxy entries have {\em no\/} -parameters other than these (IP/IPv6 address and optional device). -Particularly, the entry does not store any link layer address. -It always advertises the station address of the interface -on which it sends advertisements (i.e. its own station address). - -\section{Route NAT status} -\label{ROUTE-NAT} - -NAT (or ``Network Address Translation'') remaps some parts -of the IP address space into other ones. Linux-2.2 route NAT is supposed -to be used to facilitate policy routing by rewriting addresses -to other routing domains or to help while renumbering sites -to another prefix. - -\paragraph{What it is not:} -It is necessary to emphasize that {\em it is not supposed\/} -to be used to compress address space or to split load. -This is not missing functionality but a design principle. -Route NAT is {\em stateless\/}. It does not hold any state -about translated sessions. This means that it handles any number -of sessions flawlessly. But it also means that it is {\em static\/}. -It cannot detect the moment when the last TCP client stops -using an address. For the same reason, it will not help to split -load between several servers. -\begin{NB} -It is a pretty commonly held belief that it is useful to split load between -several servers with NAT. This is a mistake. All you get from this -is the requirement that the router keep the state of all the TCP connections -going via it. Well, if the router is so powerful, run apache on it. 8) -\end{NB} - -The second feature: it does not touch packet payload, -does not try to ``improve'' broken protocols by looking -through its data and mangling it. It mangles IP addresses, -only IP addresses and nothing but IP addresses. -This, too, is not missing functionality. - -To summarize: if you need to compress address space or keep -active FTP clients happy, your choice is not route NAT but masquerading, -port forwarding, NAPT etc.
-\begin{NB} -By the way, you may also want to look at -http://www.suse.com/\~{}mha/HyperNews/get/linux-ip-nat.html -\end{NB} - - -\paragraph{How it works.} -Some part of the address space is reserved for dummy addresses -which will look for all the world like some host addresses -inside your network. No other hosts may use these addresses, -however other routers may also be configured to translate them. -\begin{NB} -A great advantage of route NAT is that it may be used not -only in stub networks but in environments with arbitrarily complicated -structure. It does not firewall, it {\em forwards.} -\end{NB} -These addresses are selected by the \verb|ip route| command -(sec.\ref{IP-ROUTE-ADD}, p.\pageref{IP-ROUTE-ADD}). F.e.\ -\begin{verbatim} - ip route add nat 192.203.80.144 via 193.233.7.83 -\end{verbatim} -states that the single address 192.203.80.144 is a dummy NAT address. -For all the world it looks like a host address inside our network. -For neighbouring hosts and routers it looks like the local address -of the translating router. The router answers ARP for it, advertises -this address as routed via it, {\em et al\/}. When the router -receives a packet destined for 192.203.80.144, it replaces -this address with 193.233.7.83 which is the address of some real -host and forwards the packet. If you need to remap -blocks of addresses, you may use a command like: -\begin{verbatim} - ip route add nat 192.203.80.192/26 via 193.233.7.64 -\end{verbatim} -This command will map a block of 64 addresses 192.203.80.192-255 to -193.233.7.64-127. - -When an internal host (193.233.7.83 in the example above) -sends something to the outer world and these packets are forwarded -by our router, it should translate the source address 193.233.7.83 -into 192.203.80.144.
This task is solved by setting a special -policy rule (sec.\ref{IP-RULE-ADD}, p.\pageref{IP-RULE-ADD}): -\begin{verbatim} - ip rule add prio 320 from 193.233.7.83 nat 192.203.80.144 -\end{verbatim} -This rule says that the source address 193.233.7.83 -should be translated into 192.203.80.144 before forwarding. -It is important that the address after the \verb|nat| keyword -is some NAT address, declared by {\tt ip route add nat}. -If it is just a random address the router will not map to it. -\begin{NB} -The exception is when the address is a local address of this -router (or 0.0.0.0) and masquerading is configured in the linux-2.2 -kernel. In this case the router will masquerade the packets as this address. -If 0.0.0.0 is selected, the result is equivalent to one -obtained with firewalling rules. Otherwise, you have the way -to order Linux to masquerade to this fixed address. -The NAT mechanism used in linux-2.4 is more flexible than -masquerading, so this feature has lost its meaning and is disabled. -\end{NB} - -If the network has non-trivial internal structure, it is -useful and even necessary to add rules disabling translation -when a packet does not leave this network. Let us return to the -example from sec.\ref{IP-RULE-SHOW} (p.\pageref{IP-RULE-SHOW}). -\begin{verbatim} -300: from 193.233.7.83 to 193.233.7.0/24 lookup main -310: from 193.233.7.83 to 192.203.80.0/24 lookup main -320: from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144 -\end{verbatim} -This block of rules causes normal forwarding when -packets from 193.233.7.83 do not leave networks 193.233.7/24 -and 192.203.80/24. Also, if the \verb|inr.ruhep| table does not -contain a route to the destination (which means that the routing -domain owning addresses from 192.203.80/24 is dead), no translation -will occur. Otherwise, the packets are translated.
- -\paragraph{How to only translate selected ports:} -If you only want to translate selected ports (f.e.\ http) -and leave the rest intact, you may use \verb|ipchains| -to \verb|fwmark| a class of packets. -Suppose you did and all the packets from 193.233.7.83 -destined for port 80 are marked with marker 0x1234 in input fwchain. -In this case you may replace rule \#320 with: -\begin{verbatim} -320: from 193.233.7.83 fwmark 1234 lookup main map-to 192.203.80.144 -\end{verbatim} -and translation will only be enabled for outgoing http requests. - -\section{Example: minimal host setup} -\label{EXAMPLE-SETUP} - -The following script gives an example of a fault safe -setup of IP (and IPv6, if it is compiled into the kernel) -in the common case of a node attached to a single broadcast -network. A more advanced script, which may be used both on multihomed -hosts and on routers, is described in the following -section. - -The utilities used in the script may be found in the -directory ftp://ftp.inr.ac.ru/ip-routing/: -\begin{enumerate} -\item \verb|ip| --- package \verb|iproute2|. -\item \verb|arping| --- package \verb|iputils|. -\item \verb|rdisc| --- package \verb|iputils|. -\end{enumerate} -\begin{NB} -It also refers to a DHCP client, \verb|dhcpcd|. I should refrain from -recommending a good DHCP client to use. All that I can -say is that ISC \verb|dhcp-2.0b1pl6| patched with the patch that -can be found in the \verb|dhcp.bootp.rarp| subdirectory of -the same ftp site {\em does\/} work, -at least on Ethernet and Token Ring. -\end{NB} - -\begin{verbatim} -#! /bin/bash -\end{verbatim} -\begin{flushleft} -\# {\bf Usage: \verb|ifone ADDRESS[/PREFIX-LENGTH] [DEVICE]|}\\ -\# {\bf Parameters:}\\ -\# \$1 --- Static IP address, optionally followed by prefix length.\\ -\# \$2 --- Device name. If it is missing, \verb|eth0| is assumed.\\ -\# F.e.
\verb|ifone 193.233.7.90| -\end{flushleft} -\begin{verbatim} -dev=$2 -: ${dev:=eth0} -ipaddr= -\end{verbatim} -\# Parse IP address, splitting prefix length. -\begin{verbatim} -if [ "$1" != "" ]; then - ipaddr=${1%/*} - if [ "$1" != "$ipaddr" ]; then - pfxlen=${1#*/} - fi - : ${pfxlen:=24} -fi -pfx="${ipaddr}/${pfxlen}" -\end{verbatim} - -\begin{flushleft} -\# {\bf Step 0} --- enable loopback.\\ -\#\\ -\# This step is necessary on any networked box before attempt\\ -\# to configure any other device.\\ -\end{flushleft} -\begin{verbatim} -ip link set up dev lo -ip addr add 127.0.0.1/8 dev lo brd + scope host -\end{verbatim} -\begin{flushleft} -\# IPv6 autoconfigure themself on loopback.\\ -\#\\ -\# If user gave loopback as device, we add the address as alias and exit. -\end{flushleft} -\begin{verbatim} -if [ "$dev" = "lo" ]; then - if [ "$ipaddr" != "" -a "$ipaddr" != "127.0.0.1" ]; then - ip address add $ipaddr dev $dev - exit $? - fi - exit 0 -fi -\end{verbatim} - -\noindent\# {\bf Step 1} --- enable device \verb|$dev| - -\begin{verbatim} -if ! ip link set up dev $dev ; then - echo "Cannot enable interface $dev. Aborting." 1>&2 - exit 1 -fi -\end{verbatim} -\begin{flushleft} -\# The interface is \verb|UP|. IPv6 started stateless autoconfiguration itself,\\ -\# and its configuration finishes here. However,\\ -\# IP still needs some static preconfigured address. -\end{flushleft} -\begin{verbatim} -if [ "$ipaddr" = "" ]; then - echo "No address for $dev is configured, trying DHCP..." 1>&2 - dhcpcd - exit $? -fi -\end{verbatim} - -\begin{flushleft} -\# {\bf Step 2} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\ -\# Send two probes and wait for result for 3 seconds.\\ -\# If the interface opens slower f.e.\ due to long media detection,\\ -\# you want to increase the timeout.\\ -\end{flushleft} -\begin{verbatim} -if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then - echo "Address $ipaddr is busy, trying DHCP..." 1>&2 - dhcpcd - exit $? 
-fi -\end{verbatim} -\begin{flushleft} -\# OK, the address is unique, we may add it on the interface.\\ -\#\\ -\# {\bf Step 3} --- Configure the address on the interface. -\end{flushleft} - -\begin{verbatim} -if ! ip address add $pfx brd + dev $dev; then - echo "Failed to add $pfx on $dev, trying DHCP..." 1>&2 - dhcpcd - exit $? -fi -\end{verbatim} - -\noindent\# {\bf Step 4} --- Announce our presence on the link. -\begin{verbatim} -arping -A -c 1 -I $dev $ipaddr -noarp=$? -( sleep 2; - arping -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null & -\end{verbatim} - -\begin{flushleft} -\# {\bf Step 5} (optional) --- Add some control routes.\\ -\#\\ -\# 1. Prohibit link local multicast addresses.\\ -\# 2. Prohibit link local (alias, limited) broadcast.\\ -\# 3. Add default multicast route. -\end{flushleft} -\begin{verbatim} -ip route add unreachable 224.0.0.0/24 -ip route add unreachable 255.255.255.255 -if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then - ip route add 224.0.0.0/4 dev $dev scope global -fi -\end{verbatim} - -\begin{flushleft} -\# {\bf Step 6} --- Add fallback default route with huge metric.\\ -\# If a proxy ARP server is present on the interface, we will be\\ -\# able to talk to all the Internet without further configuration.\\ -\# It is not so cheap though and we still hope that this route\\ -\# will be overridden by more correct one by rdisc.\\ -\# Do not make this step if the device is not ARPable,\\ -\# because dead nexthop detection does not work on them. -\end{flushleft} -\begin{verbatim} -if [ "$noarp" = "0" ]; then - ip ro add default dev $dev metric 30000 scope global -fi -\end{verbatim} - -\begin{flushleft} -\# {\bf Step 7} --- Restart router discovery and exit. 
-\end{flushleft} -\begin{verbatim} -killall -HUP rdisc || rdisc -fs -exit 0 -\end{verbatim} - - -\section{Example: {\protect\tt ifcfg} --- interface address management} -\label{EXAMPLE-IFCFG} - -This is a simplistic script replacing one option of \verb|ifconfig|, -namely, IP address management. It not only adds -addresses, but also carries out Duplicate Address Detection~\cite{RFC-DHCP}, -sends unsolicited ARP to update the caches of other hosts sharing -the interface, adds some control routes and restarts Router Discovery -when it is necessary. - -I strongly recommend using it {\em instead\/} of \verb|ifconfig| both -on hosts and on routers. - -\begin{verbatim} -#! /bin/bash -\end{verbatim} -\begin{flushleft} -\# {\bf Usage: \verb?ifcfg DEVICE[:ALIAS] [add|del] ADDRESS[/LENGTH] [PEER]?}\\ -\# {\bf Parameters:}\\ -\# ---Device name. It may have alias suffix, separated by colon.\\ -\# ---Command: add, delete or stop.\\ -\# ---IP address, optionally followed by prefix length.\\ -\# ---Optional peer address for pointopoint interfaces.\\ -\# F.e. \verb|ifcfg eth0 193.233.7.90/24| - -\noindent\# This function determines whether this machine is a router or a host.\\ -\# It returns 0 if the host is apparently not a router.
-\end{flushleft} -\begin{verbatim} -CheckForwarding () { - local sbase fwd - sbase=/proc/sys/net/ipv4/conf - fwd=0 - if [ -d $sbase ]; then - for dir in $sbase/*/forwarding; do - fwd=$[$fwd + `cat $dir`] - done - else - fwd=2 - fi - return $fwd -} -\end{verbatim} -\begin{flushleft} -\# This function restarts Router Discovery.\\ -\end{flushleft} -\begin{verbatim} -RestartRDISC () { - killall -HUP rdisc || rdisc -fs -} -\end{verbatim} -\begin{flushleft} -\# Calculate ABC "natural" mask length\\ -\# Arg: \$1 = dotquad address -\end{flushleft} -\begin{verbatim} -ABCMaskLen () { - local class; - class=${1%%.*} - if [ $class -eq 0 -o $class -ge 224 ]; then return 0 - elif [ $class -ge 192 ]; then return 24 - elif [ $class -ge 128 ]; then return 16 - else return 8 ; fi -} -\end{verbatim} - - -\begin{flushleft} -\# {\bf MAIN()}\\ -\#\\ -\# Strip alias suffix separated by colon. -\end{flushleft} -\begin{verbatim} -label="label $1" -ldev=$1 -dev=${1%:*} -if [ "$dev" = "" -o "$1" = "help" ]; then - echo "Usage: ifcfg DEV [[add|del [ADDR[/LEN]] [PEER] | stop]" 1>&2 - echo " add - add new address" 1>&2 - echo " del - delete address" 1>&2 - echo " stop - completely disable IP" 1>&2 - exit 1 -fi -shift - -CheckForwarding -fwd=$? -\end{verbatim} -\begin{flushleft} -\# Parse command. If it is ``stop'', flush and exit. -\end{flushleft} -\begin{verbatim} -deleting=0 -case "$1" in -add) shift ;; -stop) - if [ "$ldev" != "$dev" ]; then - echo "Cannot stop alias $ldev" 1>&2 - exit 1; - fi - ip -4 addr flush dev $dev $label || exit 1 - if [ $fwd -eq 0 ]; then RestartRDISC; fi - exit 0 ;; -del*) - deleting=1; shift ;; -*) -esac -\end{verbatim} -\begin{flushleft} -\# Parse prefix, split prefix length, separated by slash. -\end{flushleft} -\begin{verbatim} -ipaddr= -pfxlen= -if [ "$1" != "" ]; then - ipaddr=${1%/*} - if [ "$1" != "$ipaddr" ]; then - pfxlen=${1#*/} - fi - if [ "$ipaddr" = "" ]; then - echo "$1 is bad IP address." 
1>&2 - exit 1 - fi -fi -shift -\end{verbatim} -\begin{flushleft} -\# If peer address is present, prefix length is 32.\\ -\# Otherwise, if prefix length was not given, guess it. -\end{flushleft} -\begin{verbatim} -peer=$1 -if [ "$peer" != "" ]; then - if [ "$pfxlen" != "" -a "$pfxlen" != "32" ]; then - echo "Peer address with non-trivial netmask." 1>&2 - exit 1 - fi - pfx="$ipaddr peer $peer" -else - if [ "$pfxlen" = "" ]; then - ABCMaskLen $ipaddr - pfxlen=$? - fi - pfx="$ipaddr/$pfxlen" -fi -if [ "$ldev" = "$dev" -a "$ipaddr" != "" ]; then - label= -fi -\end{verbatim} -\begin{flushleft} -\# If deletion was requested, delete the address and restart RDISC -\end{flushleft} -\begin{verbatim} -if [ $deleting -ne 0 ]; then - ip addr del $pfx dev $dev $label || exit 1 - if [ $fwd -eq 0 ]; then RestartRDISC; fi - exit 0 -fi -\end{verbatim} -\begin{flushleft} -\# Start interface initialization.\\ -\#\\ -\# {\bf Step 0} --- enable device \verb|$dev| -\end{flushleft} -\begin{verbatim} -if ! ip link set up dev $dev ; then - echo "Error: cannot enable interface $dev." 1>&2 - exit 1 -fi -if [ "$ipaddr" = "" ]; then exit 0; fi -\end{verbatim} -\begin{flushleft} -\# {\bf Step 1} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\ -\# Send two probes and wait for result for 3 seconds.\\ -\# If the interface opens slower f.e.\ due to long media detection,\\ -\# you want to increase the timeout.\\ -\end{flushleft} -\begin{verbatim} -if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then - echo "Error: some host already uses address $ipaddr on $dev." 1>&2 - exit 1 -fi -\end{verbatim} -\begin{flushleft} -\# OK, the address is unique. We may add it to the interface.\\ -\#\\ -\# {\bf Step 2} --- Configure the address on the interface. -\end{flushleft} -\begin{verbatim} -if ! ip address add $pfx brd + dev $dev $label; then - echo "Error: failed to add $pfx on $dev." 
1>&2 - exit 1 -fi -\end{verbatim} -\noindent\# {\bf Step 3} --- Announce our presence on the link -\begin{verbatim} -arping -q -A -c 1 -I $dev $ipaddr -noarp=$? -( sleep 2 ; - arping -q -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null & -\end{verbatim} -\begin{flushleft} -\# {\bf Step 4} (optional) --- Add some control routes.\\ -\#\\ -\# 1. Prohibit link local multicast addresses.\\ -\# 2. Prohibit link local (alias, limited) broadcast.\\ -\# 3. Add default multicast route. -\end{flushleft} -\begin{verbatim} -ip route add unreachable 224.0.0.0/24 >& /dev/null -ip route add unreachable 255.255.255.255 >& /dev/null -if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then - ip route add 224.0.0.0/4 dev $dev scope global >& /dev/null -fi -\end{verbatim} -\begin{flushleft} -\# {\bf Step 5} --- Add fallback default route with huge metric.\\ -\# If a proxy ARP server is present on the interface, we will be\\ -\# able to talk to all the Internet without further configuration.\\ -\# Do not make this step on router or if the device is not ARPable.\\ -\# because dead nexthop detection does not work on them. 
-\end{flushleft} -\begin{verbatim} -if [ $fwd -eq 0 ]; then - if [ $noarp -eq 0 ]; then - ip ro append default dev $dev metric 30000 scope global - elif [ "$peer" != "" ]; then - if ping -q -c 2 -w 4 $peer ; then - ip ro append default via $peer dev $dev metric 30001 - fi - fi - RestartRDISC -fi - -exit 0 -\end{verbatim} -\begin{flushleft} -\# End of {\bf MAIN()} -\end{flushleft} - - -\end{document} diff --git a/doc/ip-tunnels.tex b/doc/ip-tunnels.tex deleted file mode 100644 index 0a8c930c..00000000 --- a/doc/ip-tunnels.tex +++ /dev/null @@ -1,469 +0,0 @@ -\documentstyle[12pt,twoside]{article} -\def\TITLE{Tunnels over IP} -\input preamble -\begin{center} -\Large\bf Tunnels over IP in Linux-2.2 -\end{center} - - -\begin{center} -{ \large Alexey~N.~Kuznetsov } \\ -\em Institute for Nuclear Research, Moscow \\ -\verb|kuznet@ms2.inr.ac.ru| \\ -\rm March 17, 1999 -\end{center} - -\vspace{5mm} - -\tableofcontents - - -\section{Instead of introduction: micro-FAQ.} - -\begin{itemize} - -\item -Q: In linux-2.0.36 I used: -\begin{verbatim} - ifconfig tunl1 10.0.0.1 pointopoint 193.233.7.65 -\end{verbatim} -to create tunnel. It does not work in 2.2.0! - -A: You are right, it does not work. The command written above is split to two commands. -\begin{verbatim} - ip tunnel add MY-TUNNEL mode ipip remote 193.233.7.65 -\end{verbatim} -will create tunnel device with name \verb|MY-TUNNEL|. Now you may configure -it with: -\begin{verbatim} - ifconfig MY-TUNNEL 10.0.0.1 -\end{verbatim} -Certainly, if you prefer name \verb|tunl1| to \verb|MY-TUNNEL|, -you still may use it. - -\item -Q: In linux-2.0.36 I used: -\begin{verbatim} - ifconfig tunl0 10.0.0.1 - route add -net 10.0.0.0 gw 193.233.7.65 dev tunl0 -\end{verbatim} -to tunnel net 10.0.0.0 via router 193.233.7.65. It does not -work in 2.2.0! Moreover, \verb|route| prints a funny error sort of -``network unreachable'' and after this I found a strange direct route -to 10.0.0.0 via \verb|tunl0| in routing table. 
- -A: Yes, in 2.2 the rule that {\em normal} gateway must reside on directly -connected network has not any exceptions. You may tell kernel, that -this particular route is {\em abnormal}: -\begin{verbatim} - ifconfig tunl0 10.0.0.1 netmask 255.255.255.255 - ip route add 10.0.0.0/8 via 193.233.7.65 dev tunl0 onlink -\end{verbatim} -Note keyword \verb|onlink|, it is the magic key that orders kernel -not to check for consistency of gateway address. -Probably, after this explanation you have already guessed another method -to cheat kernel: -\begin{verbatim} - ifconfig tunl0 10.0.0.1 netmask 255.255.255.255 - route add -host 193.233.7.65 dev tunl0 - route add -net 10.0.0.0 netmask 255.0.0.0 gw 193.233.7.65 - route del -host 193.233.7.65 dev tunl0 -\end{verbatim} -Well, if you like such tricks, nobody may prohibit you to use them. -Only do not forget -that between \verb|route add| and \verb|route del| host 193.233.7.65 is -unreachable. - -\item -Q: In 2.0.36 I used to load \verb|tunnel| device module and \verb|ipip| module. -I cannot find any \verb|tunnel| in 2.2! - -A: Linux-2.2 has single module \verb|ipip| for both directions of tunneling -and for all IPIP tunnel devices. - -\item -Q: \verb|traceroute| does not work over tunnel! Well, stop... It works, - only skips some number of hops. - -A: Yes. By default tunnel driver copies \verb|ttl| value from -inner packet to outer one. It means that path traversed by tunneled -packets to another endpoint is not hidden. If you dislike this, or if you -are going to use some routing protocol expecting that packets -with ttl 1 will reach peering host (f.e.\ RIP, OSPF or EBGP) -and you are not afraid of -tunnel loops, you may append option \verb|ttl 64|, when creating tunnel -with \verb|ip tunnel add|. - -\item -Q: ... Well, list of things, which 2.0 was able to do finishes. 
- -\end{itemize} - -\paragraph{Summary of differences between 2.2 and 2.0.} - -\begin{itemize} - -\item {\bf In 2.0} you could compile tunnel device into kernel - and got set of 4 devices \verb|tunl0| ... \verb|tunl3| or, - alternatively, compile it as module and load new module - for each new tunnel. Also, module \verb|ipip| was necessary - to receive tunneled packets. - - {\bf 2.2} has {\em one\/} module \verb|ipip|. Loading it you get base - tunnel device \verb|tunl0| and another tunnels may be created with command - \verb|ip tunnel add|. These new devices may have arbitrary names. - - -\item {\bf In 2.0} you set remote tunnel endpoint address with - the command \verb|ifconfig| ... \verb|pointopoint A|. - - {\bf In 2.2} this command has the same semantics on all - the interfaces, namely it sets not tunnel endpoint, - but address of peering host, which is directly reachable - via this tunnel, - rather than via Internet. Actual tunnel endpoint address \verb|A| - should be set with \verb|ip tunnel add ... remote A|. - -\item {\bf In 2.0} you create tunnel routes with the command: -\begin{verbatim} - route add -net 10.0.0.0 gw A dev tunl0 -\end{verbatim} - - {\bf 2.2} interprets this command equally for all device - kinds and gateway is required to be directly reachable via this tunnel, - rather than via Internet. You still may use \verb|ip route add ... onlink| - to override this behaviour. 
- -\end{itemize} - - -\section{Tunnel setup: basics} - -Standard Linux-2.2 kernel supports three flavors of tunnels, -listed in the following table: -\vspace{2mm} - -\begin{tabular}{lll} -\vrule depth 0.8ex width 0pt\relax -Mode & Description & Base device \\ -ipip & IP over IP & tunl0 \\ -sit & IPv6 over IP & sit0 \\ -gre & ANY over GRE over IP & gre0 -\end{tabular} - -\vspace{2mm} - -\noindent All the kinds of tunnels are created with one command: -\begin{verbatim} - ip tunnel add <NAME> mode <MODE> [ local <S> ] [ remote <D> ] -\end{verbatim} - -This command creates new tunnel device with name \verb|<NAME>|. -The \verb|<NAME>| is an arbitrary string. Particularly, -it may be even \verb|eth0|. The rest of parameters set -different tunnel characteristics. - -\begin{itemize} - -\item -\verb|mode <MODE>| sets tunnel mode. Three modes are available now - \verb|ipip|, \verb|sit| and \verb|gre|. - -\item -\verb|remote <D>| sets remote endpoint of the tunnel to IP - address \verb|<D>|. -\item -\verb|local <S>| sets fixed local address for tunneled - packets. It must be an address on another interface of this host. - -\end{itemize} - -\let\thefootnote\oldthefootnote - -Both \verb|remote| and \verb|local| may be omitted. In this case we -say that they are zero or wildcard. Two tunnels of one mode cannot -have the same \verb|remote| and \verb|local|. Particularly it means -that base device or fallback tunnel cannot be replicated.\footnote{ -This restriction is relaxed for keyed GRE tunnels.} - -Tunnels are divided to two classes: {\bf pointopoint} tunnels, which -have some not wildcard \verb|remote| address and deliver all the packets -to this destination, and {\bf NBMA} (i.e. Non-Broadcast Multi-Access) tunnels, -which have no \verb|remote|. Particularly, base devices (f.e.\ \verb|tunl0|) -are NBMA, because they have neither \verb|remote| nor -\verb|local| addresses. - - -After tunnel device is created you should configure it as you did -it with another devices. 
Certainly, the configuration of tunnels has -some features related to the fact that they work over existing Internet -routing infrastructure and simultaneously create new virtual links, -which changes this infrastructure. The danger that not enough careful -tunnel setup will result in formation of tunnel loops, -collapse of routing or flooding network with exponentially -growing number of tunneled fragments is very real. - - -Protocol setup on pointopoint tunnels does not differ of configuration -of another devices. You should set a protocol address with \verb|ifconfig| -and add routes with \verb|route| utility. - -NBMA tunnels are different. To route something via NBMA tunnel -you have to explain to driver, where it should deliver packets to. -The only way to make it is to create special routes with gateway -address pointing to desired endpoint. F.e.\ -\begin{verbatim} - ip route add 10.0.0.0/24 via <A> dev tunl0 onlink -\end{verbatim} -It is important to use option \verb|onlink|, otherwise -kernel will refuse request to create route via gateway not directly -reachable over device \verb|tunl0|. With IPv6 the situation is much simpler: -when you start device \verb|sit0|, it automatically configures itself -with all IPv4 addresses mapped to IPv6 space, so that all IPv4 -Internet is {\em really reachable} via \verb|sit0|! Excellent, the command -\begin{verbatim} - ip route add 3FFE::/16 via ::193.233.7.65 dev sit0 -\end{verbatim} -will route \verb|3FFE::/16| via \verb|sit0|, sending all the packets -destined to this prefix to 193.233.7.65. - -\section{Tunnel setup: options} - -Command \verb|ip tunnel add| has several additional options. -\begin{itemize} - -\item \verb|ttl N| --- set fixed TTL \verb|N| on tunneled packets. - \verb|N| is number in the range 1--255. 0 is special value, - meaning that packets inherit TTL value. - Default value is: \verb|inherit|. - -\item \verb|tos T| --- set fixed tos \verb|T| on tunneled packets. - Default value is: \verb|inherit|. 
- -\item \verb|dev DEV| --- bind tunnel to device \verb|DEV|, so that - tunneled packets will be routed only via this device and will - not be able to escape to another device, when route to endpoint changes. - -\item \verb|nopmtudisc| --- disable Path MTU Discovery on this tunnel. - It is enabled by default. Note that fixed ttl is incompatible - with this option: tunnels with fixed ttl always make pmtu discovery. - -\end{itemize} - -\verb|ipip| and \verb|sit| tunnels have no more options. \verb|gre| -tunnels are more complicated: - -\begin{itemize} - -\item \verb|key K| --- use keyed GRE with key \verb|K|. \verb|K| is - either number or IP address-like dotted quad. - -\item \verb|csum| --- checksum tunneled packets. - -\item \verb|seq| --- serialize packets. -\begin{NB} - I think this option does not - work. At least, I did not test it, did not debug it and - even do not understand, how it is supposed to work and for what - purpose Cisco planned to use it. -\end{NB} - -\end{itemize} - - -Actually, these GRE options can be set separately for input and -output directions by prefixing corresponding keywords with letter -\verb|i| or \verb|o|. F.e.\ \verb|icsum| orders to accept only -packets with correct checksum and \verb|ocsum| means, that -our host will calculate and send checksum. - -Command \verb|ip tunnel add| is not the only operation, -which can be made with tunnels. Certainly, you may get short help page -with: -\begin{verbatim} - ip tunnel help -\end{verbatim} - -Besides that, you may view list of installed tunnels with the help of command: -\begin{verbatim} - ip tunnel ls -\end{verbatim} -Also you may look at statistics: -\begin{verbatim} - ip -s tunnel ls Cisco -\end{verbatim} -where \verb|Cisco| is name of tunnel device. Command -\begin{verbatim} - ip tunnel del Cisco -\end{verbatim} -destroys tunnel \verb|Cisco|. And, finally, -\begin{verbatim} - ip tunnel change Cisco mode sit local ME remote HE ttl 32 -\end{verbatim} -changes its parameters. 
- -\section{Differences 2.2 and 2.0 tunnels revisited.} - -Now we can discuss more subtle differences between tunneling in 2.0 -and 2.2. - -\begin{itemize} - -\item In 2.0 all tunneled packets were received promiscuously -as soon as you loaded module \verb|ipip|. 2.2 tries to select the best -tunnel device and packet looks as received on this. F.e.\ if host -received \verb|ipip| packet from host \verb|D| destined to our -local address \verb|S|, kernel searches for matching tunnels -in order: - -\begin{tabular}{ll} -1 & \verb|remote| is \verb|D| and \verb|local| is \verb|S| \\ -2 & \verb|remote| is \verb|D| and \verb|local| is wildcard \\ -3 & \verb|remote| is wildcard and \verb|local| is \verb|S| \\ -4 & \verb|tunl0| -\end{tabular} - -If tunnel exists, but it is not in \verb|UP| state, the tunnel is ignored. -Note, that if \verb|tunl0| is \verb|UP| it receives all the IPIP packets, -not acknowledged by more specific tunnels. -Be careful, it means that without carefully installed firewall rules -anyone on the Internet may inject to your network any packets with -source addresses indistinguishable from local ones. It is not so bad idea -to design tunnels in the way enforcing maximal route symmetry -and to enable reversed path filter (\verb|rp_filter| sysctl option) on -tunnel devices. - -\item In 2.2 you can monitor and debug tunnels with \verb|tcpdump|. -F.e.\ \verb|tcpdump| \verb|-i Cisco| \verb|-nvv| will dump packets, -which kernel output, via tunnel \verb|Cisco| and the packets received on it -from kernel viewpoint. - -\end{itemize} - - -\section{Linux and Cisco IOS tunnels.} - -Among another tunnels Cisco IOS supports IPIP and GRE. -Essentially, Cisco setup is subset of options, available for Linux. 
-Let us consider the simplest example: - -\begin{verbatim} -interface Tunnel0 - tunnel mode gre ip - tunnel source 10.10.14.1 - tunnel destination 10.10.13.2 -\end{verbatim} - - -This command set translates to: - -\begin{verbatim} - ip tunnel add Tunnel0 \ - mode gre \ - local 10.10.14.1 \ - remote 10.10.13.2 -\end{verbatim} - -Any questions? No questions. - -\section{Interaction IPIP tunnels and DVMRP.} - -DVMRP exploits IPIP tunnels to route multicasts via Internet. -\verb|mrouted| creates -IPIP tunnels listed in its configuration file automatically. -From kernel and user viewpoints there are no differences between -tunnels, created in this way, and tunnels created by \verb|ip tunnel|. -I.e.\ if \verb|mrouted| created some tunnel, it may be used to -route unicast packets, provided appropriate routes are added. -And vice versa, if administrator has already created a tunnel, -it will be reused by \verb|mrouted|, if it requests DVMRP -tunnel with the same local and remote addresses. - -Do not wonder, if your manually configured tunnel is -destroyed, when mrouted exits. - - -\section{Broadcast GRE ``tunnels''.} - -It is possible to set \verb|remote| for GRE tunnel to a multicast -address. Such tunnel becomes {\bf broadcast} tunnel (though word -tunnel is not quite appropriate in this case, it is rather virtual network). -\begin{verbatim} - ip tunnel add Universe local 193.233.7.65 \ - remote 224.66.66.66 ttl 16 - ip addr add 10.0.0.1/16 dev Universe - ip link set Universe up -\end{verbatim} -This tunnel is true broadcast network and broadcast packets are -sent to multicast group 224.66.66.66. By default such tunnel starts -to resolve both IP and IPv6 addresses via ARP/NDISC, so that -if multicast routing is supported in surrounding network, all GRE nodes -will find one another automatically and will form virtual Ethernet-like -broadcast network. If multicast routing does not work, it is unpleasant -but not fatal flaw. 
The tunnel becomes NBMA rather than broadcast network. -You may disable dynamic ARPing by: -\begin{verbatim} - echo 0 > /proc/sys/net/ipv4/neigh/Universe/mcast_solicit -\end{verbatim} -and to add required information to ARP tables manually: -\begin{verbatim} - ip neigh add 10.0.0.2 lladdr 128.6.190.2 dev Universe nud permanent -\end{verbatim} -In this case packets sent to 10.0.0.2 will be encapsulated in GRE -and sent to 128.6.190.2. It is possible to facilitate address resolution -using methods typical for another NBMA networks f.e.\ to start user -level \verb|arpd| daemon, which will maintain database of hosts attached -to GRE virtual network or ask for information -dedicated ARP or NHRP server. - - -Actually, such setup is the most natural for tunneling, -it is really flexible, scalable and easily manageable, so that -it is strongly recommended to be used with GRE tunnels instead of ugly -hack with NBMA mode and \verb|onlink| modifier. Unfortunately, -by historical reasons broadcast mode is not supported by IPIP tunnels, -but this probably will change in future. - - - -\section{Traffic control issues.} - -Tunnels are devices, hence all the power of Linux traffic control -applies to them. The simplest (and the most useful in practice) -example is limiting tunnel bandwidth. The following command: -\begin{verbatim} - tc qdisc add dev tunl0 root tbf \ - rate 128Kbit burst 4K limit 10K -\end{verbatim} -will limit tunneled traffic to 128Kbit with maximal burst size of 4K -and queuing not more than 10K. - -However, you should remember, that tunnels are {\em virtual} devices -implemented in software and true queue management is impossible for them -just because they have no queues. Instead, it is better to create classes -on real physical interfaces and to map tunneled packets to them. -In general case of dynamic routing you should create such classes -on all outgoing interfaces, or, alternatively, -to use option \verb|dev DEV| to bind tunnel to a fixed physical device. 
-In the last case packets will be routed only via specified device -and you need to setup corresponding classes only on it. -Though you have to pay for this convenience, -if routing will change, your tunnel will fail. - -Suppose that CBQ class \verb|1:ABC| has been created on device \verb|eth0| -specially for tunnel \verb|Cisco| with endpoints \verb|S| and \verb|D|. -Now you can select IPIP packets with addresses \verb|S| and \verb|D| -with some classifier and map them to class \verb|1:ABC|. F.e.\ -it is easy to make with \verb|rsvp| classifier: -\begin{verbatim} - tc filter add dev eth0 pref 100 proto ip rsvp \ - session D ipproto ipip filter S \ - classid 1:ABC -\end{verbatim} - -If you want to make more detailed classification of sub-flows -transmitted via tunnel, you can build CBQ subtree, -rooted at \verb|1:ABC| and attach to subroot set of rules parsing -IPIP packets more deeply. - -\end{document} diff --git a/doc/nstat.sgml b/doc/nstat.sgml deleted file mode 100644 index 48cacc69..00000000 --- a/doc/nstat.sgml +++ /dev/null @@ -1,110 +0,0 @@ -<!doctype linuxdoc system> - -<article> - -<title>NSTAT, IFSTAT and RTACCT Utilities -<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/ -<date>some_negative_number, 20 Sep 2001 -<abstract> -<tt/nstat/, <tt/ifstat/ and <tt/rtacct/ are simple tools helping -to monitor kernel snmp counters and network interface statistics. -</abstract> - -<p> These utilities are very similar, so that I describe -them simultaneously, using name <tt/Xstat/ in the places which apply -to all of them. - -<p>The format of the command is: - -<tscreen><verb> - Xstat [ OPTIONS ] [ PATTERN [ PATTERN ... ] ] -</verb></tscreen> - -<p> -<tt/PATTERN/ is shell style pattern, selecting identifier -of SNMP variables or interfaces to show. Variable is displayed -if one of patterns matches its name. If no patterns are given, -<tt/Xstat/ assumes that user wants to see all the variables. 
- -<p> <tt/OPTIONS/ is list of single letter options, using common unix -conventions. - -<itemize> -<item><tt/-h/ - show help page -<item><tt/-?/ - the same, of course -<item><tt/-v/, <tt/-V/ - print version of <tt/Xstat/ and exit -<item><tt/-z/ - dump zero counters too. By default they are not shown. -<item><tt/-a/ - dump absolute values of counters. By default <tt/Xstat/ - calculates increments since the previous use. -<item><tt/-s/ - do not update history, so that the next time you will - see counters including values accumulated to the moment - of this measurement too. -<item><tt/-n/ - do not display anything, only update history. -<item><tt/-r/ - reset history. -<item><tt/-d INTERVAL/ - <tt/Xstat/ is run in daemon mode collecting - statistics. <tt/INTERVAL/ is interval between measurements - in seconds. -<item><tt/-t INTERVAL/ - time interval to average rates. Default value - is 60 seconds. -<item><tt/-e/ - display extended information about errors (<tt/ifstat/ only). -</itemize> - -<p> -History is just dump saved in file <tt>/tmp/.Xstat.uUID</tt> -or in file given by environment variables <tt/NSTAT_HISTORY/, -<tt/IFSTAT_HISTORY/ and <tt/RTACCT_HISTORY/. -Each time when you use <tt/Xstat/ values there are updated. -If you use patterns, only the values which you _really_ see -are updated. If you want to skip an uninteresting period, -use option <tt/-n/, or just output to <tt>/dev/null</tt>. - -<p> -<tt/Xstat/ understands when history is invalidated by system reboot -or source of information switched between different instances -of daemonic <tt/Xstat/ and kernel SNMP tables and does not -use invalid history. - -<p> Beware, <tt/Xstat/ will not produce sane output, -when many processes use it simultaneously. If several processes -under single user need this utility they should use environment -variables to put their history in safe places -or to use it with options <tt/-a -s/. - -<p> -Well, that's all. The utility is very simple, but nevertheless -very handy. 
- -<p> <bf/Output of XSTAT/ -<p> The first line of output is <tt/#/ followed by identifier -of source of information, it may be word <tt/kernel/, when <tt/Xstat/ -gets information from kernel or some dotted decimal number followed -by parameters, when it obtains information from running <tt/Xstat/ daemon. - -<p>In the case of <tt/nstat/ the rest of output consists of three columns: -SNMP MIB identifier, -its value (or increment since previous measurement) and average -rate of increase of the counter per second. <tt/ifstat/ outputs -interface name followed by pairs of counter and rate of its change. - -<p> <bf/Daemonic Xstat/ -<p> <tt/Xstat/ may be started as daemon by any user. This makes sense -to avoid wrapped counters and to obtain reasonable long counters -for large time. Also <tt/Xstat/ daemon calculates average rates. -For the first goal sampling interval (option <tt/-d/) may be large enough, -f.e. for gigabit rates byte counters overflow not more frequently than -each 40 seconds and you may select interval of 20 seconds. -From the other hand, when <tt/Xstat/ is used for estimating rates -interval should be less than averaging period (option <tt/-t/), otherwise -estimation loses in quality. - -Client <tt/Xstat/, before trying to get information from the kernel, -contacts daemon started by this user, then it tries system wide -daemon, which is supposed to be started by superuser. And only if -none of them replied it gets information from kernel. - -<p> <bf/Environment/ -<p> <tt/NSTAT_HISTORY/ - name of history file for <tt/nstat/. -<p> <tt/IFSTAT_HISTORY/ - name of history file for <tt/ifstat/. -<p> <tt/RTACCT_HISTORY/ - name of history file for <tt/rtacct/. 
- -</article> diff --git a/doc/preamble.tex b/doc/preamble.tex deleted file mode 100644 index 80ca5087..00000000 --- a/doc/preamble.tex +++ /dev/null @@ -1,26 +0,0 @@ -\textwidth 6.0in -\textheight 8.5in - -\input SNAPSHOT - -\pagestyle{myheadings} -\markboth{\protect\TITLE}{} -\markright{{\protect\sc iproute2-ss\Draft}} - -% To print it in compact form: both sides on one sheet (psnup -2) -\evensidemargin=\oddsidemargin - -\newenvironment{NB}{\bgroup \vskip 1mm\leftskip 1cm \footnotesize \noindent NB. -}{\par\egroup \vskip 1mm} - -\def\threeonly{[2.3.15+ only] } - -\begin{document} - -\makeatletter -\renewcommand{\@oddhead}{{\protect\sc iproute2-ss\Draft} \hfill \protect\arabic{page}} -\makeatother -\let\oldthefootnote\thefootnote -\def\thefootnote{} -\footnotetext{Copyright \copyright~1999 A.N.Kuznetsov} - diff --git a/doc/rtstat.sgml b/doc/rtstat.sgml deleted file mode 100644 index 07391c39..00000000 --- a/doc/rtstat.sgml +++ /dev/null @@ -1,52 +0,0 @@ -<!doctype linuxdoc system> - -<article> - -<title>RTACCT Utility -<author>Robert Olsson -<date>some_negative_number, 20 Dec 2001 - -<p> -Here is some code for monitoring the route cache. For systems handling high -network load, servers, routers, firewalls etc the route cache and its garbage -collection is crucial. Linux has a solid implementation. - -<p> -The kernel patch (not required since linux-2.4.7) adds statistics counters -from route cache process into -/proc/net/rt_cache_stat. A companion user mode program presents the statistics -in a vmstat or iostat manner. The ratio between cache hits and misses gives -the flow length. - -<p> -Hopefully it can help understanding performance and DoS and other related -issues. 
- -<p> A URL where newer versions of this utility can be (probably) found -is ftp://robur.slu.se/pub/Linux/net-development/rt_cache_stat/ - - -<p><bf/Description/ - -<p>The format of the command is: - -<tscreen><verb> - rtstat [ OPTIONS ] -</verb></tscreen> - -<p> <tt/OPTIONS/ are: - -<itemize> - -<item><tt/-h/, <tt/-help/ - show help page and version of the utility. - -<item><tt/-i INTERVAL/ - interval between snapshots, default value is -2 seconds. - -<item><tt/-s NUMBER/ - whether to print header line. 0 inhibits header line, -1 prescribes to print it once and 2 (this is default setting) forces header -line each 20 lines. - -</itemize> - -</article> diff --git a/doc/ss.sgml b/doc/ss.sgml deleted file mode 100644 index 3024b574..00000000 --- a/doc/ss.sgml +++ /dev/null @@ -1,525 +0,0 @@ -<!doctype linuxdoc system> - -<article> - -<title>SS Utility: Quick Intro -<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/ -<date>some_negative_number, 20 Sep 2001 -<abstract> -<tt/ss/ is one another utility to investigate sockets. -Functionally it is NOT better than <tt/netstat/ combined -with some perl/awk scripts and though it is surely faster -it is not enough to make it much better. :-) -So, stop reading this now and do not waste your time. -Well, certainly, it proposes some functionality, which current -netstat is still not able to do, but surely will soon. -</abstract> - -<sect>Why? - -<p> <tt>/proc</tt> interface is inadequate, unfortunately. -When amount of sockets is enough large, <tt/netstat/ or even -plain <tt>cat /proc/net/tcp/</tt> cause nothing but pains and curses. -In linux-2.4 the disease became worse: even if amount -of sockets is small reading <tt>/proc/net/tcp/</tt> is slow enough. - -This utility presents a new approach, which is supposed to scale -well. I am not going to describe technical details here and -will concentrate on description of the command. 
-The only important thing to say is that it is not so bad idea -to load module <tt/tcp_diag/, which can be found in directory -<tt/Modules/ of <tt/iproute2/. If you do not make this <tt/ss/ -will work, but it falls back to <tt>/proc</tt> and becomes slow -like <tt/netstat/, well, a bit faster yet (see section "Some numbers"). - -<sect>Old news - -<p> -In the simplest form <tt/ss/ is equivalent to netstat -with some small deviations. - -<itemize> -<item><tt/ss -t -a/ dumps all TCP sockets -<item><tt/ss -u -a/ dumps all UDP sockets -<item><tt/ss -w -a/ dumps all RAW sockets -<item><tt/ss -x -a/ dumps all UNIX sockets -</itemize> - -<p> -Option <tt/-o/ shows TCP timers state. -Option <tt/-e/ shows some extended information. -Etc. etc. etc. Seems, all the options of netstat related to sockets -are supported. Though not AX.25 and other bizarres. :-) -If someone wants, he can make support for decnet and ipx. -Some rudimentary support for them is already present in iproute2 libutils, -and I will be glad to see these new members. - -<p> -However, standard functionality is a bit different: - -<p> -The first: without option <tt/-a/ sockets in states -<tt/TIME-WAIT/ and <tt/SYN-RECV/ are skipped too. -It is more reasonable default, I think. - -<p> -The second: format of UNIX sockets is different. It coincides -with tcp/udp. Though standard kernel still does not allow to -see write/read queues and peer address of connected UNIX sockets, -the patch doing this exists. - -<p> -The third: default is to dump only TCP sockets, rather than all of the types. - -<p> -The next: by default it does not resolve numeric host addresses (like <tt/ip/)! -Resolving is enabled with option <tt/-r/. Service names, usually stored -in local files, are resolved by default. Also, if service database -does not contain references to a port, <tt/ss/ queries system -<tt/rpcbind/. RPC services are prefixed with <tt/rpc./ -Resolution of services may be suppressed with option <tt/-n/. 
- -<p> -It does not accept "long" options (I dislike them, sorry). -So, address family is given with family identifier following -option <tt/-f/ to be aligned to iproute2 conventions. -Mostly, it is to allow option parser to parse -addresses correctly, but as side effect it really limits dumping -to sockets supporting only given family. Option <tt/-A/ followed -by list of socket tables to dump is also supported. -Logically, id of socket table is different of _address_ family, which is -another point of incompatibility. So, id is one of -<tt/all/, <tt/tcp/, <tt/udp/, -<tt/raw/, <tt/inet/, <tt/unix/, <tt/packet/, <tt/netlink/. See? -Well, <tt/inet/ is just abbreviation for <tt/tcp|udp|raw/ -and it is not difficult to guess that <tt/packet/ allows -to look at packet sockets. Actually, there are also some other abbreviations, -f.e. <tt/unix_dgram/ selects only datagram UNIX sockets. - -<p> -The next: well, I still do not know. :-) - - - - -<sect>Time to talk about new functionality. - -<p>It is builtin filtering of socket lists. - -<sect1> Filtering by state. - -<p> -<tt/ss/ allows to filter socket states, using keywords -<tt/state/ and <tt/exclude/, followed by some state -identifier. - -<p> -State identifier are standard TCP state names (not listed, -they are useless for you if you already do not know them) -or abbreviations: - -<itemize> -<item><tt/all/ - for all the states -<item><tt/bucket/ - for TCP minisockets (<tt/TIME-WAIT|SYN-RECV/) -<item><tt/big/ - all except for minisockets -<item><tt/connected/ - not closed and not listening -<item><tt/synchronized/ - connected and not <tt/SYN-SENT/ -</itemize> - -<p> - F.e. to dump all tcp sockets except <tt/SYN-RECV/: - -<tscreen><verb> - ss exclude SYN-RECV -</verb></tscreen> - -<p> - If neither <tt/state/ nor <tt/exclude/ directives - are present, - state filter defaults to <tt/all/ with option <tt/-a/ - or to <tt/all/, - excluding listening, syn-recv, time-wait and closed sockets. 
- -<sect1> Filtering by addresses and ports. - -<p> -Option list may contain address/port filter. -It is boolean expression which consists of boolean operation -<tt/or/, <tt/and/, <tt/not/ and predicates. -Actually, all the flavors of names for boolean operations are eaten: -<tt/&/, <tt/&&/, <tt/|/, <tt/||/, <tt/!/, but do not forget -about special sense given to these symbols by unix shells and escape -them correctly, when used from command line. - -<p> -Predicates may be of the following kinds: - -<itemize> -<item>A. Address/port match, where address is checked against mask - and port is either wildcard or exact. It is one of: - -<tscreen><verb> - dst prefix:port - src prefix:port - src unix:STRING - src link:protocol:ifindex - src nl:channel:pid -</verb></tscreen> - - Both prefix and port may be absent or replaced with <tt/*/, - which means wildcard. UNIX socket use more powerful scheme - matching to socket names by shell wildcards. Also, prefixes - unix: and link: may be omitted, if address family is evident - from context (with option <tt/-x/ or with <tt/-f unix/ - or with <tt/unix/ keyword) - -<p> - F.e. - -<tscreen><verb> - dst 10.0.0.1 - dst 10.0.0.1: - dst 10.0.0.1/32: - dst 10.0.0.1:* -</verb></tscreen> - are equivalent and mean socket connected to - any port on host 10.0.0.1 - -<tscreen><verb> - dst 10.0.0.0/24:22 -</verb></tscreen> - sockets connected to port 22 on network - 10.0.0.0...255. - -<p> - Note that port separated of address with colon, which creates - troubles with IPv6 addresses. Generally, we interpret the last - colon as splitting port. To allow to give IPv6 addresses, - trick like used in IPv6 HTTP URLs may be used: - -<tscreen><verb> - dst [::1] -</verb></tscreen> - are sockets connected to ::1 on any port - -<p> - Another way is <tt/dst ::1/128/. / helps to understand that - colon is part of IPv6 address. - -<p> - Now we can add another alias for <tt/dst 10.0.0.1/: - <tt/dst [10.0.0.1]/. :-) - -<p> Address may be a DNS name. 
In this case all the addresses are looked - up (in all the address families, if it is not limited by option <tt/-f/ - or special address prefix <tt/inet:/, <tt/inet6/) and resulting - expression is <tt/or/ over all of them. - -<item> B. Port expressions: -<tscreen><verb> - dport >= :1024 - dport != :22 - sport < :32000 -</verb></tscreen> - etc. - - All the relations: <tt/</, <tt/>/, <tt/=/, <tt/>=/, <tt/=/, <tt/==/, - <tt/!=/, <tt/eq/, <tt/ge/, <tt/lt/, <tt/ne/... - Use variant which you like more, but not forget to escape special - characters when typing them in command line. :-) - - Note that port number syntactically coincides to the case A! - You may even add an IP address, but it will not participate - incomparison, except for <tt/==/ and <tt/!=/, which are equivalent - to corresponding predicates of type A. F.e. -<p> -<tt/dst 10.0.0.1:22/ - is equivalent to <tt/dport eq 10.0.0.1:22/ - and - <tt/not dst 10.0.0.1:22/ is equivalent to - <tt/dport neq 10.0.0.1:22/ - -<item>C. Keyword <tt/autobound/. It matches to sockets bound automatically - on local system. - -</itemize> - - -<sect> Examples - -<p> -<itemize> -<item>1. List all the tcp sockets in state <tt/FIN-WAIT-1/ for our apache - to network 193.233.7/24 and look at their timers: - -<tscreen><verb> - ss -o state fin-wait-1 \( sport = :http or sport = :https \) \ - dst 193.233.7/24 -</verb></tscreen> - - Oops, forgot to say that missing logical operation is - equivalent to <tt/and/. - -<item> 2. Well, now look at the rest... - -<tscreen><verb> - ss -o excl fin-wait-1 - ss state fin-wait-1 \( sport neq :http and sport neq :https \) \ - or not dst 193.233.7/24 -</verb></tscreen> - - Note that we have to do _two_ calls of ss to do this. - State match is always anded to address/port match. 
- The reason for this is purely technical: ss does fast skip of - not matching states before parsing addresses and I consider the - ability to skip fastly gobs of time-wait and syn-recv sockets - as more important than logical generality. - -<item> 3. So, let's look at all our sockets using autobound ports: - -<tscreen><verb> - ss -a -A all autobound -</verb></tscreen> - - -<item> 4. And eventually find all the local processes connected - to local X servers: - -<tscreen><verb> - ss -xp dst "/tmp/.X11-unix/*" -</verb></tscreen> - - Pardon, this does not work with current kernel, patching is required. - But we still can look at server side: - -<tscreen><verb> - ss -x src "/tmp/.X11-unix/*" -</verb></tscreen> - -</itemize> - - -<sect> Returning to ground: real manual - -<p> -<sect1> Command arguments - -<p> General format of arguments to <tt/ss/ is: - -<tscreen><verb> - ss [ OPTIONS ] [ STATE-FILTER ] [ ADDRESS-FILTER ] -</verb></tscreen> - -<sect2><tt/OPTIONS/ -<p> <tt/OPTIONS/ is list of single letter options, using common unix -conventions. - -<itemize> -<item><tt/-h/ - show help page -<item><tt/-?/ - the same, of course -<item><tt/-v/, <tt/-V/ - print version of <tt/ss/ and exit -<item><tt/-s/ - print summary statistics. This option does not parse -socket lists obtaining summary from various sources. It is useful -when amount of sockets is so huge that parsing <tt>/proc/net/tcp</tt> -is painful. -<item><tt/-D FILE/ - do not display anything, just dump raw information -about TCP sockets to <tt/FILE/ after applying filters. If <tt/FILE/ is <tt/-/ -<tt/stdout/ is used. -<item><tt/-F FILE/ - read continuation of filter from <tt/FILE/. -Each line of <tt/FILE/ is interpreted like single command line option. -If <tt/FILE/ is <tt/-/ <tt/stdin/ is used. -<item><tt/-r/ - try to resolve numeric address/ports -<item><tt/-n/ - do not try to resolve ports -<item><tt/-o/ - show some optional information, f.e. 
TCP timers -<item><tt/-i/ - show some infomration specific to TCP (RTO, congestion -window, slow start threshould etc.) -<item><tt/-e/ - show even more optional information -<item><tt/-m/ - show extended information on memory used by the socket. -It is available only with <tt/tcp_diag/ enabled. -<item><tt/-p/ - show list of processes owning the socket -<item><tt/-f FAMILY/ - default address family used for parsing addresses. - Also this option limits listing to sockets supporting - given address family. Currently the following families - are supported: <tt/unix/, <tt/inet/, <tt/inet6/, <tt/link/, - <tt/netlink/. -<item><tt/-4/ - alias for <tt/-f inet/ -<item><tt/-6/ - alias for <tt/-f inet6/ -<item><tt/-0/ - alias for <tt/-f link/ -<item><tt/-A LIST-OF-TABLES/ - list of socket tables to dump, separated - by commas. The following identifiers are understood: - <tt/all/, <tt/inet/, <tt/tcp/, <tt/udp/, <tt/raw/, - <tt/unix/, <tt/packet/, <tt/netlink/, <tt/unix_dgram/, - <tt/unix_stream/, <tt/packet_raw/, <tt/packet_dgram/. -<item><tt/-x/ - alias for <tt/-A unix/ -<item><tt/-t/ - alias for <tt/-A tcp/ -<item><tt/-u/ - alias for <tt/-A udp/ -<item><tt/-w/ - alias for <tt/-A raw/ -<item><tt/-a/ - show sockets of all the states. By default sockets - in states <tt/LISTEN/, <tt/TIME-WAIT/, <tt/SYN_RECV/ - and <tt/CLOSE/ are skipped. -<item><tt/-l/ - show only sockets in state <tt/LISTEN/ -</itemize> - -<sect2><tt/STATE-FILTER/ - -<p><tt/STATE-FILTER/ allows to construct arbitrary set of -states to match. Its syntax is sequence of keywords <tt/state/ -and <tt/exclude/ followed by identifier of state. -Available identifiers are: - -<p> -<itemize> -<item> All standard TCP states: <tt/established/, <tt/syn-sent/, -<tt/syn-recv/, <tt/fin-wait-1/, <tt/fin-wait-2/, <tt/time-wait/, -<tt/closed/, <tt/close-wait/, <tt/last-ack/, <tt/listen/ and <tt/closing/. 
- -<item><tt/all/ - for all the states -<item><tt/connected/ - all the states except for <tt/listen/ and <tt/closed/ -<item><tt/synchronized/ - all the <tt/connected/ states except for -<tt/syn-sent/ -<item><tt/bucket/ - states, which are maintained as minisockets, i.e. -<tt/time-wait/ and <tt/syn-recv/. -<item><tt/big/ - opposite to <tt/bucket/ -</itemize> - -<sect2><tt/ADDRESS_FILTER/ - -<p><tt/ADDRESS_FILTER/ is boolean expression with operations <tt/and/, <tt/or/ -and <tt/not/, which can be abbreviated in C style f.e. as <tt/&/, -<tt/&&/. - -<p> -Predicates check socket addresses, both local and remote. -There are the following kinds of predicates: - -<itemize> -<item> <tt/dst ADDRESS_PATTERN/ - matches remote address and port -<item> <tt/src ADDRESS_PATTERN/ - matches local address and port -<item> <tt/dport RELOP PORT/ - compares remote port to a number -<item> <tt/sport RELOP PORT/ - compares local port to a number -<item> <tt/autobound/ - checks that socket is bound to an ephemeral - port -</itemize> - -<p><tt/RELOP/ is some of <tt/<=/, <tt/>=/, <tt/==/ etc. -To make this more convenient for use in unix shell, alphabetic -FORTRAN-like notations <tt/le/, <tt/gt/ etc. are accepted as well. - -<p>The format and semantics of <tt/ADDRESS_PATTERN/ depend on address -family. - -<itemize> -<item><tt/inet/ - <tt/ADDRESS_PATTERN/ consists of IP prefix, optionally -followed by colon and port. If prefix or port part is absent or replaced -with <tt/*/, this means wildcard match. -<item><tt/inet6/ - The same as <tt/inet/, only prefix refers to an IPv6 -address. Unlike <tt/inet/ colon becomes ambiguous, so that <tt/ss/ allows -to use scheme, like used in URLs, where address is surrounded with -<tt/[/ ... <tt/]/. -<item><tt/unix/ - <tt/ADDRESS_PATTERN/ is shell-style wildcard. -<item><tt/packet/ - format looks like <tt/inet/, only interface index -stays instead of port and link layer protocol id instead of address. 
-<item><tt/netlink/ - format looks like <tt/inet/, only socket pid -stays instead of port and netlink channel instead of address. -</itemize> - -<p><tt/PORT/ is syntactically <tt/ADDRESS_PATTERN/ with wildcard -address part. Certainly, it is undefined for UNIX sockets. - -<sect1> Environment variables - -<p> -<tt/ss/ allows to change source of information using various -environment variables: - -<p> -<itemize> -<item> <tt/PROC_SLABINFO/ to override <tt>/proc/slabinfo</tt> -<item> <tt/PROC_NET_TCP/ to override <tt>/proc/net/tcp</tt> -<item> <tt/PROC_NET_UDP/ to override <tt>/proc/net/udp</tt> -<item> etc. -</itemize> - -<p> -Variable <tt/PROC_ROOT/ allows to change root of all the <tt>/proc/</tt> -hierarchy. - -<p> -Variable <tt/TCPDIAG_FILE/ prescribes to open a file instead of -requesting kernel to dump information about TCP sockets. - - -<p> This option is used mainly to investigate bug reports, -when dumps of files usually found in <tt>/proc/</tt> are received -by e-mail. - -<sect1> Output format - -<p>Six columns. The first is <tt/Netid/, it denotes socket type and -transport protocol, when it is ambiguous: <tt/tcp/, <tt/udp/, <tt/raw/, -<tt/u_str/ is abbreviation for <tt/unix_stream/, <tt/u_dgr/ for UNIX -datagram sockets, <tt/nl/ for netlink, <tt/p_raw/ and <tt/p_dgr/ for -raw and datagram packet sockets. This column is optional, it will -be hidden, if filter selects a unique netid. - -<p> -The second column is <tt/State/. Socket state is displayed here. -The names are standard TCP names, except for <tt/UNCONN/, which -cannot happen for TCP, but normal for not connected sockets -of other types. Again, this column can be hidden. - -<p> -Then two columns (<tt/Recv-Q/ and <tt/Send-Q/) showing amount of data -queued for receive and transmit. - -<p> -And the last two columns display local address and port of the socket -and its peer address, if the socket is connected. 
- -<p> -If options <tt/-o/, <tt/-e/ or <tt/-p/ were given, options are -displayed not in fixed positions but separated by spaces pairs: -<tt/option:value/. If value is not a single number, it is presented -as list of values, enclosed to <tt/(/ ... <tt/)/ and separated with -commas. F.e. - -<tscreen><verb> - timer:(keepalive,111min,0) -</verb></tscreen> -is typical format for TCP timer (option <tt/-o/). - -<tscreen><verb> - users:((X,113,3)) -</verb></tscreen> -is typical for list of users (option <tt/-p/). - - -<sect>Some numbers - -<p> -Well, let us use <tt/pidentd/ and a tool <tt/ibench/ to measure -its performance. It is 30 requests per second here. Nothing to test, -it is too slow. OK, let us patch pidentd with patch from directory -Patches. After this it handles about 4300 requests per second -and becomes handy tool to pollute socket tables with lots of timewait -buckets. - -<p> -So, each test starts from pollution tables with 30000 sockets -and then doing full dump of the table piped to wc and measuring -timings with time: - -<p>Results: - -<itemize> -<item> <tt/netstat -at/ - 15.6 seconds -<item> <tt/ss -atr/, but without <tt/tcp_diag/ - 5.4 seconds -<item> <tt/ss -atr/ with <tt/tcp_diag/ - 0.47 seconds -</itemize> - -No comments. Though one comment is necessary, most of time -without <tt/tcp_diag/ is wasted inside kernel with completely -blocked networking. More than 10 seconds, yes. <tt/tcp_diag/ -does the same work for 100 milliseconds of system time. 
- -</article> diff --git a/doc/tc-filters.tex b/doc/tc-filters.tex deleted file mode 100644 index 54cc0c99..00000000 --- a/doc/tc-filters.tex +++ /dev/null @@ -1,514 +0,0 @@ -\documentclass[12pt,twoside]{article} - -\usepackage[hidelinks]{hyperref} % \url -\usepackage{booktabs} % nicer tabulars -\usepackage{fancyvrb} -\usepackage{fullpage} -\usepackage{float} - -\newcommand{\iface}{\textit} -\newcommand{\cmd}{\texttt} -\newcommand{\man}{\textit} -\newcommand{\qdisc}{\texttt} -\newcommand{\filter}{\texttt} - -\begin{document} -\title{QoS in Linux with TC and Filters} -\author{Phil Sutter (phil@nwl.cc)} -\date{January 2016} -\maketitle - -Standard practice when transmitting packets over a medium which may block (due -to congestion, e.g.) is to use a queue which temporarily holds these packets. In -Linux, this queueing approach is where QoS happens: A Queueing Discipline -(qdisc) holds multiple packet queues with different priorities for dequeueing to -the network driver. The classification (i.e. deciding which queue a packet -should go into) is typically done based on Type Of Service (IPv4) or Traffic -Class (IPv6) header fields but depending on qdisc implementation, might be -controlled by the user as well. - -Qdiscs come in two flavors, classful or classless. While classless qdiscs are -not as flexible as classful ones, they also require much less customizing. Often -it is enough to just attach them to an interface, without exact knowledge of -what is done internally. Classful qdiscs are the exact opposite: flexible in -application, they are often not even usable without insightful configuration. - -As the name implies, classful qdiscs provide configurable classes to sort -traffic into. In it's basic form, this is not much different than, say, the -classless \qdisc{pfifo\_fast} which holds three queues and classifies per -packet upon priority field. Though typically classes go beyond that by -supporting nesting and additional characteristics like e.g. 
maximum traffic -rate or quantum. - -When it comes to controlling the classification process, filters come into play. -They attach to the parent of a set of classes (i.e. either the qdisc itself or -a parent class) and specify how a packet (or it's associated flow) has to look -like in order to suit a given class. To overcome this simplification, it is -possible to attach multiple filters to the same parent, which then consults each -of them in row until the first one accepts the packet. - -Before getting into detail about what filters there are and how to use them, a -simple setup of a qdisc with classes is necessary: -\begin{figure}[H] -\begin{Verbatim} - .-------------------------------------------------------. - | | - | HTB | - | | - | .----------------------------------------------------.| - | | || - | | Class 1:1 || - | | || - | | .---------------..---------------..---------------.|| - | | | || || ||| - | | | Class 1:10 || Class 1:20 || Class 1:30 ||| - | | | || || ||| - | | | .------------.|| .------------.|| .------------.||| - | | | | ||| | ||| | |||| - | | | | fq_codel ||| | fq_codel ||| | fq_codel |||| - | | | | ||| | ||| | |||| - | | | '------------'|| '------------'|| '------------'||| - | | '---------------''---------------''---------------'|| - | '----------------------------------------------------'| - '-------------------------------------------------------' -\end{Verbatim} -\end{figure} -\noindent -The following commands establish the basic setup shown: -\begin{Verbatim} -(1) # tc qdisc replace dev eth0 root handle 1: htb default 30 -(2) # tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit -(3) # alias tclass='tc class add dev eth0 parent 1:1' -(4) # tclass classid 1:10 htb rate 1mbit ceil 20mbit prio 1 -(4) # tclass classid 1:20 htb rate 90mbit ceil 95mbit prio 2 -(4) # tclass classid 1:30 htb rate 1mbit ceil 95mbit prio 3 -(5) # tc qdisc add dev eth0 parent 1:10 fq_codel -(5) # tc qdisc add dev eth0 parent 1:20 fq_codel -(5) # tc qdisc 
add dev eth0 parent 1:30 fq_codel -\end{Verbatim} -A little explanation for the unfamiliar reader: -\begin{enumerate} -\item Replace the root qdisc of \iface{eth0} by an instance of \qdisc{HTB}. - Specifying the handle is necessary so it can be referenced in consecutive - calls to \cmd{tc}. The default class for unclassified traffic is set to - 30. -\item Create a single top-level class with handle 1:1 which limits the total - bandwidth allowed to 95mbit/s. It is assumed that \iface{eth0} is a 100mbit/s link, - staying a little below that helps to keep the main point of enqueueing in - the qdisc layer instead of the interface hardware queue or at another - bottleneck in the network. -\item Define an alias for the common part of the remaining three calls in order - to improve readability. This means all remaining classes are attached to the - common parent class from (2). -\item Create three child classes for different uses: Class 1:10 has highest - priority but is tightly limited in bandwidth - fine for interactive - connections. Class 1:20 has mid priority and high guaranteed bandwidth, for - high priority bulk traffic. Finally, there's the default class 1:30 with - lowest priority, low guaranteed bandwidth and the ability to use the full - link in case it's unused otherwise. This should be fine for uninteresting - traffic not explicitly taken care of. -\item Attach a leaf qdisc to each of the child classes created in (4). Since - \qdisc{HTB} by default attaches \qdisc{pfifo} as leaf qdisc, this step is optional. Still, - the fairness between different flows provided by the classless \qdisc{fq\_codel} is - worth the effort. -\end{enumerate} -More information about the qdiscs and fine-tuning parameters can be found in -\man{tc-htb(8)} and \man{tc-fq\_codel(8)}. - -Without any additional setup done, now all traffic leaving \iface{eth0} is shaped to -95mbit/s and directed through class 1:30. 
This can be verified by looking at the -\texttt{Sent} field of the class statistics printed via \cmd{tc -s class show dev eth0}: -Only the root class 1:1 and its child 1:30 should show any traffic. - - -\section*{Finally time to start filtering!} - -Let's begin with a simple one, i.e. reestablishing what \qdisc{pfifo\_fast} did -automatically based on TOS/Priority field. Linux internally translates the -header field into the priority field of struct skbuff, which -\qdisc{pfifo\_fast} uses for -classification. \man{tc-prio(8)} contains a table listing the priority (and -ultimately, \qdisc{pfifo\_fast} queue index) each TOS value is being translated into. -Here is a shorter version: -\begin{center} -\begin{tabular}{lll} -TOS Values & Linux Priority (Number) & Queue Index \\ -\midrule -0x0 - 0x6 & Best Effort (0) & 1 \\ -0x8 - 0xe & Bulk (2) & 2 \\ -0x10 - 0x16 & Interactive (6) & 0 \\ -0x18 - 0x1e & Interactive Bulk (4) & 1 \\ -\end{tabular} -\end{center} -Using the \filter{basic} filter, it is possible to match packets based on that skbuff -field, which has the added benefit of being IP version agnostic. Since the -\qdisc{HTB} setup above defaults to class ID 1:30, the Bulk priority can be -ignored. The \filter{basic} filter allows to combine matches, therefore we get along -with only two filters: -\begin{Verbatim} -# tc filter add dev eth0 parent 1: basic \ - match 'meta(priority eq 6)' classid 1:10 -# tc filter add dev eth0 parent 1: basic \ - match 'meta(priority eq 0)' \ - or 'meta(priority eq 4)' classid 1:20 -\end{Verbatim} -A detailed description of the \filter{basic} filter and the ematch syntax it uses can be -found in \man{tc-basic(8)} and \man{tc-ematch(8)}. - -Obviously, this first example cries for optimization. 
A simple one would be to -just change the default class from 1:30 to 1:20, so filters are only needed for -Bulk and Interactive priorities: -\begin{Verbatim} -# tc filter add dev eth0 parent 1: basic \ - match 'meta(priority eq 6)' classid 1:10 -# tc filter add dev eth0 parent 1: basic \ - match 'meta(priority eq 2)' classid 1:20 -\end{Verbatim} -Given that class IDs are random, choosing them wisely allows for a direct -mapping. So first, recreate the qdisc and classes configuration: -\begin{Verbatim} -# tc qdisc replace dev eth0 root handle 1: htb default 10 -# tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit -# alias tclass='tc class add dev eth0 parent 1:1' -# tclass classid 1:16 htb rate 1mbit ceil 20mbit prio 1 -# tclass classid 1:10 htb rate 90mbit ceil 95mbit prio 2 -# tclass classid 1:12 htb rate 1mbit ceil 95mbit prio 3 -# tc qdisc add dev eth0 parent 1:16 fq_codel -# tc qdisc add dev eth0 parent 1:10 fq_codel -# tc qdisc add dev eth0 parent 1:12 fq_codel -\end{Verbatim} -This is basically identical to above, but with changed leaf class IDs and the -second priority class being the default. Using the \filter{flow} filter with it's \texttt{map} -functionality, a single filter command is enough: -\begin{Verbatim} -# tc filter add dev eth0 parent 1: handle 0x1337 flow \ - map key priority baseclass 1:10 -\end{Verbatim} -The \filter{flow} filter now uses the priority value to construct a destination class ID -by adding it to the value of \texttt{baseclass}. While this works for priority values of -0, 2 and 6, it will result in non-existent class ID 1:14 for Interactive Bulk -traffic. In that case, the \qdisc{HTB} default applies so that traffic goes into class -ID 1:10 just as intended. Please note that specifying a handle is a mandatory -requirement by the \filter{flow} filter, although I didn't see where one would use that -later. For more information about \filter{flow}, see \man{tc-flow(8)}. 
- -While \filter{flow} and \filter{basic} filters are relatively easy to apply and understand, they -are as well quite limited to their intended purpose. A more flexible option is -the \filter{u32} filter, which allows to match on arbitrary parts of the packet data - -yet only on that, not any meta data associated to it by the kernel (with the -exception of firewall mark value). So in order to continue this little -exercise with \filter{u32}, we have to base classification directly upon the actual TOS -value. An intuitive attempt might look like this: -\begin{Verbatim} -# alias tcfilter='tc filter add dev eth0 parent 1:' -# tcfilter u32 match ip dsfield 0x10 0x1e classid 1:16 -# tcfilter u32 match ip dsfield 0x12 0x1e classid 1:16 -# tcfilter u32 match ip dsfield 0x14 0x1e classid 1:16 -# tcfilter u32 match ip dsfield 0x16 0x1e classid 1:16 -# tcfilter u32 match ip dsfield 0x8 0x1e classid 1:12 -# tcfilter u32 match ip dsfield 0xa 0x1e classid 1:12 -# tcfilter u32 match ip dsfield 0xc 0x1e classid 1:12 -# tcfilter u32 match ip dsfield 0xe 0x1e classid 1:12 -\end{Verbatim} -The obvious drawback here is the number of filters needed. And without the -default class, eight more filters would be necessary. This also has performance -implications: A packet with TOS value 0xe will be checked eight times in total -in order to determine its destination class. While there's not much to be done -about the number of filters, at least the performance problem can be eliminated -by using \filter{u32}'s hash table support: -\begin{Verbatim} -# tc filter add dev eth0 parent 1: prio 99 handle 1: u32 divisor 16 -\end{Verbatim} -This creates a hash table with 16 buckets. The table size is arbitrary, but not -random: Since the first bit of the TOS field is not interesting, it can be -ignored and therefore the range of values to consider is just [0;15], i.e. a -number of 16 different values. 
The next step is to populate the hash table: -\begin{Verbatim} -# alias tcfilter='tc filter add dev eth0 parent 1: prio 99' -# tcfilter u32 match u8 0 0 ht 1:0: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:1: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:2: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:3: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:4: classid 1:12 -# tcfilter u32 match u8 0 0 ht 1:5: classid 1:12 -# tcfilter u32 match u8 0 0 ht 1:6: classid 1:12 -# tcfilter u32 match u8 0 0 ht 1:7: classid 1:12 -# tcfilter u32 match u8 0 0 ht 1:8: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:9: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:a: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:b: classid 1:16 -# tcfilter u32 match u8 0 0 ht 1:c: classid 1:10 -# tcfilter u32 match u8 0 0 ht 1:d: classid 1:10 -# tcfilter u32 match u8 0 0 ht 1:e: classid 1:10 -# tcfilter u32 match u8 0 0 ht 1:f: classid 1:10 -\end{Verbatim} -The parameter \texttt{ht} denotes the hash table and bucket the filter should be added -to. Since the first TOS bit is ignored, it's value has to be divided by two in -order to get to the bucket it maps to. E.g. a TOS value of 0x10 will therefore -map to bucket 0x8. For the sake of completeness, all possible values are mapped -and therefore a configurable default class is not required. Note that the used -match expression is not necessary, but mandatory. Therefore anything that -matches any packet will suffice. Finally, a filter which links to the defined -hash table is needed: -\begin{Verbatim} -# tc filter add dev eth0 parent 1: prio 1 protocol ip u32 \ - link 1: hashkey mask 0x001e0000 match u8 0 0 -\end{Verbatim} -Here again, the actual match statement is not necessary, but syntactically -required. All the magic lies within the \texttt{hashkey} parameter, which defines which -part of the packet should be used directly as hash key. 
Here's a drawing of the -first four bytes of the IPv4 header, with the area selected by \texttt{hashkey mask} -highlighted: -\begin{figure}[H] -\begin{Verbatim} - 0 1 2 3 - .-----------------------------------------------------------------. - | | | ######## | | | - | Version| IHL | #DSCP### | ECN| Total Length | - | | | ######## | | | - '-----------------------------------------------------------------' -\end{Verbatim} -\end{figure} -\noindent -This may look confusing at first, but keep in mind that bit- as well as -byte-ordering here is LSB while the mask value is written in MSB we humans use. -Therefore reading the mask is done like so, starting from left: -\begin{enumerate} -\item Skip the first byte (which contains Version and IHL fields). -\item Skip the lowest bit of the second byte (0x1e is even). -\item Mark the four following bits (0x1e is 11110 in binary). -\item Skip the remaining three bits of the second byte as well as the remaining two - bytes. -\end{enumerate} -Before doing the lookup, the kernel right-shifts the masked value by the amount -of zero-bits in \texttt{mask}, which implicitly also does the division by two which the -hash table depends on. With this setup, every packet has to pass exactly two -filters to be classified. Note that this filter is limited to IPv4 packets: Due -to the related Traffic Class field being at a different offset in the packet, it -would not work for IPv6. To use the same setup for IPv6 as well, a second -entry-level filter is necessary: -\begin{Verbatim} -# tc filter add dev eth0 parent 1: prio 2 protocol ipv6 u32 \ - link 1: hashkey mask 0x01e00000 match u8 0 0 -\end{Verbatim} -For illustration purposes, here again is a drawing of the first four bytes of -the IPv6 header, again with masked area highlighted: -\begin{figure}[H] -\begin{Verbatim} - 0 1 2 3 - .-----------------------------------------------------------------. 
- | | ######## | | - | Version| #Traffic Class| Flow Label | - | | ######## | | - '-----------------------------------------------------------------' -\end{Verbatim} -\end{figure} -\noindent -Reading the mask value is analogous to IPv4 with the added complexity that -Traffic Class spans over two bytes. Yet, for comparison there's a simple trick: -IPv6 has the interesting field shifted by four bits to the left, and the new -mask's value is shifted by the same amount. For further information about -\filter{u32} and what can be done with it, consult it's man page -\man{tc-u32(8)}. - -Of course, the kernel provides many more filters than just \filter{basic}, -\filter{flow} and \filter{u32} which have been presented above. As of now, the -remaining ones are: -\begin{description} -\item[bpf] - Filtering using Berkeley Packet Filter programs. The program's return - code determines the packet's destination class ID. - -\item[cgroup] - Filter packets based on control groups. This is only useful for packets - originating from the local host, as control groups only exist in that - scope. - -\item[flower] - An extended variant of the flow filter. - -\item[fw] - Matches on firewall mark values previously assigned to the packet by - netfilter (or a filter action, see below for details). This allows to - export the classification algorithm into netfilter, which is very - convenient if appropriate rules exist on the same system in there - already. - -\item[route] - Filter packets based on matching routing table entry. Basically - equivalent to the \texttt{fw} filter above, to make use of an already existing - extensive routing table setup. - -\item[rsvp, rsvp6] - Implementation of the Resource Reservation Protocol in Linux, to react - upon requests sent by an RSVP daemon. - -\item[tcindex] - Match packets based on tcindex value, which is usually set by the dsmark - qdisc. This is part of an approach to support Differentiated Services in - Linux, which is another topic on it's own. 
-\end{description} - - -\section*{Filter Actions} - -The tc filter framework provides the infrastructure to another extensible set of -tools as well, namely tc actions. As the name suggests, they allow to do things -with packets (or associated data). (The list of) Actions are part of a given -filter. If it matches, each action it contains is executed in order before -returning the classification result. Since the action has direct access to the -latter, it is in theory possible for an action to react upon or even change the -filtering result - as long as the packet matched, of course. Yet none of the -currently in-tree actions make use of this. - -The Generic Actions framework originally evolved out of the filters' ability to -police traffic to a given maximum bandwidth. One common use case for that is to -limit ingress traffic, dropping packets which exceed the threshold. A classic -setup example is like so: -\begin{Verbatim} -# tc qdisc add dev eth0 handle ffff: ingress -# tc filter add dev eth0 parent ffff: u32 \ - match u32 0 0 - police rate 1mbit burst 100k -\end{Verbatim} -The ingress qdisc is not a real one, but merely a point of reference for filters -to attach to which should get applied to incoming traffic. The \filter{u32} filter added -above matches on any packet and therefore limits the total incoming bandwidth to -1mbit/s, allowing bursts of up to 100kbytes. Using the new syntax, the filter -command changes slightly: -\begin{Verbatim} -# tc filter add dev eth0 parent ffff: u32 \ - match u32 0 0 \ - action police rate 1mbit burst 100k -\end{Verbatim} -The important detail is that this syntax allows to define multiple actions. -E.g. 
for testing purposes, it is possible to redirect exceeding traffic to the -loopback interface instead of dropping it: -\begin{Verbatim} -# tc filter add dev eth0 parent ffff: u32 \ - match u32 0 0 \ - action police rate 1mbit burst 100k conform-exceed pipe \ - action mirred egress redirect dev lo -\end{Verbatim} -The added parameter \texttt{conform-exceed pipe} tells the police action to allow for -further actions to handle the exceeding packet. - -Apart from \texttt{police} and \texttt{mirred} actions, there are a few more. Here's a full -list of the currently implemented ones: -\begin{description} -\item[bpf] - Apply a Berkeley Packet Filter program to the packet. - -\item[connmark] - Set the packet's firewall mark to that of its connection. This works by - searching the conntrack table for a matching entry. If found, the mark - is restored. - -\item[csum] - Trigger recalculation of packet checksums. The supported protocols are: - IPv4, ICMP, IGMP, TCP, UDP and UDPLite. - -\item[ipt] - Pass the packet to an iptables target. This allows to use iptables - extensions directly instead of having to go the extra mile via setting - an arbitrary firewall mark and matching on that from within netfilter. - -\item[mirred] - Mirror or redirect packets. This is often combined with the ifb pseudo - device to share a common QoS setup between multiple interfaces or even - ingress traffic. - -\item[nat] - Perform stateless Network Address Translation. This is certainly not - complete and therefore inferior to NAT using iptables: Although the - kernel module decides between TCP, UDP and ICMP traffic, it does not - handle typical problematic protocols such as active FTP or SIP. - -\item[pedit] - Generic packet editing. This allows to alter arbitrary bytes of the - packet, either by specifying an offset into the packet or by naming a - packet header and field name to change. Currently, the latter is - implemented only for IPv4. 
- -\item[police] - Apply a bandwidth rate limiting policy. Packets exceeding it are dropped - by default, but may optionally be handled differently. - -\item[simple] - This is rather an example than real action. All it does is print a - user-defined string together with a packet counter. Useful maybe for - debugging when filter statistics are not available or too complicated. - -\item[skbedit] - Edit associated packet data, supports changing queue mapping, priority - field and firewall mark value. - -\item[vlan] - Add/remove a VLAN header to/from the packet. This might serve as - alternative to using 802.1Q pseudo-interfaces in combination with - routing rules when e.g. packets for a given destination need to be - encapsulated. -\end{description} - - -\section*{Intermediate Functional Block} - -The Intermediate Functional Block (\texttt{ifb}) pseudo network interface acts as a QoS -concentrator for multiple different sources of traffic. Packets from or to other -interfaces have to be redirected to it using the \texttt{mirred} action in order to be -handled, regularly routed traffic will be dropped. This way, a single stack of -qdiscs, classes and filters can be shared between multiple interfaces. - -Here's a simple example to feed incoming traffic from multiple interfaces -through a Stochastic Fairness Queue (\qdisc{sfq}): -\begin{Verbatim} -(1) # modprobe ifb -(2) # ip link set ifb0 up -(3) # tc qdisc add dev ifb0 root sfq -\end{Verbatim} -The first step is to load the \texttt{ifb} kernel module (1). By default, this will -create two ifb devices: \iface{ifb0} and \iface{ifb1}. After setting -\iface{ifb0} up in (2), the root -qdisc is replaced by \qdisc{sfq} in (3). Finally, one can start redirecting ingress -traffic to \iface{ifb0}, e.g. 
from \iface{eth0}: -\begin{Verbatim} -# tc qdisc add dev eth0 handle ffff: ingress -# tc filter add dev eth0 parent ffff: u32 \ - match u32 0 0 \ - action mirred egress redirect dev ifb0 -\end{Verbatim} -The same can be done for other interfaces, just replacing \iface{eth0} in the two -commands above. One thing to keep in mind here is the asymmetrical routing this -creates within the host doing the QoS: Incoming packets enter the system via -\iface{ifb0}, while corresponding replies leave directly via \iface{eth0}. This can be observed -using \cmd{tcpdump} on \iface{ifb0}, which shows the input part of the traffic only. What's -more confusing is that \cmd{tcpdump} on \iface{eth0} shows both incoming and outgoing traffic, -but the redirection is still effective - a simple proof is setting -\iface{ifb0} down, -which will interrupt the communication. Obviously \cmd{tcpdump} catches the packets to -dump before they enter the ingress qdisc, which is why it sees them while the -kernel itself doesn't. - - -\section*{Conclusion} - -Once the steep learning curve has been mastered, the conglomerate of (classful) -qdiscs, filters and actions provides a highly sophisticated and flexible -infrastructure to perform QoS, which plays nicely along with routing and -firewalling setups. - - -\section*{Further Reading} - -A good starting point for novice users and experienced ones diving into unknown -areas is the extensive HOWTO at \url{http://lartc.org}. The iproute2 package ships -some examples (usually in /usr/share/doc/, depending on distribution) as well as -man pages for \cmd{tc} in general, qdiscs and filters. The latter have been added -just recently though, so if your distribution does not ship iproute2 version -4.3.0 yet, these are not in there. 
Apart from that, the internet is a spring of -HOWTOs and scripts people wrote - though these should be taken with a grain of -salt: The complexity of the matter often leads to copying others' solutions -without much validation, which allows for less optimal or even obsolete -implementations to survive much longer than desired. - -\end{document} diff --git a/include/color.h b/include/color.h index 1cd6f7d2..7fd685d0 100644 --- a/include/color.h +++ b/include/color.h @@ -2,14 +2,13 @@ #define __COLOR_H__ 1 enum color_attr { - COLOR_NONE, COLOR_IFNAME, COLOR_MAC, COLOR_INET, COLOR_INET6, COLOR_OPERSTATE_UP, COLOR_OPERSTATE_DOWN, - COLOR_CLEAR + COLOR_NONE }; void enable_color(void); diff --git a/include/json_print.h b/include/json_print.h index b6ce1f9f..dc4d2bb3 100644 --- a/include/json_print.h +++ b/include/json_print.h @@ -53,7 +53,7 @@ void close_json_array(enum output_type type, const char *delim); const char *fmt, \ type value) \ { \ - print_color_##type_name(t, -1, key, fmt, value); \ + print_color_##type_name(t, COLOR_NONE, key, fmt, value); \ } _PRINT_FUNC(int, int); _PRINT_FUNC(bool, bool); diff --git a/include/linux/atm.h b/include/uapi/linux/atm.h index 08e27beb..08e27beb 100644 --- a/include/linux/atm.h +++ b/include/uapi/linux/atm.h diff --git a/include/linux/atmapi.h b/include/uapi/linux/atmapi.h index 8fe54d90..8fe54d90 100644 --- a/include/linux/atmapi.h +++ b/include/uapi/linux/atmapi.h diff --git a/include/linux/atmarp.h b/include/uapi/linux/atmarp.h index 231f4bde..231f4bde 100644 --- a/include/linux/atmarp.h +++ b/include/uapi/linux/atmarp.h diff --git a/include/linux/atmdev.h b/include/uapi/linux/atmdev.h index 8faa8b94..8faa8b94 100644 --- a/include/linux/atmdev.h +++ b/include/uapi/linux/atmdev.h diff --git a/include/linux/atmioc.h b/include/uapi/linux/atmioc.h index 37f67aa8..37f67aa8 100644 --- a/include/linux/atmioc.h +++ b/include/uapi/linux/atmioc.h diff --git a/include/linux/atmsap.h b/include/uapi/linux/atmsap.h index 799b1045..799b1045 
100644 --- a/include/linux/atmsap.h +++ b/include/uapi/linux/atmsap.h diff --git a/include/linux/bpf.h b/include/uapi/linux/bpf.h index 0895a529..0895a529 100644 --- a/include/linux/bpf.h +++ b/include/uapi/linux/bpf.h diff --git a/include/linux/bpf_common.h b/include/uapi/linux/bpf_common.h index afe7433b..afe7433b 100644 --- a/include/linux/bpf_common.h +++ b/include/uapi/linux/bpf_common.h diff --git a/include/linux/can.h b/include/uapi/linux/can.h index f7a810de..f7a810de 100644 --- a/include/linux/can.h +++ b/include/uapi/linux/can.h diff --git a/include/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index b9214bd7..b9214bd7 100644 --- a/include/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h diff --git a/include/linux/can/vxcan.h b/include/uapi/linux/can/vxcan.h index 5b29e8a7..5b29e8a7 100644 --- a/include/linux/can/vxcan.h +++ b/include/uapi/linux/can/vxcan.h diff --git a/include/linux/devlink.h b/include/uapi/linux/devlink.h index a62695e2..a62695e2 100644 --- a/include/linux/devlink.h +++ b/include/uapi/linux/devlink.h diff --git a/include/linux/elf-em.h b/include/uapi/linux/elf-em.h index 9cd1de95..9cd1de95 100644 --- a/include/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h diff --git a/include/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index bbf02a63..bbf02a63 100644 --- a/include/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h diff --git a/include/linux/filter.h b/include/uapi/linux/filter.h index e4f2f74c..e4f2f74c 100644 --- a/include/linux/filter.h +++ b/include/uapi/linux/filter.h diff --git a/include/linux/fou.h b/include/uapi/linux/fou.h index 744c3238..744c3238 100644 --- a/include/linux/fou.h +++ b/include/uapi/linux/fou.h diff --git a/include/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 52deccc2..52deccc2 100644 --- a/include/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h diff --git a/include/linux/genetlink.h b/include/uapi/linux/genetlink.h index 08239d8e..08239d8e 100644 --- 
a/include/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h diff --git a/include/linux/hdlc/ioctl.h b/include/uapi/linux/hdlc/ioctl.h index 04bc0274..04bc0274 100644 --- a/include/linux/hdlc/ioctl.h +++ b/include/uapi/linux/hdlc/ioctl.h diff --git a/include/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index a2e839ee..a2e839ee 100644 --- a/include/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h diff --git a/include/linux/if.h b/include/uapi/linux/if.h index b4ba0207..b4ba0207 100644 --- a/include/linux/if.h +++ b/include/uapi/linux/if.h diff --git a/include/linux/if_addr.h b/include/uapi/linux/if_addr.h index 26f0ecff..26f0ecff 100644 --- a/include/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h diff --git a/include/linux/if_addrlabel.h b/include/uapi/linux/if_addrlabel.h index 54580c29..54580c29 100644 --- a/include/linux/if_addrlabel.h +++ b/include/uapi/linux/if_addrlabel.h diff --git a/include/linux/if_alg.h b/include/uapi/linux/if_alg.h index f2acd2fd..f2acd2fd 100644 --- a/include/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h diff --git a/include/linux/if_arp.h b/include/uapi/linux/if_arp.h index 199f253b..199f253b 100644 --- a/include/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h diff --git a/include/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index 9635a62f..9635a62f 100644 --- a/include/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h diff --git a/include/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 156f4434..156f4434 100644 --- a/include/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h diff --git a/include/linux/if_ether.h b/include/uapi/linux/if_ether.h index 7dde037a..7dde037a 100644 --- a/include/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h diff --git a/include/linux/if_link.h b/include/uapi/linux/if_link.h index 1f97d056..1f97d056 100644 --- a/include/linux/if_link.h +++ b/include/uapi/linux/if_link.h diff --git a/include/linux/if_macsec.h b/include/uapi/linux/if_macsec.h index 
22939a3e..22939a3e 100644 --- a/include/linux/if_macsec.h +++ b/include/uapi/linux/if_macsec.h diff --git a/include/linux/if_packet.h b/include/uapi/linux/if_packet.h index 4df96a7d..4df96a7d 100644 --- a/include/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h diff --git a/include/linux/if_tun.h b/include/uapi/linux/if_tun.h index d5ecb425..d5ecb425 100644 --- a/include/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h diff --git a/include/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 21834cac..21834cac 100644 --- a/include/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h diff --git a/include/linux/if_vlan.h b/include/uapi/linux/if_vlan.h index 24ae0071..24ae0071 100644 --- a/include/linux/if_vlan.h +++ b/include/uapi/linux/if_vlan.h diff --git a/include/linux/ife.h b/include/uapi/linux/ife.h index 2954da32..2954da32 100644 --- a/include/linux/ife.h +++ b/include/uapi/linux/ife.h diff --git a/include/linux/ila.h b/include/uapi/linux/ila.h index 7e328d72..7e328d72 100644 --- a/include/linux/ila.h +++ b/include/uapi/linux/ila.h diff --git a/include/linux/in.h b/include/uapi/linux/in.h index 9439efaa..9439efaa 100644 --- a/include/linux/in.h +++ b/include/uapi/linux/in.h diff --git a/include/linux/in6.h b/include/uapi/linux/in6.h index 6f3bdee7..6f3bdee7 100644 --- a/include/linux/in6.h +++ b/include/uapi/linux/in6.h diff --git a/include/linux/in_route.h b/include/uapi/linux/in_route.h index b261b8c9..b261b8c9 100644 --- a/include/linux/in_route.h +++ b/include/uapi/linux/in_route.h diff --git a/include/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bada4d7b..bada4d7b 100644 --- a/include/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h diff --git a/include/linux/ip.h b/include/uapi/linux/ip.h index 1907284c..1907284c 100644 --- a/include/linux/ip.h +++ b/include/uapi/linux/ip.h diff --git a/include/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 425926c4..425926c4 100644 --- a/include/linux/ip6_tunnel.h +++ 
b/include/uapi/linux/ip6_tunnel.h diff --git a/include/linux/ipsec.h b/include/uapi/linux/ipsec.h index d17a6302..d17a6302 100644 --- a/include/linux/ipsec.h +++ b/include/uapi/linux/ipsec.h diff --git a/include/linux/kernel.h b/include/uapi/linux/kernel.h index 527549f5..527549f5 100644 --- a/include/linux/kernel.h +++ b/include/uapi/linux/kernel.h diff --git a/include/linux/l2tp.h b/include/uapi/linux/l2tp.h index 8a80007b..8a80007b 100644 --- a/include/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h diff --git a/include/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index f38571da..f38571da 100644 --- a/include/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h diff --git a/include/linux/limits.h b/include/uapi/linux/limits.h index 2d0f9416..2d0f9416 100644 --- a/include/linux/limits.h +++ b/include/uapi/linux/limits.h diff --git a/include/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 32984262..32984262 100644 --- a/include/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h diff --git a/include/linux/magic.h b/include/uapi/linux/magic.h index e439565d..e439565d 100644 --- a/include/linux/magic.h +++ b/include/uapi/linux/magic.h diff --git a/include/linux/mpls.h b/include/uapi/linux/mpls.h index bf5b6259..bf5b6259 100644 --- a/include/linux/mpls.h +++ b/include/uapi/linux/mpls.h diff --git a/include/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h index 1a0e57b4..1a0e57b4 100644 --- a/include/linux/mpls_iptunnel.h +++ b/include/uapi/linux/mpls_iptunnel.h diff --git a/include/linux/neighbour.h b/include/uapi/linux/neighbour.h index 3199d289..3199d289 100644 --- a/include/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h diff --git a/include/linux/net_namespace.h b/include/uapi/linux/net_namespace.h index 9a92b7e1..9a92b7e1 100644 --- a/include/linux/net_namespace.h +++ b/include/uapi/linux/net_namespace.h diff --git a/include/linux/netconf.h b/include/uapi/linux/netconf.h index 4afbd7db..4afbd7db 100644 --- 
a/include/linux/netconf.h +++ b/include/uapi/linux/netconf.h diff --git a/include/linux/netdevice.h b/include/uapi/linux/netdevice.h index 66fceb44..66fceb44 100644 --- a/include/linux/netdevice.h +++ b/include/uapi/linux/netdevice.h diff --git a/include/linux/netfilter.h b/include/uapi/linux/netfilter.h index ff4a4a52..ff4a4a52 100644 --- a/include/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index a6c96b00..a6c96b00 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h diff --git a/include/linux/netfilter/x_tables.h b/include/uapi/linux/netfilter/x_tables.h index 41209700..41209700 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/uapi/linux/netfilter/x_tables.h diff --git a/include/linux/netfilter/xt_set.h b/include/uapi/linux/netfilter/xt_set.h index d4e02348..d4e02348 100644 --- a/include/linux/netfilter/xt_set.h +++ b/include/uapi/linux/netfilter/xt_set.h diff --git a/include/linux/netfilter/xt_tcpudp.h b/include/uapi/linux/netfilter/xt_tcpudp.h index 38aa7b39..38aa7b39 100644 --- a/include/linux/netfilter/xt_tcpudp.h +++ b/include/uapi/linux/netfilter/xt_tcpudp.h diff --git a/include/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index a5f4dc78..a5f4dc78 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index 456fb863..456fb863 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h diff --git a/include/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h index 8483d1d4..8483d1d4 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/uapi/linux/netfilter_ipv6.h diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index 
fcc8ccaf..fcc8ccaf 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h diff --git a/include/linux/netlink.h b/include/uapi/linux/netlink.h index ec0690b5..ec0690b5 100644 --- a/include/linux/netlink.h +++ b/include/uapi/linux/netlink.h diff --git a/include/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index c8c8c7d2..c8c8c7d2 100644 --- a/include/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h diff --git a/include/linux/packet_diag.h b/include/uapi/linux/packet_diag.h index 0c5d5dd6..0c5d5dd6 100644 --- a/include/linux/packet_diag.h +++ b/include/uapi/linux/packet_diag.h diff --git a/include/linux/param.h b/include/uapi/linux/param.h index 092e92f6..092e92f6 100644 --- a/include/linux/param.h +++ b/include/uapi/linux/param.h diff --git a/include/linux/pfkeyv2.h b/include/uapi/linux/pfkeyv2.h index ada7f017..ada7f017 100644 --- a/include/linux/pfkeyv2.h +++ b/include/uapi/linux/pfkeyv2.h diff --git a/include/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index d5e2bf68..d5e2bf68 100644 --- a/include/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h diff --git a/include/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 099bf552..099bf552 100644 --- a/include/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h diff --git a/include/linux/posix_types.h b/include/uapi/linux/posix_types.h index 988f76e6..988f76e6 100644 --- a/include/linux/posix_types.h +++ b/include/uapi/linux/posix_types.h diff --git a/include/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 813e9e07..813e9e07 100644 --- a/include/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h diff --git a/include/linux/sctp.h b/include/uapi/linux/sctp.h index fec24c41..fec24c41 100644 --- a/include/linux/sctp.h +++ b/include/uapi/linux/sctp.h diff --git a/include/linux/seg6.h b/include/uapi/linux/seg6.h index 07152792..07152792 100644 --- a/include/linux/seg6.h +++ b/include/uapi/linux/seg6.h diff --git 
a/include/linux/seg6_genl.h b/include/uapi/linux/seg6_genl.h index 99382f94..99382f94 100644 --- a/include/linux/seg6_genl.h +++ b/include/uapi/linux/seg6_genl.h diff --git a/include/linux/seg6_hmac.h b/include/uapi/linux/seg6_hmac.h index 704f93e8..704f93e8 100644 --- a/include/linux/seg6_hmac.h +++ b/include/uapi/linux/seg6_hmac.h diff --git a/include/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index a5dc05a1..a5dc05a1 100644 --- a/include/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h diff --git a/include/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index 76b90d60..76b90d60 100644 --- a/include/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h diff --git a/include/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index 901231e6..901231e6 100644 --- a/include/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h diff --git a/include/linux/socket.h b/include/uapi/linux/socket.h index 8c1e5017..8c1e5017 100644 --- a/include/linux/socket.h +++ b/include/uapi/linux/socket.h diff --git a/include/linux/sockios.h b/include/uapi/linux/sockios.h index 79d029d2..79d029d2 100644 --- a/include/linux/sockios.h +++ b/include/uapi/linux/sockios.h diff --git a/include/linux/stddef.h b/include/uapi/linux/stddef.h index 4bb69dec..4bb69dec 100644 --- a/include/linux/stddef.h +++ b/include/uapi/linux/stddef.h diff --git a/include/linux/sysinfo.h b/include/uapi/linux/sysinfo.h index 934335a2..934335a2 100644 --- a/include/linux/sysinfo.h +++ b/include/uapi/linux/sysinfo.h diff --git a/include/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 8dc2ac05..8dc2ac05 100644 --- a/include/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h diff --git a/include/linux/tc_act/tc_connmark.h b/include/uapi/linux/tc_act/tc_connmark.h index 62a5e944..62a5e944 100644 --- a/include/linux/tc_act/tc_connmark.h +++ b/include/uapi/linux/tc_act/tc_connmark.h diff --git a/include/linux/tc_act/tc_csum.h 
b/include/uapi/linux/tc_act/tc_csum.h index a11bb355..a11bb355 100644 --- a/include/linux/tc_act/tc_csum.h +++ b/include/uapi/linux/tc_act/tc_csum.h diff --git a/include/linux/tc_act/tc_defact.h b/include/uapi/linux/tc_act/tc_defact.h index d2a3abb7..d2a3abb7 100644 --- a/include/linux/tc_act/tc_defact.h +++ b/include/uapi/linux/tc_act/tc_defact.h diff --git a/include/linux/tc_act/tc_gact.h b/include/uapi/linux/tc_act/tc_gact.h index 70b536a8..70b536a8 100644 --- a/include/linux/tc_act/tc_gact.h +++ b/include/uapi/linux/tc_act/tc_gact.h diff --git a/include/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h index 7c281786..7c281786 100644 --- a/include/linux/tc_act/tc_ife.h +++ b/include/uapi/linux/tc_act/tc_ife.h diff --git a/include/linux/tc_act/tc_ipt.h b/include/uapi/linux/tc_act/tc_ipt.h index 7c6e155d..7c6e155d 100644 --- a/include/linux/tc_act/tc_ipt.h +++ b/include/uapi/linux/tc_act/tc_ipt.h diff --git a/include/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h index 3d7a2b35..3d7a2b35 100644 --- a/include/linux/tc_act/tc_mirred.h +++ b/include/uapi/linux/tc_act/tc_mirred.h diff --git a/include/linux/tc_act/tc_nat.h b/include/uapi/linux/tc_act/tc_nat.h index 923457c9..923457c9 100644 --- a/include/linux/tc_act/tc_nat.h +++ b/include/uapi/linux/tc_act/tc_nat.h diff --git a/include/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h index 143d2b31..143d2b31 100644 --- a/include/linux/tc_act/tc_pedit.h +++ b/include/uapi/linux/tc_act/tc_pedit.h diff --git a/include/linux/tc_act/tc_sample.h b/include/uapi/linux/tc_act/tc_sample.h index edc9058b..edc9058b 100644 --- a/include/linux/tc_act/tc_sample.h +++ b/include/uapi/linux/tc_act/tc_sample.h diff --git a/include/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index 28844257..28844257 100644 --- a/include/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h diff --git a/include/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h 
index 10fc07da..10fc07da 100644 --- a/include/linux/tc_act/tc_skbmod.h +++ b/include/uapi/linux/tc_act/tc_skbmod.h diff --git a/include/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index afcd4be9..afcd4be9 100644 --- a/include/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h diff --git a/include/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index bddb272b..bddb272b 100644 --- a/include/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h diff --git a/include/linux/tc_ematch/tc_em_cmp.h b/include/uapi/linux/tc_ematch/tc_em_cmp.h index f34bb1ba..f34bb1ba 100644 --- a/include/linux/tc_ematch/tc_em_cmp.h +++ b/include/uapi/linux/tc_ematch/tc_em_cmp.h diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/uapi/linux/tc_ematch/tc_em_meta.h index b11f8ce2..b11f8ce2 100644 --- a/include/linux/tc_ematch/tc_em_meta.h +++ b/include/uapi/linux/tc_ematch/tc_em_meta.h diff --git a/include/linux/tc_ematch/tc_em_nbyte.h b/include/uapi/linux/tc_ematch/tc_em_nbyte.h index 7172cfb9..7172cfb9 100644 --- a/include/linux/tc_ematch/tc_em_nbyte.h +++ b/include/uapi/linux/tc_ematch/tc_em_nbyte.h diff --git a/include/linux/tcp.h b/include/uapi/linux/tcp.h index 8edad3f9..8edad3f9 100644 --- a/include/linux/tcp.h +++ b/include/uapi/linux/tcp.h diff --git a/include/linux/tcp_metrics.h b/include/uapi/linux/tcp_metrics.h index 80ad90d0..80ad90d0 100644 --- a/include/linux/tcp_metrics.h +++ b/include/uapi/linux/tcp_metrics.h diff --git a/include/linux/tipc.h b/include/uapi/linux/tipc.h index 924fb5cf..924fb5cf 100644 --- a/include/linux/tipc.h +++ b/include/uapi/linux/tipc.h diff --git a/include/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index f9edd20f..f9edd20f 100644 --- a/include/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h diff --git a/include/linux/types.h b/include/uapi/linux/types.h index c640657a..c640657a 100644 --- a/include/linux/types.h +++ 
b/include/uapi/linux/types.h diff --git a/include/linux/unix_diag.h b/include/uapi/linux/unix_diag.h index 1eb0b8dd..1eb0b8dd 100644 --- a/include/linux/unix_diag.h +++ b/include/uapi/linux/unix_diag.h diff --git a/include/linux/veth.h b/include/uapi/linux/veth.h index 3354c1eb..3354c1eb 100644 --- a/include/linux/veth.h +++ b/include/uapi/linux/veth.h diff --git a/include/linux/xfrm.h b/include/uapi/linux/xfrm.h index 5790293b..5790293b 100644 --- a/include/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h diff --git a/include/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 861440a8..13875a3f 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -1,5 +1,5 @@ -#ifndef _UAPI_RDMA_NETLINK_H -#define _UAPI_RDMA_NETLINK_H +#ifndef _RDMA_NETLINK_H +#define _RDMA_NETLINK_H #include <linux/types.h> @@ -304,4 +304,4 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_MAX }; -#endif /* _UAPI_RDMA_NETLINK_H */ +#endif /* _RDMA_NETLINK_H */ diff --git a/include/utils.h b/include/utils.h index c9ed230b..3d91c50d 100644 --- a/include/utils.h +++ b/include/utils.h @@ -133,6 +133,8 @@ void missarg(const char *) __attribute__((noreturn)); void invarg(const char *, const char *) __attribute__((noreturn)); void duparg(const char *, const char *) __attribute__((noreturn)); void duparg2(const char *, const char *) __attribute__((noreturn)); +int check_ifname(const char *); +int get_ifname(char *, const char *); int matches(const char *arg, const char *pattern); int inet_addr_match(const inet_prefix *a, const inet_prefix *b, int bits); @@ -193,6 +195,8 @@ static inline void __jiffies_to_tv(struct timeval *tv, unsigned long jiffies) tv->tv_usec = tvusec - 1000000 * tv->tv_sec; } +void print_escape_buf(const __u8 *buf, size_t len, const char *escape); + int print_timestamp(FILE *fp); void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n); diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index b4a7def1..bc44bef7 100644 --- a/ip/ip6tunnel.c +++ 
b/ip/ip6tunnel.c @@ -136,7 +136,7 @@ static void print_tunnel(struct ip6_tnl_parm2 *p) static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) { int count = 0; - char medium[IFNAMSIZ] = {}; + const char *medium = NULL; while (argc > 0) { if (strcmp(*argv, "mode") == 0) { @@ -180,7 +180,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) memcpy(&p->laddr, &laddr.data, sizeof(p->laddr)); } else if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); - strncpy(medium, *argv, IFNAMSIZ - 1); + medium = *argv; } else if (strcmp(*argv, "encaplimit") == 0) { NEXT_ARG(); if (strcmp(*argv, "none") == 0) { @@ -273,7 +273,8 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) usage(); if (p->name[0]) duparg2("name", *argv); - strncpy(p->name, *argv, IFNAMSIZ - 1); + if (get_ifname(p->name, *argv)) + invarg("\"name\" not a valid ifname", *argv); if (cmd == SIOCCHGTUNNEL && count == 0) { struct ip6_tnl_parm2 old_p = {}; @@ -285,7 +286,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) count++; argc--; argv++; } - if (medium[0]) { + if (medium) { p->link = ll_name_to_index(medium); if (p->link == 0) { fprintf(stderr, "Cannot find device \"%s\"\n", medium); diff --git a/ip/ipl2tp.c b/ip/ipl2tp.c index 88664c90..1e37b175 100644 --- a/ip/ipl2tp.c +++ b/ip/ipl2tp.c @@ -182,7 +182,7 @@ static int create_session(struct l2tp_parm *p) if (p->peer_cookie_len) addattr_l(&req.n, 1024, L2TP_ATTR_PEER_COOKIE, p->peer_cookie, p->peer_cookie_len); - if (p->ifname && p->ifname[0]) + if (p->ifname) addattrstrz(&req.n, 1024, L2TP_ATTR_IFNAME, p->ifname); if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0) @@ -545,6 +545,8 @@ static int parse_args(int argc, char **argv, int cmd, struct l2tp_parm *p) } } else if (strcmp(*argv, "name") == 0) { NEXT_ARG(); + if (check_ifname(*argv)) + invarg("\"name\" not a valid ifname", *argv); p->ifname = *argv; } else if (strcmp(*argv, "remote") == 0) { NEXT_ARG(); 
diff --git a/ip/iplink.c b/ip/iplink.c index ff5b56c0..6a96ea9f 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -573,6 +573,8 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, req->i.ifi_flags &= ~IFF_UP; } else if (strcmp(*argv, "name") == 0) { NEXT_ARG(); + if (check_ifname(*argv)) + invarg("\"name\" not a valid ifname", *argv); *name = *argv; } else if (strcmp(*argv, "index") == 0) { NEXT_ARG(); @@ -848,6 +850,8 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, NEXT_ARG(); if (*dev) duparg2("dev", *argv); + if (check_ifname(*argv)) + invarg("\"dev\" not a valid ifname", *argv); *dev = *argv; dev_index = ll_name_to_index(*dev); } @@ -870,7 +874,6 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv) { - int len; char *dev = NULL; char *name = NULL; char *link = NULL; @@ -960,13 +963,8 @@ static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv) } if (name) { - len = strlen(name) + 1; - if (len == 1) - invarg("\"\" is not a valid device identifier\n", - "name"); - if (len > IFNAMSIZ) - invarg("\"name\" too long\n", name); - addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, len); + addattr_l(&req.n, sizeof(req), + IFLA_IFNAME, name, strlen(name) + 1); } if (type) { @@ -1016,7 +1014,6 @@ static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv) int iplink_get(unsigned int flags, char *name, __u32 filt_mask) { - int len; struct iplink_req req = { .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), .n.nlmsg_flags = NLM_F_REQUEST | flags, @@ -1029,13 +1026,8 @@ int iplink_get(unsigned int flags, char *name, __u32 filt_mask) } answer; if (name) { - len = strlen(name) + 1; - if (len == 1) - invarg("\"\" is not a valid device identifier\n", - "name"); - if (len > IFNAMSIZ) - invarg("\"name\" too long\n", name); - addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, len); + addattr_l(&req.n, sizeof(req), + 
IFLA_IFNAME, name, strlen(name) + 1); } addattr32(&req.n, sizeof(req), IFLA_EXT_MASK, filt_mask); @@ -1265,6 +1257,8 @@ static int do_set(int argc, char **argv) flags &= ~IFF_UP; } else if (strcmp(*argv, "name") == 0) { NEXT_ARG(); + if (check_ifname(*argv)) + invarg("\"name\" not a valid ifname", *argv); newname = *argv; } else if (matches(*argv, "address") == 0) { NEXT_ARG(); @@ -1355,6 +1349,8 @@ static int do_set(int argc, char **argv) if (dev) duparg2("dev", *argv); + if (check_ifname(*argv)) + invarg("\"dev\" not a valid ifname", *argv); dev = *argv; } argc--; argv++; @@ -1383,9 +1379,6 @@ static int do_set(int argc, char **argv) } if (newname && strcmp(dev, newname)) { - if (strlen(newname) == 0) - invarg("\"\" is not a valid device identifier\n", - "name"); if (do_changename(dev, newname) < 0) return -1; dev = newname; diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c index 85a69e77..5683f6fa 100644 --- a/ip/ipmaddr.c +++ b/ip/ipmaddr.c @@ -284,7 +284,8 @@ static int multiaddr_modify(int cmd, int argc, char **argv) NEXT_ARG(); if (ifr.ifr_name[0]) duparg("dev", *argv); - strncpy(ifr.ifr_name, *argv, IFNAMSIZ); + if (get_ifname(ifr.ifr_name, *argv)) + invarg("\"dev\" not a valid ifname", *argv); } else { if (matches(*argv, "address") == 0) { NEXT_ARG(); diff --git a/ip/iproute.c b/ip/iproute.c index a8733f45..e81bc05e 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -574,10 +574,10 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) for (i = 2; i <= RTAX_MAX; i++) { __u32 val = 0U; - if (mxrta[i] == NULL) + if (mxrta[i] == NULL && !(mxlock & (1 << i))) continue; - if (i != RTAX_CC_ALGO) + if (mxrta[i] != NULL && i != RTAX_CC_ALGO) val = rta_getattr_u32(mxrta[i]); if (i == RTAX_HOPLIMIT && (int)val == -1) diff --git a/ip/iprule.c b/ip/iprule.c index 8313138d..36c57fa7 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -472,11 +472,13 @@ static int iprule_list_flush_or_save(int argc, char **argv, int action) } else if (strcmp(*argv, "dev") == 0 
|| strcmp(*argv, "iif") == 0) { NEXT_ARG(); - strncpy(filter.iif, *argv, IFNAMSIZ); + if (get_ifname(filter.iif, *argv)) + invarg("\"iif\"/\"dev\" not a valid ifname", *argv); filter.iifmask = 1; } else if (strcmp(*argv, "oif") == 0) { NEXT_ARG(); - strncpy(filter.oif, *argv, IFNAMSIZ); + if (get_ifname(filter.oif, *argv)) + invarg("\"oif\" not a valid ifname", *argv); filter.oifmask = 1; } else if (strcmp(*argv, "l3mdev") == 0) { filter.l3mdev = 1; @@ -695,10 +697,14 @@ static int iprule_modify(int cmd, int argc, char **argv) } else if (strcmp(*argv, "dev") == 0 || strcmp(*argv, "iif") == 0) { NEXT_ARG(); + if (check_ifname(*argv)) + invarg("\"iif\"/\"dev\" not a valid ifname", *argv); addattr_l(&req.n, sizeof(req), FRA_IFNAME, *argv, strlen(*argv)+1); } else if (strcmp(*argv, "oif") == 0) { NEXT_ARG(); + if (check_ifname(*argv)) + invarg("\"oif\" not a valid ifname", *argv); addattr_l(&req.n, sizeof(req), FRA_OIFNAME, *argv, strlen(*argv)+1); } else if (strcmp(*argv, "l3mdev") == 0) { diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 105d0f55..208a1f06 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -60,7 +60,7 @@ static void set_tunnel_proto(struct ip_tunnel_parm *p, int proto) static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) { int count = 0; - char medium[IFNAMSIZ] = {}; + const char *medium = NULL; int isatap = 0; memset(p, 0, sizeof(*p)); @@ -139,7 +139,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) p->iph.saddr = htonl(INADDR_ANY); } else if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); - strncpy(medium, *argv, IFNAMSIZ - 1); + medium = *argv; } else if (strcmp(*argv, "ttl") == 0 || strcmp(*argv, "hoplimit") == 0 || strcmp(*argv, "hlim") == 0) { @@ -178,7 +178,8 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) if (p->name[0]) duparg2("name", *argv); - strncpy(p->name, *argv, IFNAMSIZ - 1); + if (get_ifname(p->name, *argv)) + invarg("\"name\" not a valid 
ifname", *argv); if (cmd == SIOCCHGTUNNEL && count == 0) { struct ip_tunnel_parm old_p = {}; @@ -216,7 +217,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) } } - if (medium[0]) { + if (medium) { p->link = ll_name_to_index(medium); if (p->link == 0) { fprintf(stderr, "Cannot find device \"%s\"\n", medium); @@ -465,9 +466,8 @@ static int do_prl(int argc, char **argv) { struct ip_tunnel_prl p = {}; int count = 0; - int devname = 0; int cmd = 0; - char medium[IFNAMSIZ] = {}; + const char *medium = NULL; while (argc > 0) { if (strcmp(*argv, "prl-default") == 0) { @@ -488,8 +488,9 @@ static int do_prl(int argc, char **argv) count++; } else if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); - strncpy(medium, *argv, IFNAMSIZ-1); - devname++; + if (check_ifname(*argv)) + invarg("\"dev\" not a valid ifname", *argv); + medium = *argv; } else { fprintf(stderr, "Invalid PRL parameter \"%s\"\n", *argv); @@ -502,7 +503,7 @@ static int do_prl(int argc, char **argv) } argc--; argv++; } - if (devname == 0) { + if (!medium) { fprintf(stderr, "Must specify device\n"); exit(-1); } @@ -513,9 +514,8 @@ static int do_prl(int argc, char **argv) static int do_6rd(int argc, char **argv) { struct ip_tunnel_6rd ip6rd = {}; - int devname = 0; int cmd = 0; - char medium[IFNAMSIZ] = {}; + const char *medium = NULL; inet_prefix prefix; while (argc > 0) { @@ -537,8 +537,9 @@ static int do_6rd(int argc, char **argv) cmd = SIOCDEL6RD; } else if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); - strncpy(medium, *argv, IFNAMSIZ-1); - devname++; + if (check_ifname(*argv)) + invarg("\"dev\" not a valid ifname", *argv); + medium = *argv; } else { fprintf(stderr, "Invalid 6RD parameter \"%s\"\n", *argv); @@ -546,7 +547,7 @@ static int do_6rd(int argc, char **argv) } argc--; argv++; } - if (devname == 0) { + if (!medium) { fprintf(stderr, "Must specify device\n"); exit(-1); } diff --git a/ip/iptuntap.c b/ip/iptuntap.c index 451f7f0e..b46e452f 100644 --- a/ip/iptuntap.c +++ 
b/ip/iptuntap.c @@ -176,7 +176,8 @@ static int parse_args(int argc, char **argv, ifr->ifr_flags |= IFF_MULTI_QUEUE; } else if (matches(*argv, "dev") == 0) { NEXT_ARG(); - strncpy(ifr->ifr_name, *argv, IFNAMSIZ-1); + if (get_ifname(ifr->ifr_name, *argv)) + invarg("\"dev\" not a valid ifname", *argv); } else { if (matches(*argv, "name") == 0) { NEXT_ARG(); @@ -184,7 +185,8 @@ static int parse_args(int argc, char **argv, usage(); if (ifr->ifr_name[0]) duparg2("name", *argv); - strncpy(ifr->ifr_name, *argv, IFNAMSIZ); + if (get_ifname(ifr->ifr_name, *argv)) + invarg("\"name\" not a valid ifname", *argv); } count++; argc--; argv++; diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c index 4483fb8f..99fdec23 100644 --- a/ip/xfrm_state.c +++ b/ip/xfrm_state.c @@ -539,7 +539,7 @@ static int xfrm_state_modify(int cmd, unsigned int flags, int argc, char **argv) xfrm_algo_parse((void *)&alg, type, name, key, buf, sizeof(alg.buf)); - len += alg.u.alg.alg_key_len; + len += alg.u.alg.alg_key_len / 8; addattr_l(&req.n, sizeof(req.buf), type, (void *)&alg, len); diff --git a/lib/color.c b/lib/color.c index 79d5e289..8d049a01 100644 --- a/lib/color.c +++ b/lib/color.c @@ -45,8 +45,8 @@ static const char * const color_codes[] = { NULL, }; -static enum color attr_colors[] = { - /* light background */ +/* light background */ +static enum color attr_colors_light[] = { C_CYAN, C_YELLOW, C_MAGENTA, @@ -54,8 +54,10 @@ static enum color attr_colors[] = { C_GREEN, C_RED, C_CLEAR, +}; - /* dark background */ +/* dark background */ +static enum color attr_colors_dark[] = { C_BOLD_CYAN, C_BOLD_YELLOW, C_BOLD_MAGENTA, @@ -109,8 +111,9 @@ int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...) goto end; } - ret += fprintf(fp, "%s", - color_codes[attr_colors[is_dark_bg ? attr + 8 : attr]]); + ret += fprintf(fp, "%s", color_codes[is_dark_bg ? 
+ attr_colors_dark[attr] : attr_colors_light[attr]]); + ret += vfprintf(fp, fmt, args); ret += fprintf(fp, "%s", color_codes[C_CLEAR]); @@ -127,7 +130,7 @@ enum color_attr ifa_family_color(__u8 ifa_family) case AF_INET6: return COLOR_INET6; default: - return COLOR_CLEAR; + return COLOR_NONE; } } @@ -139,6 +142,6 @@ enum color_attr oper_state_color(__u8 state) case IF_OPER_DOWN: return COLOR_OPERSTATE_DOWN; default: - return COLOR_CLEAR; + return COLOR_NONE; } } diff --git a/lib/utils.c b/lib/utils.c index bbd3cbc4..ac155bf5 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -20,6 +20,7 @@ #include <sys/socket.h> #include <netinet/in.h> #include <string.h> +#include <ctype.h> #include <netdb.h> #include <arpa/inet.h> #include <asm/types.h> @@ -30,6 +31,7 @@ #include <time.h> #include <sys/time.h> #include <errno.h> +#include <ctype.h> #include "rt_names.h" #include "utils.h" @@ -699,6 +701,34 @@ void duparg2(const char *key, const char *arg) exit(-1); } +int check_ifname(const char *name) +{ + /* These checks mimic kernel checks in dev_valid_name */ + if (*name == '\0') + return -1; + if (strlen(name) >= IFNAMSIZ) + return -1; + + while (*name) { + if (*name == '/' || isspace(*name)) + return -1; + ++name; + } + return 0; +} + +/* buf is assumed to be IFNAMSIZ */ +int get_ifname(char *buf, const char *name) +{ + int ret; + + ret = check_ifname(name); + if (ret == 0) + strncpy(buf, name, IFNAMSIZ); + + return ret; +} + int matches(const char *cmd, const char *pattern) { int len = strlen(cmd); @@ -1018,6 +1048,20 @@ int addr64_n2a(__u64 addr, char *buff, size_t len) return written; } +/* Print buffer and escape bytes that are !isprint or among 'escape' */ +void print_escape_buf(const __u8 *buf, size_t len, const char *escape) +{ + size_t i; + + for (i = 0; i < len; ++i) { + if (isprint(buf[i]) && buf[i] != '\\' && + !strchr(escape, buf[i])) + printf("%c", buf[i]); + else + printf("\\%03o", buf[i]); + } +} + int print_timestamp(FILE *fp) { struct timeval tv; @@ -1231,6 
+1275,7 @@ int get_real_family(int rtm_type, int rtm_family) return rtm_family; } +#ifdef NEED_STRLCPY size_t strlcpy(char *dst, const char *src, size_t size) { size_t srclen = strlen(src); @@ -1253,3 +1298,4 @@ size_t strlcat(char *dst, const char *src, size_t size) return dlen + strlcpy(dst + dlen, src, size - dlen); } +#endif diff --git a/misc/arpd.c b/misc/arpd.c index bfab4454..c2666f76 100644 --- a/misc/arpd.c +++ b/misc/arpd.c @@ -664,7 +664,8 @@ int main(int argc, char **argv) struct ifreq ifr = {}; for (i = 0; i < ifnum; i++) { - strncpy(ifr.ifr_name, ifnames[i], IFNAMSIZ); + if (get_ifname(ifr.ifr_name, ifnames[i])) + invarg("not a valid ifname", ifnames[i]); if (ioctl(udp_sock, SIOCGIFINDEX, &ifr)) { perror("ioctl(SIOCGIFINDEX)"); exit(-1); @@ -2153,6 +2153,16 @@ static void print_skmeminfo(struct rtattr *tb[], int attrtype) printf(")"); } +static void print_md5sig(struct tcp_diag_md5sig *sig) +{ + printf("%s/%d=", + format_host(sig->tcpm_family, + sig->tcpm_family == AF_INET6 ? 
16 : 4, + &sig->tcpm_addr), + sig->tcpm_prefixlen); + print_escape_buf(sig->tcpm_key, sig->tcpm_keylen, " ,"); +} + #define TCPI_HAS_OPT(info, opt) !!(info->tcpi_options & (opt)) static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r, @@ -2289,6 +2299,17 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r, free(s.dctcp); free(s.bbr_info); } + if (tb[INET_DIAG_MD5SIG]) { + struct tcp_diag_md5sig *sig = RTA_DATA(tb[INET_DIAG_MD5SIG]); + int len = RTA_PAYLOAD(tb[INET_DIAG_MD5SIG]); + + printf(" md5keys:"); + print_md5sig(sig++); + for (len -= sizeof(*sig); len > 0; len -= sizeof(*sig)) { + printf(","); + print_md5sig(sig++); + } + } } static const char *format_host_sa(struct sockaddr_storage *sa) diff --git a/tc/f_flower.c b/tc/f_flower.c index 934832e2..b1802107 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -629,11 +629,10 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, } else if (matches(*argv, "skip_sw") == 0) { flags |= TCA_CLS_FLAGS_SKIP_SW; } else if (matches(*argv, "indev") == 0) { - char ifname[IFNAMSIZ] = {}; - NEXT_ARG(); - strncpy(ifname, *argv, sizeof(ifname) - 1); - addattrstrz(n, MAX_MSG, TCA_FLOWER_INDEV, ifname); + if (check_ifname(*argv)) + invarg("\"indev\" not a valid ifname", *argv); + addattrstrz(n, MAX_MSG, TCA_FLOWER_INDEV, *argv); } else if (matches(*argv, "vlan_id") == 0) { __u16 vid; @@ -385,8 +385,7 @@ static int parse_ip6_addr(int *argc_p, char ***argv_p, plen = addr.bitlen; for (i = 0; i < plen; i += 32) { - /* if (((i + 31) & ~0x1F) <= plen) { */ - if (i + 31 <= plen) { + if (i + 31 < plen) { res = pack_key(sel, addr.data[i / 32], 0xFFFFFFFF, off + 4 * (i / 32), offmask); if (res < 0) diff --git a/tc/q_netem.c b/tc/q_netem.c index 5a9e7474..cdaddce9 100644 --- a/tc/q_netem.c +++ b/tc/q_netem.c @@ -231,7 +231,7 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv, if (!strcmp(*argv, "random")) { NEXT_ARG(); -random_loss_model: + 
random_loss_model: if (get_percent(&opt.loss, *argv)) { explain1("loss percent"); return -1; @@ -338,7 +338,7 @@ random_loss_model: return -1; } } else if (matches(*argv, "ecn") == 0) { - present[TCA_NETEM_ECN] = 1; + present[TCA_NETEM_ECN] = 1; } else if (matches(*argv, "reorder") == 0) { NEXT_ARG(); present[TCA_NETEM_REORDER] = 1; @@ -469,7 +469,7 @@ random_loss_model: if (present[TCA_NETEM_CORR] && addattr_l(n, 1024, TCA_NETEM_CORR, &cor, sizeof(cor)) < 0) - return -1; + return -1; if (present[TCA_NETEM_REORDER] && addattr_l(n, 1024, TCA_NETEM_REORDER, &reorder, sizeof(reorder)) < 0) @@ -478,7 +478,7 @@ random_loss_model: if (present[TCA_NETEM_ECN] && addattr_l(n, 1024, TCA_NETEM_ECN, &present[TCA_NETEM_ECN], sizeof(present[TCA_NETEM_ECN])) < 0) - return -1; + return -1; if (present[TCA_NETEM_CORRUPT] && addattr_l(n, 1024, TCA_NETEM_CORRUPT, &corrupt, sizeof(corrupt)) < 0) @@ -491,11 +491,11 @@ random_loss_model: if (loss_type == NETEM_LOSS_GI) { if (addattr_l(n, 1024, NETEM_LOSS_GI, &gimodel, sizeof(gimodel)) < 0) - return -1; + return -1; } else if (loss_type == NETEM_LOSS_GE) { if (addattr_l(n, 1024, NETEM_LOSS_GE, &gemodel, sizeof(gemodel)) < 0) - return -1; + return -1; } else { fprintf(stderr, "loss in the weeds!\n"); return -1; diff --git a/testsuite/tests/ip/link/new_link.t b/testsuite/tests/ip/link/new_link.t index 699adbcd..c17650a2 100755 --- a/testsuite/tests/ip/link/new_link.t +++ b/testsuite/tests/ip/link/new_link.t @@ -1,6 +1,6 @@ #!/bin/sh -source lib/generic.sh +. lib/generic.sh ts_log "[Testing add/del virtual links]" diff --git a/testsuite/tests/ip/link/show_dev_wo_vf_rate.t b/testsuite/tests/ip/link/show_dev_wo_vf_rate.t index a600ba65..5b3c004e 100755 --- a/testsuite/tests/ip/link/show_dev_wo_vf_rate.t +++ b/testsuite/tests/ip/link/show_dev_wo_vf_rate.t @@ -1,6 +1,6 @@ #!/bin/sh -source lib/generic.sh +. 
lib/generic.sh NL_FILE="tests/ip/link/dev_wo_vf_rate.nl" ts_ip "$0" "Show VF devices w/o VF rate info" -d monitor file $NL_FILE diff --git a/testsuite/tests/ip/netns/set_nsid.t b/testsuite/tests/ip/netns/set_nsid.t index 606d45ab..8f8c7792 100755 --- a/testsuite/tests/ip/netns/set_nsid.t +++ b/testsuite/tests/ip/netns/set_nsid.t @@ -1,6 +1,6 @@ #!/bin/sh -source lib/generic.sh +. lib/generic.sh ts_log "[Testing netns nsid]" diff --git a/testsuite/tests/ip/netns/set_nsid_batch.t b/testsuite/tests/ip/netns/set_nsid_batch.t index abb3f1bb..196fd4b3 100755 --- a/testsuite/tests/ip/netns/set_nsid_batch.t +++ b/testsuite/tests/ip/netns/set_nsid_batch.t @@ -1,6 +1,6 @@ #!/bin/sh -source lib/generic.sh +. lib/generic.sh ts_log "[Testing netns nsid in batch mode]" diff --git a/testsuite/tests/ip/route/add_default_route.t b/testsuite/tests/ip/route/add_default_route.t index e5ea6473..569ba1f8 100755 --- a/testsuite/tests/ip/route/add_default_route.t +++ b/testsuite/tests/ip/route/add_default_route.t @@ -1,6 +1,6 @@ #!/bin/sh -source lib/generic.sh +. lib/generic.sh ts_log "[Testing add default route]" diff --git a/testsuite/tests/ip/tunnel/add_tunnel.t b/testsuite/tests/ip/tunnel/add_tunnel.t index 18f6e370..3f5a9d3c 100755 --- a/testsuite/tests/ip/tunnel/add_tunnel.t +++ b/testsuite/tests/ip/tunnel/add_tunnel.t @@ -1,6 +1,6 @@ #!/bin/sh -source lib/generic.sh +. lib/generic.sh TUNNEL_NAME="tunnel_test_ip" diff --git a/testsuite/tests/tc/cls-testbed.t b/testsuite/tests/tc/cls-testbed.t index 2afc26fc..d5c21e5c 100755 --- a/testsuite/tests/tc/cls-testbed.t +++ b/testsuite/tests/tc/cls-testbed.t @@ -1,7 +1,7 @@ #!/bin/bash # vim: ft=sh -source lib/generic.sh +. lib/generic.sh QDISCS="cbq htb dsmark" diff --git a/testsuite/tests/tc/dsmark.t b/testsuite/tests/tc/dsmark.t index 6934165e..177585e6 100755 --- a/testsuite/tests/tc/dsmark.t +++ b/testsuite/tests/tc/dsmark.t @@ -1,7 +1,7 @@ #!/bin/bash # vim: ft=sh -source lib/generic.sh +. 
lib/generic.sh ts_qdisc_available "dsmark" if [ $? -eq 0 ]; then diff --git a/testsuite/tests/tc/pedit.t b/testsuite/tests/tc/pedit.t index e9b6c333..8d531a05 100755 --- a/testsuite/tests/tc/pedit.t +++ b/testsuite/tests/tc/pedit.t @@ -1,6 +1,6 @@ #!/bin/sh -source lib/generic.sh +. lib/generic.sh DEV="$(rand_dev)" ts_ip "$0" "Add $DEV dummy interface" link add dev $DEV type dummy diff --git a/tipc/Makefile b/tipc/Makefile index 2212beb0..d3c957e2 100644 --- a/tipc/Makefile +++ b/tipc/Makefile @@ -10,8 +10,6 @@ TIPCOBJ=bearer.o \ peer.o tipc.o TARGETS=tipc -CFLAGS += $(shell $(PKG_CONFIG) libmnl --cflags) -LDLIBS += $(shell $(PKG_CONFIG) libmnl --libs) endif |