Merge branch 'master' into net-next

This commit is contained in:
Stephen Hemminger 2017-09-29 10:51:25 -07:00
commit ee7bfb52a7
18 changed files with 17 additions and 5842 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
# locally generated
Config
static-syms.h
config.*
*.o

View File

@ -73,7 +73,7 @@ install: all
$(DESTDIR)$(DOCDIR)/examples
install -m 0644 $(shell find examples/diffserv -maxdepth 1 -type f) \
$(DESTDIR)$(DOCDIR)/examples/diffserv
@for i in $(SUBDIRS) doc; do $(MAKE) -C $$i install; done
@for i in $(SUBDIRS); do $(MAKE) -C $$i install; done
install -m 0644 $(shell find etc/iproute2 -maxdepth 1 -type f) $(DESTDIR)$(CONFDIR)
install -m 0755 -d $(DESTDIR)$(BASH_COMPDIR)
install -m 0644 bash-completion/tc $(DESTDIR)$(BASH_COMPDIR)
@ -84,7 +84,7 @@ snapshot:
> include/SNAPSHOT.h
clean:
@for i in $(SUBDIRS) doc; \
@for i in $(SUBDIRS); \
do $(MAKE) $(MFLAGS) -C $$i clean; done
clobber:

View File

@ -1,73 +0,0 @@
# PostScript documents built by the default goal.
PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps tc-filters.ps
# Documents that are currently left out of PSFILES:
# tc-cref.ps
# api-rtnl.tex api-pmtudisc.tex api-news.tex
# iki-netdev.ps iki-neighdst.ps

# External tools used by the rules in this file.
LATEX=latex
DVIPS=dvips
SGML2DVI=sgml2latex
SGML2HTML=sgml2html -s 0
LPR=lpr -Zsduplex

# Recipes are run with bash.
SHELL=bash

# NOTE(review): PAGESIZE and PAGESPERPAGE are not referenced by any rule in
# this Makefile; presumably left over from an n-up step -- confirm before
# removing.
PAGESIZE=a4
PAGESPERPAGE=2

# One .html per .sgml source in this directory.  $(wildcard) is expanded once,
# needs no shell, and is empty (not a literal "*.sgml") when nothing matches.
HTMLFILES:=$(patsubst %.sgml,%.html,$(wildcard *.sgml))

# Same base names as PSFILES with the suffix swapped; a suffix substitution
# only touches the end of each word.
DVIFILES=$(PSFILES:.ps=.dvi)
PDFFILES=$(PSFILES:.ps=.pdf)
# None of these goals names a real file; declare them phony so that a stray
# file called e.g. "pdf" or "all" cannot make the goal look up to date.
.PHONY: all pstwocol html dvi pdf print

# Default goal: build the PostScript documents.
all: pstwocol

pstwocol: $(PSFILES)

html: $(HTMLFILES)

dvi: $(DVIFILES)

pdf: $(PDFFILES)

# Build the PostScript documents and hand them to the print spooler.
print: $(PSFILES)
	$(LPR) $(PSFILES)
# linuxdoc SGML sources: convert to LaTeX or directly to DVI.
%.tex: %.sgml
	$(SGML2DVI) --output=tex $<

%.dvi: %.sgml
	$(SGML2DVI) --output=dvi $<

# LaTeX sources: run $(LATEX) and keep re-running it for as long as its output
# contains a "Label(s) may have changed" warning, a "No file" notice or an
# emergency stop.  The recipe fails once the pass counter exceeds 3.
# The counter uses POSIX $$(( )) arithmetic rather than the obsolete $$[ ] form.
%.dvi: %.tex
	@set -e; pass=2; echo "Running LaTeX $<"; \
	while [ `$(LATEX) $< </dev/null 2>&1 | \
		 grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
		if [ $$pass -gt 3 ]; then \
			echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
		fi; \
		echo "Re-running LaTeX $<, $${pass}d pass"; pass=$$((pass + 1)); \
	done
# LaTeX sources: run pdflatex and keep re-running it for as long as its output
# contains a "Label(s) may have changed" warning, a "No file" notice or an
# emergency stop.  The recipe fails once the pass counter exceeds 3.
# The counter uses POSIX $$(( )) arithmetic rather than the obsolete $$[ ] form.
%.pdf: %.tex
	@set -e; pass=2; echo "Running pdfLaTeX $<"; \
	while [ `pdflatex $< </dev/null 2>&1 | \
		 grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
		if [ $$pass -gt 3 ]; then \
			echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
		fi; \
		echo "Re-running pdfLaTeX $<, $${pass}d pass"; pass=$$((pass + 1)); \
	done

# Disabled alternative: derive the PDF from the PostScript file instead.
#%.pdf: %.ps
#	ps2pdf $<

# DVI to PostScript.
%.ps: %.dvi
	$(DVIPS) $< -o $@

# linuxdoc SGML to HTML.
%.html: %.sgml
	$(SGML2HTML) $<
# install and clean are commands, not files.
.PHONY: install clean

# Copy the LaTeX and SGML sources into $(DESTDIR)$(DOCDIR).  DOCDIR is not
# assigned in this Makefile, so it has to be supplied by the caller; the
# destination directory is not created here.
install:
	install -m 0644 $(wildcard *.tex) $(DESTDIR)$(DOCDIR)
	install -m 0644 $(wildcard *.sgml) $(DESTDIR)$(DOCDIR)

# Remove LaTeX by-products and every generated document format.
clean:
	rm -f *.aux *.log *.toc $(PSFILES) $(DVIFILES) *.html *.pdf

View File

@ -1,16 +0,0 @@
Partially finished work.
1. User Reference manuals.
1.1 IP Command reference (ip-cref.tex, published)
1.2 TC Command reference (tc-cref.tex)
1.3 IP tunnels (ip-tunnels.tex, published)
2. Linux-2.2 Networking API
2.1 RTNETLINK (api-rtnl.tex)
2.2 Path MTU Discovery (api-pmtudisc.tex)
2.3 IPv6 Flow Labels (api-ip6-flowlabels.tex, published)
2.4 Miscellaneous extensions (api-misc.tex)
3. Linux-2.2 Networking Intra-Kernel Interfaces
3.1 NetDev --- Networking Devices and netdev... (iki-netdev.tex)
3.2 Neighbour cache and destination cache. (iki-neighdst.tex)

View File

@ -1 +0,0 @@
\def\Draft{020116}

View File

@ -1,429 +0,0 @@
\documentstyle[12pt,twoside]{article}
\def\TITLE{IPv6 Flow Labels}
\input preamble
\begin{center}
\Large\bf IPv6 Flow Labels in Linux-2.2.
\end{center}
\begin{center}
{ \large Alexey~N.~Kuznetsov } \\
\em Institute for Nuclear Research, Moscow \\
\verb|kuznet@ms2.inr.ac.ru| \\
\rm April 11, 1999
\end{center}
\vspace{5mm}
\tableofcontents
\section{Introduction.}
Every IPv6 packet carries 28 bits of flow information. RFC2460 splits
these bits to two fields: 8 bits of traffic class (or DS field, if you
prefer this term) and 20 bits of flow label. Currently there exists
no well-defined API to manage IPv6 flow information. In this document
I describe an attempt to design the API for Linux-2.2 IPv6 stack.
\vskip 1mm
The API must solve the following tasks:
\begin{enumerate}
\item To allow user to set traffic class bits.
\item To allow user to read traffic class bits of received packets.
This feature is not so useful as the first one, however it will be
necessary f.e.\ to implement ECN [RFC2481] for datagram oriented services
or to implement receiver side of SRP or another end-to-end protocol
using traffic class bits.
\item To assign flow labels to packets sent by user.
\item To get flow labels of received packets. I do not know
any applications of this feature, but it is possible that receiver will
want to use flow labels to distinguish sub-flows.
\item To allocate flow labels in the way, compliant to RFC2460. Namely:
\begin{itemize}
\item
Flow labels must be uniformly distributed (pseudo-)random numbers,
so that any subset of 20 bits can be used as hash key.
\item
Flows with coinciding source address and flow label must have identical
destination address and not-fragmentable extensions headers (i.e.\
hop by hop options and all the headers up to and including routing header,
if it is present.)
\begin{NB}
There is a hole in specs: some hop-by-hop options can be
defined only on per-packet base (f.e.\ jumbo payload option).
Essentially, it means that such options cannot present in packets
with flow labels.
\end{NB}
\begin{NB}
NB notes here and below reflect only my personal opinion,
they should be read with smile or should not be read at all :-).
\end{NB}
\item
Flow labels have finite lifetime and source is not allowed to reuse
flow label for another flow until the maximal lifetime has expired,
so that intermediate nodes will be able to invalidate flow state before
the label is taken over by another flow.
Flow state, including lifetime, is propagated along datagram path
by some application specific methods
(f.e.\ in RSVP PATH messages or in some hop-by-hop option).
\end{itemize}
\end{enumerate}
\section{Sending/receiving flow information.}
\paragraph{Discussion.}
\addcontentsline{toc}{subsection}{Discussion}
It was proposed (Where? I do not remember any explicit statement)
to solve the first four tasks using
\verb|sin6_flowinfo| field added to \verb|struct| \verb|sockaddr_in6|
(see RFC2553).
\begin{NB}
This method is difficult to consider as reasonable, because it
puts additional overhead to all the services, despite of only
very small subset of them (none, to be more exact) really use it.
It contradicts both to IETF spirit and the letter. Before RFC2553
one justification existed, IPv6 address alignment left 4 byte
hole in \verb|sockaddr_in6| in any case. Now it has no justification.
\end{NB}
We have two problems with this method. The first one is common for all OSes:
if \verb|recvmsg()| initializes \verb|sin6_flowinfo| to flow info
of received packet, we lose one very important property of BSD socket API,
namely, we are not allowed to use received address for reply directly
and have to mangle it, even if we are not interested in flowinfo subtleties.
\begin{NB}
RFC2553 adds new requirement: to clear \verb|sin6_flowinfo|.
Certainly, it is not solution but rather attempt to force applications
to make unnecessary work. Well, as usually, one mistake in design
is followed by attempts to patch the hole and more mistakes...
\end{NB}
Another problem is Linux specific. Historically Linux IPv6 did not
initialize \verb|sin6_flowinfo| at all, so that, if kernel does not
support flow labels, this field is not zero, but a random number.
Some applications also did not take care about it.
\begin{NB}
Following RFC2553 such applications can be considered as broken,
but I still think that they are right: clearing all the address
before filling known fields is robust but stupid solution.
Useless wasting CPU cycles and
memory bandwidth is not a good idea. Such patches are acceptable
as temporary hacks, but not as standard of the future.
\end{NB}
\paragraph{Implementation.}
\addcontentsline{toc}{subsection}{Implementation}
By default Linux IPv6 does not read \verb|sin6_flowinfo| field
assuming that common applications are not obliged to initialize it
and are permitted to consider it as pure alignment padding.
In order to tell kernel that application
is aware of this field, it is necessary to set socket option
\verb|IPV6_FLOWINFO_SEND|.
\begin{verbatim}
int on = 1;
setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO_SEND,
(void*)&on, sizeof(on));
\end{verbatim}
Linux kernel never fills \verb|sin6_flowinfo| field, when passing
message to user space, though the kernels which support flow labels
initialize it to zero. If user wants to get received flowinfo, he
will set option \verb|IPV6_FLOWINFO| and after this he will receive
flowinfo as ancillary data object of type \verb|IPV6_FLOWINFO|
(cf.\ RFC2292).
\begin{verbatim}
int on = 1;
setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO, (void*)&on, sizeof(on));
\end{verbatim}
Flowinfo received and latched by a connected TCP socket also may be fetched
with \verb|getsockopt()| \verb|IPV6_PKTOPTIONS| together with
another optional information.
Besides that, in the spirit of RFC2292 the option \verb|IPV6_FLOWINFO|
may be used as alternative way to send flowinfo with \verb|sendmsg()| or
to latch it with \verb|IPV6_PKTOPTIONS|.
\paragraph{Note about IPv6 options and destination address.}
\addcontentsline{toc}{subsection}{IPv6 options and destination address}
If \verb|sin6_flowinfo| contains a non-zero flow label,
destination address in \verb|sin6_addr| and non-fragmentable
extension headers are ignored. Instead, kernel uses the values
cached at flow setup (see below). However, for connected sockets
kernel prefers the values set at connection time.
\paragraph{Example.}
\addcontentsline{toc}{subsection}{Example}
After setting socket option \verb|IPV6_FLOWINFO|
flowlabel and DS field are received as ancillary data object
of type \verb|IPV6_FLOWINFO| and level \verb|SOL_IPV6|.
In the cases when it is convenient to use \verb|recvfrom(2)|,
it is possible to replace library variant with your own one,
sort of:
\begin{verbatim}
#include <sys/socket.h>
#include <netinet/in6.h>
/*
 * recvfrom() variant which stores the flowinfo of the received
 * packet (taken from the IPV6_FLOWINFO ancillary object, or 0
 * if there is none) into the sin6_flowinfo field of *addr.
 * Returns what recvmsg() returned.
 */
size_t recvfrom(int fd, char *buf, size_t len, int flags,
                struct sockaddr *addr, int *addrlen)
{
        /* Must be signed: recvmsg() returns -1 on error and an
           unsigned variable would never compare less than zero. */
        ssize_t cc;
        char cbuf[128];
        struct cmsghdr *c;
        struct iovec iov = { buf, len };
        struct msghdr msg = { addr, *addrlen,
                              &iov, 1,
                              cbuf, sizeof(cbuf),
                              0 };

        cc = recvmsg(fd, &msg, flags);
        if (cc < 0)
                return cc;
        /* Default when no flowinfo object is attached. */
        ((struct sockaddr_in6*)addr)->sin6_flowinfo = 0;
        *addrlen = msg.msg_namelen;
        for (c=CMSG_FIRSTHDR(&msg); c; c = CMSG_NEXTHDR(&msg, c)) {
                if (c->cmsg_level != SOL_IPV6 ||
                    c->cmsg_type != IPV6_FLOWINFO)
                        continue;
                ((struct sockaddr_in6*)addr)->sin6_flowinfo = *(__u32*)CMSG_DATA(c);
        }
        return cc;
}
\end{verbatim}
\section{Flow label management.}
\paragraph{Discussion.}
\addcontentsline{toc}{subsection}{Discussion}
Requirements of RFC2460 are pretty tough. Particularly, lifetimes
longer than boot time require to store allocated labels at stable
storage, so that the full implementation necessarily includes user space flow
label manager. There are at least three different approaches:
\begin{enumerate}
\item {\bf ``Cooperative''. } We could leave flow label allocation wholly
to user space. When user needs label he requests manager directly. The approach
is valid, but like any ``cooperative'' approach it suffers from security problems.
\begin{NB}
One idea is to disallow not privileged user to allocate flow
labels, but instead to pass the socket to manager via \verb|SCM_RIGHTS|
control message, so that it will allocate label and assign it to socket
itself. Hmm... the idea is interesting.
\end{NB}
\item {\bf ``Indirect''.} Kernel redirects requests to user level daemon
and does not install label until the daemon acknowledged the request.
The approach is the most promising, it is especially pleasant to recognize
parallel with IPsec API [RFC2367,Craig]. Actually, it may share API with
IPsec.
\item {\bf ``Stupid''.} To allocate labels in kernel space. It is the simplest
method, but it suffers from two serious flaws: first,
we cannot lease labels with lifetimes longer than boot time; second,
it is sensitive to DoS attacks. The kernel has to remember all the obsolete
labels until their expiration and a malicious user may quickly exhaust all the
flow label space.
\end{enumerate}
Certainly, I choose the most ``stupid'' method. It is the cheapest one
for implementor (i.e.\ me), and taking into account that flow labels
still have no serious applications it is not useful to work on more
advanced API, especially, taking into account that eventually we
will get it for no fee together with IPsec.
\paragraph{Implementation.}
\addcontentsline{toc}{subsection}{Implementation}
Socket option \verb|IPV6_FLOWLABEL_MGR| allows to
request flow label manager to allocate new flow label, to reuse
already allocated one or to delete old flow label.
Its argument is \verb|struct| \verb|in6_flowlabel_req|:
\begin{verbatim}
struct in6_flowlabel_req
{
struct in6_addr flr_dst;
__u32 flr_label;
__u8 flr_action;
__u8 flr_share;
__u16 flr_flags;
__u16 flr_expires;
__u16 flr_linger;
__u32 __flr_reserved;
/* Options in format of IPV6_PKTOPTIONS */
};
\end{verbatim}
\begin{itemize}
\item \verb|dst| is IPv6 destination address associated with the label.
\item \verb|label| is flow label value in network byte order. If it is zero,
kernel will allocate new pseudo-random number. Otherwise, kernel will try
to lease flow label ordered by user. In this case, it is user task to provide
necessary flow label randomness.
\item \verb|action| is requested operation. Currently, only three operations
are defined:
\begin{verbatim}
#define IPV6_FL_A_GET 0 /* Get flow label */
#define IPV6_FL_A_PUT 1 /* Release flow label */
#define IPV6_FL_A_RENEW 2 /* Update expire time */
\end{verbatim}
\item \verb|flags| are optional modifiers. Currently
only \verb|IPV6_FL_A_GET| has modifiers:
\begin{verbatim}
#define IPV6_FL_F_CREATE 1 /* Allowed to create new label */
#define IPV6_FL_F_EXCL 2 /* Do not create new label */
\end{verbatim}
\item \verb|share| defines who is allowed to reuse the same flow label.
\begin{verbatim}
#define IPV6_FL_S_NONE 0 /* Not defined */
#define IPV6_FL_S_EXCL 1 /* Label is private */
#define IPV6_FL_S_PROCESS 2 /* May be reused by this process */
#define IPV6_FL_S_USER 3 /* May be reused by this user */
#define IPV6_FL_S_ANY 255 /* Anyone may reuse it */
\end{verbatim}
\item \verb|linger| is time in seconds. After the last user releases flow
label, it will not be reused with different destination and options at least
during this time. If \verb|share| is not \verb|IPV6_FL_S_EXCL| the label
still can be shared by another sockets. Current implementation does not allow
unprivileged user to set linger longer than 60 sec.
\item \verb|expires| is time in seconds. Flow label will be kept at least
for this time, but it will not be destroyed before user released it explicitly
or closed all the sockets using it. Current implementation does not allow
unprivileged user to set timeout longer than 60 sec. Privileged applications
MAY set longer lifetimes, but in this case they MUST save allocated
labels at stable storage and restore them back after reboot before the first
application allocates new flow.
\end{itemize}
This structure is followed by optional extension headers associated
with this flow label in format of \verb|IPV6_PKTOPTIONS|. Only
\verb|IPV6_HOPOPTS|, \verb|IPV6_RTHDR| and, if \verb|IPV6_RTHDR| is present,
\verb|IPV6_DSTOPTS| are allowed.
\paragraph{Example.}
\addcontentsline{toc}{subsection}{Example}
The function \verb|get_flow_label| allocates
private flow label.
\begin{verbatim}
/*
 * Lease a private flow label for destination dst on socket fd
 * (fl is the wanted label in host byte order), merge it into
 * dst->sin6_flowinfo and switch on IPV6_FLOWINFO_SEND.
 * Returns 0 on success, -1 on failure.
 */
int get_flow_label(int fd, struct sockaddr_in6 *dst, __u32 fl)
{
        struct in6_flowlabel_req req;
        int enable = 1;

        /* Describe the lease: exclusive label, created on demand. */
        memset(&req, 0, sizeof(req));
        memcpy(&req.flr_dst, &dst->sin6_addr, 16);
        req.flr_label  = htonl(fl);
        req.flr_action = IPV6_FL_A_GET;
        req.flr_share  = IPV6_FL_S_EXCL;
        req.flr_flags  = IPV6_FL_F_CREATE | IPV6_FL_F_EXCL;

        if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
                       &req, sizeof(req)) == -1) {
                perror ("can't lease flowlabel");
                return -1;
        }

        dst->sin6_flowinfo |= req.flr_label;

        if (setsockopt(fd, SOL_IPV6, IPV6_FLOWINFO_SEND,
                       &enable, sizeof(enable)) != -1)
                return 0;

        /* Sending flowinfo is impossible: give the label back. */
        perror ("can't send flowinfo");
        req.flr_action = IPV6_FL_A_PUT;
        setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
                   &req, sizeof(req));
        return -1;
}
\end{verbatim}
A bit more complicated example using routing header can be found
in \verb|ping6| utility (\verb|iputils| package). Linux rsvpd backend
contains an example of using operation \verb|IPV6_FL_A_RENEW|.
\paragraph{Listing flow labels.}
\addcontentsline{toc}{subsection}{Listing flow labels}
List of currently allocated
flow labels may be read from \verb|/proc/net/ip6_flowlabel|.
\begin{verbatim}
Label S Owner Users Linger Expires Dst Opt
A1BE5 1 0 0 6 3 3ffe2400000000010a0020fffe71fb30 0
\end{verbatim}
\begin{itemize}
\item \verb|Label| is hexadecimal flow label value.
\item \verb|S| is sharing style.
\item \verb|Owner| is ID of creator, it is zero, pid or uid, depending on
sharing style.
\item \verb|Users| is number of applications using the label now.
\item \verb|Linger| is \verb|linger| of this label in seconds.
\item \verb|Expires| is time until expiration of the label in seconds. It may
be negative, if the label is in use.
\item \verb|Dst| is IPv6 destination address.
\item \verb|Opt| is length of options, associated with the label. Option
data are not accessible.
\end{itemize}
\paragraph{Flow labels and RSVP.}
\addcontentsline{toc}{subsection}{Flow labels and RSVP}
RSVP daemon supports IPv6 flow labels
without any modifications to standard ISI RAPI. Sender must allocate
flow label, fill corresponding sender template and submit it to local rsvp
daemon. rsvpd will check the label and start to announce it in PATH
messages. Rsvpd on sender node will renew the flow label, so that it will not
be reused before path state expires and all the intermediate
routers and receiver purge flow state.
\verb|rtap| utility is modified to parse flow labels. F.e.\ if user allocated
flow label \verb|0xA1234|, he may write:
\begin{verbatim}
RTAP> sender 3ffe:2400::1/FL0xA1234 <Tspec>
\end{verbatim}
Receiver makes reservation with command:
\begin{verbatim}
RTAP> reserve ff 3ffe:2400::1/FL0xA1234 <Flowspec>
\end{verbatim}
\end{document}

View File

@ -1,130 +0,0 @@
<!doctype linuxdoc system>
<article>
<title>ARPD Daemon
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/arpd/ is daemon collecting gratuitous ARP information, saving
it on local disk and feeding it to kernel on demand to avoid
redundant broadcasting due to limited size of kernel ARP cache.
</abstract>
<p><bf/Description/
<p>The format of the command is:
<tscreen><verb>
arpd OPTIONS [ INTERFACE [ INTERFACE ... ] ]
</verb></tscreen>
<p> <tt/OPTIONS/ are:
<itemize>
<item><tt/-l/ - dump <tt/arpd/ database to stdout and exit. Output consists
of three columns: interface index, IP address and MAC address.
Negative entries for dead hosts are also shown, in this case MAC address
is replaced by word <tt/FAILED/ followed by colon and time when the fact
that host is dead was proven the last time.
<item><tt/-f FILE/ - read and load <tt/arpd/ database from <tt/FILE/
in text format similar to that dumped by option <tt/-l/. Exit after load,
probably listing resulting database, if option <tt/-l/ is also given.
If <tt/FILE/ is <tt/-/, <tt/stdin/ is read to get ARP table.
<item><tt/-b DATABASE/ - location of database file. Default location is
<tt>/var/lib/arpd/arpd.db</tt>.
<item><tt/-a NUMBER/ - <tt/arpd/ not only passively listens to ARP on the wire, but
also sends broadcast queries itself. <tt/NUMBER/ is number of such queries
to make before destination is considered as dead. When <tt/arpd/ is started
as kernel helper (i.e. with <tt/app_solicit/ enabled in <tt/sysctl/
or even with option <tt/-k/) without this option and still did not learn enough
information, you can observe 1 second gaps in service. Not fatal, but
not good.
<item><tt/-k/ - suppress sending broadcast queries by kernel. It makes
sense together with option <tt/-a/.
<item><tt/-n TIME/ - timeout of negative cache. When resolution fails <tt/arpd/
suppresses further attempts to resolve for this period. It makes sense
only together with option <tt/-k/. This timeout should not be too much
longer than boot time of a typical host not supporting gratuitous ARP.
Default value is 60 seconds.
<item><tt/-R RATE/ - maximal steady rate of broadcasts sent by <tt/arpd/
in packets per second. Default value is 1.
<item><tt/-B NUMBER/ - number of broadcasts sent by <tt/arpd/ back to back.
Default value is 3. Together with option <tt/-R/ this option allows
to police broadcasting not to exceed <tt/B+R*T/ over any interval
of time <tt/T/.
</itemize>
<p><tt/INTERFACE/ is name of networking interface to watch.
If no interfaces given, <tt/arpd/ monitors all the interfaces.
In this case <tt/arpd/ does not adjust <tt/sysctl/ parameters,
it is supposed user does this himself after <tt/arpd/ is started.
<p> Signals
<p> <tt/arpd/ exits gracefully syncing database and restoring adjusted
<tt/sysctl/ parameters, when receives <tt/SIGINT/ or <tt/SIGTERM/.
<tt/SIGHUP/ syncs database to disk. <tt/SIGUSR1/ sends some statistics
to <tt/syslog/. The effect of other signals is undefined; they may corrupt
database and leave <tt/sysctl/ parameters in an unpredictable state.
<p> Note
<p> In order for <tt/arpd/ to be able to serve as ARP resolver, kernel must be
compiled with the option <tt/CONFIG_ARPD/ and, in the case when interface list
is not given on command line, variable <tt/app_solicit/
on interfaces of interest should be set in <tt>/proc/sys/net/ipv4/neigh/*</tt>.
If this is not made <tt/arpd/ still collects gratuitous ARP information
in its database.
<p> Examples
<enum>
<item> Start <tt/arpd/ to collect gratuitous ARP, but not messing
with kernel functionality:
<tscreen><verb>
arpd -b /var/tmp/arpd.db
</verb></tscreen>
<item> Look at result after some time:
<tscreen><verb>
killall arpd
arpd -l -b /var/tmp/arpd.db
</verb></tscreen>
<item> To enable kernel helper, leaving leading role to kernel:
<tscreen><verb>
arpd -b /var/tmp/arpd.db -a 1 eth0 eth1
</verb></tscreen>
<item> Completely replace kernel resolution on interfaces <tt/eth0/
and <tt/eth1/. In this case kernel still does unicast probing to
validate entries, but all the broadcast activity is suppressed
and made under authority of <tt/arpd/:
<tscreen><verb>
arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1
</verb></tscreen>
This is the mode in which <tt/arpd/ is supposed to work normally.
It is not the default just to prevent accidental enabling of an overly aggressive
mode.
</enum>
</article>

View File

@ -1,16 +0,0 @@
#! /bin/bash
# Fit several pages of a PostScript file onto one sheet with whichever
# n-up tool is installed (psnup, then psmulti); if neither is available
# the input is copied unchanged.  Each command is echoed before it runs.
#
# $1 = Temporary file (read as input) . "string"
# $2 = File to process (written as output) . "string"
# $3 = Page size . ie: a4 , letter ... "string"
# $4 = Number of pages to fit on a single sheet . "numeric"
#
# All parameters are quoted where they are used, so file names containing
# whitespace or glob characters are passed through intact.
if command -v psnup >/dev/null 2>&1; then
	echo "psnup -$4 -p$3 $1 $2"
	psnup "-$4" "-p$3" "$1" "$2"
elif command -v psmulti >/dev/null 2>&1; then
	echo "psmulti $1 > $2"
	psmulti "$1" > "$2"
else
	echo "cp $1 $2"
	cp "$1" "$2"
fi

File diff suppressed because it is too large Load Diff

View File

@ -1,469 +0,0 @@
\documentstyle[12pt,twoside]{article}
\def\TITLE{Tunnels over IP}
\input preamble
\begin{center}
\Large\bf Tunnels over IP in Linux-2.2
\end{center}
\begin{center}
{ \large Alexey~N.~Kuznetsov } \\
\em Institute for Nuclear Research, Moscow \\
\verb|kuznet@ms2.inr.ac.ru| \\
\rm March 17, 1999
\end{center}
\vspace{5mm}
\tableofcontents
\section{Instead of introduction: micro-FAQ.}
\begin{itemize}
\item
Q: In linux-2.0.36 I used:
\begin{verbatim}
ifconfig tunl1 10.0.0.1 pointopoint 193.233.7.65
\end{verbatim}
to create tunnel. It does not work in 2.2.0!
A: You are right, it does not work. The command written above is split into two commands.
\begin{verbatim}
ip tunnel add MY-TUNNEL mode ipip remote 193.233.7.65
\end{verbatim}
will create tunnel device with name \verb|MY-TUNNEL|. Now you may configure
it with:
\begin{verbatim}
ifconfig MY-TUNNEL 10.0.0.1
\end{verbatim}
Certainly, if you prefer name \verb|tunl1| to \verb|MY-TUNNEL|,
you still may use it.
\item
Q: In linux-2.0.36 I used:
\begin{verbatim}
ifconfig tunl0 10.0.0.1
route add -net 10.0.0.0 gw 193.233.7.65 dev tunl0
\end{verbatim}
to tunnel net 10.0.0.0 via router 193.233.7.65. It does not
work in 2.2.0! Moreover, \verb|route| prints a funny error sort of
``network unreachable'' and after this I found a strange direct route
to 10.0.0.0 via \verb|tunl0| in routing table.
A: Yes, in 2.2 the rule that {\em normal} gateway must reside on directly
connected network has not any exceptions. You may tell kernel, that
this particular route is {\em abnormal}:
\begin{verbatim}
ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
ip route add 10.0.0.0/8 via 193.233.7.65 dev tunl0 onlink
\end{verbatim}
Note keyword \verb|onlink|, it is the magic key that orders kernel
not to check for consistency of gateway address.
Probably, after this explanation you have already guessed another method
to cheat kernel:
\begin{verbatim}
ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
route add -host 193.233.7.65 dev tunl0
route add -net 10.0.0.0 netmask 255.0.0.0 gw 193.233.7.65
route del -host 193.233.7.65 dev tunl0
\end{verbatim}
Well, if you like such tricks, nobody may prohibit you to use them.
Only do not forget
that between \verb|route add| and \verb|route del| host 193.233.7.65 is
unreachable.
\item
Q: In 2.0.36 I used to load \verb|tunnel| device module and \verb|ipip| module.
I cannot find any \verb|tunnel| in 2.2!
A: Linux-2.2 has single module \verb|ipip| for both directions of tunneling
and for all IPIP tunnel devices.
\item
Q: \verb|traceroute| does not work over tunnel! Well, stop... It works,
only skips some number of hops.
A: Yes. By default tunnel driver copies \verb|ttl| value from
inner packet to outer one. It means that path traversed by tunneled
packets to another endpoint is not hidden. If you dislike this, or if you
are going to use some routing protocol expecting that packets
with ttl 1 will reach peering host (f.e.\ RIP, OSPF or EBGP)
and you are not afraid of
tunnel loops, you may append option \verb|ttl 64|, when creating tunnel
with \verb|ip tunnel add|.
\item
Q: ... Well, list of things, which 2.0 was able to do finishes.
\end{itemize}
\paragraph{Summary of differences between 2.2 and 2.0.}
\begin{itemize}
\item {\bf In 2.0} you could compile tunnel device into kernel
and got set of 4 devices \verb|tunl0| ... \verb|tunl3| or,
alternatively, compile it as module and load new module
for each new tunnel. Also, module \verb|ipip| was necessary
to receive tunneled packets.
{\bf 2.2} has {\em one\/} module \verb|ipip|. Loading it you get base
tunnel device \verb|tunl0| and another tunnels may be created with command
\verb|ip tunnel add|. These new devices may have arbitrary names.
\item {\bf In 2.0} you set remote tunnel endpoint address with
the command \verb|ifconfig| ... \verb|pointopoint A|.
{\bf In 2.2} this command has the same semantics on all
the interfaces, namely it sets not tunnel endpoint,
but address of peering host, which is directly reachable
via this tunnel,
rather than via Internet. Actual tunnel endpoint address \verb|A|
should be set with \verb|ip tunnel add ... remote A|.
\item {\bf In 2.0} you create tunnel routes with the command:
\begin{verbatim}
route add -net 10.0.0.0 gw A dev tunl0
\end{verbatim}
{\bf 2.2} interprets this command equally for all device
kinds and gateway is required to be directly reachable via this tunnel,
rather than via Internet. You still may use \verb|ip route add ... onlink|
to override this behaviour.
\end{itemize}
\section{Tunnel setup: basics}
Standard Linux-2.2 kernel supports three flavors of tunnels,
listed in the following table:
\vspace{2mm}
\begin{tabular}{lll}
\vrule depth 0.8ex width 0pt\relax
Mode & Description & Base device \\
ipip & IP over IP & tunl0 \\
sit & IPv6 over IP & sit0 \\
gre & ANY over GRE over IP & gre0
\end{tabular}
\vspace{2mm}
\noindent All the kinds of tunnels are created with one command:
\begin{verbatim}
ip tunnel add <NAME> mode <MODE> [ local <S> ] [ remote <D> ]
\end{verbatim}
This command creates new tunnel device with name \verb|<NAME>|.
The \verb|<NAME>| is an arbitrary string. Particularly,
it may be even \verb|eth0|. The rest of parameters set
different tunnel characteristics.
\begin{itemize}
\item
\verb|mode <MODE>| sets tunnel mode. Three modes are available now:
\verb|ipip|, \verb|sit| and \verb|gre|.
\item
\verb|remote <D>| sets remote endpoint of the tunnel to IP
address \verb|<D>|.
\item
\verb|local <S>| sets fixed local address for tunneled
packets. It must be an address on another interface of this host.
\end{itemize}
\let\thefootnote\oldthefootnote
Both \verb|remote| and \verb|local| may be omitted. In this case we
say that they are zero or wildcard. Two tunnels of one mode cannot
have the same \verb|remote| and \verb|local|. Particularly it means
that base device or fallback tunnel cannot be replicated.\footnote{
This restriction is relaxed for keyed GRE tunnels.}
Tunnels are divided into two classes: {\bf pointopoint} tunnels, which
have some not wildcard \verb|remote| address and deliver all the packets
to this destination, and {\bf NBMA} (i.e. Non-Broadcast Multi-Access) tunnels,
which have no \verb|remote|. Particularly, base devices (f.e.\ \verb|tunl0|)
are NBMA, because they have neither \verb|remote| nor
\verb|local| addresses.
After tunnel device is created you should configure it as you would
any other device. Certainly, the configuration of tunnels has
some features related to the fact that they work over existing Internet
routing infrastructure and simultaneously create new virtual links,
which changes this infrastructure. The danger that not enough careful
tunnel setup will result in formation of tunnel loops,
collapse of routing or flooding network with exponentially
growing number of tunneled fragments is very real.
Protocol setup on pointopoint tunnels does not differ from configuration
of other devices. You should set a protocol address with \verb|ifconfig|
and add routes with \verb|route| utility.
NBMA tunnels are different. To route something via NBMA tunnel
you have to explain to driver, where it should deliver packets to.
The only way to make it is to create special routes with gateway
address pointing to desired endpoint. F.e.\
\begin{verbatim}
ip route add 10.0.0.0/24 via <A> dev tunl0 onlink
\end{verbatim}
It is important to use option \verb|onlink|, otherwise
kernel will refuse request to create route via gateway not directly
reachable over device \verb|tunl0|. With IPv6 the situation is much simpler:
when you start device \verb|sit0|, it automatically configures itself
with all IPv4 addresses mapped to IPv6 space, so that all IPv4
Internet is {\em really reachable} via \verb|sit0|! Excellent, the command
\begin{verbatim}
ip route add 3FFE::/16 via ::193.233.7.65 dev sit0
\end{verbatim}
will route \verb|3FFE::/16| via \verb|sit0|, sending all the packets
destined to this prefix to 193.233.7.65.
\section{Tunnel setup: options}
Command \verb|ip tunnel add| has several additional options.
\begin{itemize}
\item \verb|ttl N| --- set fixed TTL \verb|N| on tunneled packets.
\verb|N| is number in the range 1--255. 0 is special value,
meaning that packets inherit TTL value.
Default value is: \verb|inherit|.
\item \verb|tos T| --- set fixed tos \verb|T| on tunneled packets.
Default value is: \verb|inherit|.
\item \verb|dev DEV| --- bind tunnel to device \verb|DEV|, so that
tunneled packets will be routed only via this device and will
not be able to escape to another device, when route to endpoint changes.
\item \verb|nopmtudisc| --- disable Path MTU Discovery on this tunnel.
It is enabled by default. Note that fixed ttl is incompatible
with this option: tunnels with fixed ttl always make pmtu discovery.
\end{itemize}
\verb|ipip| and \verb|sit| tunnels have no more options. \verb|gre|
tunnels are more complicated:
\begin{itemize}
\item \verb|key K| --- use keyed GRE with key \verb|K|. \verb|K| is
either number or IP address-like dotted quad.
\item \verb|csum| --- checksum tunneled packets.
\item \verb|seq| --- serialize packets.
\begin{NB}
I think this option does not
work. At least, I did not test it, did not debug it and
even do not understand, how it is supposed to work and for what
purpose Cisco planned to use it.
\end{NB}
\end{itemize}
Actually, these GRE options can be set separately for input and
output directions by prefixing corresponding keywords with letter
\verb|i| or \verb|o|. F.e.\ \verb|icsum| orders to accept only
packets with correct checksum and \verb|ocsum| means, that
our host will calculate and send checksum.
Command \verb|ip tunnel add| is not the only operation,
which can be made with tunnels. Certainly, you may get short help page
with:
\begin{verbatim}
ip tunnel help
\end{verbatim}
Besides that, you may view list of installed tunnels with the help of command:
\begin{verbatim}
ip tunnel ls
\end{verbatim}
Also you may look at statistics:
\begin{verbatim}
ip -s tunnel ls Cisco
\end{verbatim}
where \verb|Cisco| is name of tunnel device. Command
\begin{verbatim}
ip tunnel del Cisco
\end{verbatim}
destroys tunnel \verb|Cisco|. And, finally,
\begin{verbatim}
ip tunnel change Cisco mode sit local ME remote HE ttl 32
\end{verbatim}
changes its parameters.
\section{Differences between 2.2 and 2.0 tunnels revisited.}
Now we can discuss more subtle differences between tunneling in 2.0
and 2.2.
\begin{itemize}
\item In 2.0 all tunneled packets were received promiscuously
as soon as you loaded module \verb|ipip|. 2.2 tries to select the best
tunnel device and packet looks as received on this. F.e.\ if host
received \verb|ipip| packet from host \verb|D| destined to our
local address \verb|S|, kernel searches for matching tunnels
in order:
\begin{tabular}{ll}
1 & \verb|remote| is \verb|D| and \verb|local| is \verb|S| \\
2 & \verb|remote| is \verb|D| and \verb|local| is wildcard \\
3 & \verb|remote| is wildcard and \verb|local| is \verb|S| \\
4 & \verb|tunl0|
\end{tabular}
If tunnel exists, but it is not in \verb|UP| state, the tunnel is ignored.
Note, that if \verb|tunl0| is \verb|UP| it receives all the IPIP packets,
not acknowledged by more specific tunnels.
Be careful, it means that without carefully installed firewall rules
anyone on the Internet may inject to your network any packets with
source addresses indistinguishable from local ones. It is not so bad idea
to design tunnels in the way enforcing maximal route symmetry
and to enable reversed path filter (\verb|rp_filter| sysctl option) on
tunnel devices.
\item In 2.2 you can monitor and debug tunnels with \verb|tcpdump|.
F.e.\ \verb|tcpdump| \verb|-i Cisco| \verb|-nvv| will dump packets,
which the kernel outputs via tunnel \verb|Cisco| and the packets received on it
from kernel viewpoint.
\end{itemize}
\section{Linux and Cisco IOS tunnels.}
Among other tunnels, Cisco IOS supports IPIP and GRE.
Essentially, the Cisco setup is a subset of the options available for Linux.
Let us consider the simplest example:
\begin{verbatim}
interface Tunnel0
tunnel mode gre ip
tunnel source 10.10.14.1
tunnel destination 10.10.13.2
\end{verbatim}
This command set translates to:
\begin{verbatim}
ip tunnel add Tunnel0 \
mode gre \
local 10.10.14.1 \
remote 10.10.13.2
\end{verbatim}
Any questions? No questions.
\section{Interaction IPIP tunnels and DVMRP.}
DVMRP exploits IPIP tunnels to route multicasts via Internet.
\verb|mrouted| creates
IPIP tunnels listed in its configuration file automatically.
From kernel and user viewpoints there are no differences between
tunnels, created in this way, and tunnels created by \verb|ip tunnel|.
I.e.\ if \verb|mrouted| created some tunnel, it may be used to
route unicast packets, provided appropriate routes are added.
And vice versa, if administrator has already created a tunnel,
it will be reused by \verb|mrouted|, if it requests DVMRP
tunnel with the same local and remote addresses.
Do not be surprised if your manually configured tunnel is
destroyed when mrouted exits.
\section{Broadcast GRE ``tunnels''.}
It is possible to set \verb|remote| for GRE tunnel to a multicast
address. Such tunnel becomes {\bf broadcast} tunnel (though word
tunnel is not quite appropriate in this case, it is rather virtual network).
\begin{verbatim}
ip tunnel add Universe local 193.233.7.65 \
remote 224.66.66.66 ttl 16
ip addr add 10.0.0.1/16 dev Universe
ip link set Universe up
\end{verbatim}
This tunnel is true broadcast network and broadcast packets are
sent to multicast group 224.66.66.66. By default such tunnel starts
to resolve both IP and IPv6 addresses via ARP/NDISC, so that
if multicast routing is supported in surrounding network, all GRE nodes
will find one another automatically and will form virtual Ethernet-like
broadcast network. If multicast routing does not work, it is unpleasant
but not fatal flaw. The tunnel becomes NBMA rather than broadcast network.
You may disable dynamic ARPing by:
\begin{verbatim}
echo 0 > /proc/sys/net/ipv4/neigh/Universe/mcast_solicit
\end{verbatim}
and add the required information to the ARP tables manually:
\begin{verbatim}
ip neigh add 10.0.0.2 lladdr 128.6.190.2 dev Universe nud permanent
\end{verbatim}
In this case packets sent to 10.0.0.2 will be encapsulated in GRE
and sent to 128.6.190.2. It is possible to facilitate address resolution
using methods typical for other NBMA networks, f.e.\ to start a user
level \verb|arpd| daemon, which will maintain a database of hosts attached
to the GRE virtual network, or to ask a dedicated ARP or NHRP server
for the information.
Actually, such setup is the most natural for tunneling,
it is really flexible, scalable and easily manageable, so that
it is strongly recommended to be used with GRE tunnels instead of ugly
hack with NBMA mode and \verb|onlink| modifier. Unfortunately,
for historical reasons broadcast mode is not supported by IPIP tunnels,
but this probably will change in future.
\section{Traffic control issues.}
Tunnels are devices, hence all the power of Linux traffic control
applies to them. The simplest (and the most useful in practice)
example is limiting tunnel bandwidth. The following command:
\begin{verbatim}
tc qdisc add dev tunl0 root tbf \
rate 128Kbit burst 4K limit 10K
\end{verbatim}
will limit tunneled traffic to 128Kbit with maximal burst size of 4K
and queuing not more than 10K.
However, you should remember, that tunnels are {\em virtual} devices
implemented in software and true queue management is impossible for them
just because they have no queues. Instead, it is better to create classes
on real physical interfaces and to map tunneled packets to them.
In general case of dynamic routing you should create such classes
on all outgoing interfaces, or, alternatively,
to use option \verb|dev DEV| to bind tunnel to a fixed physical device.
In the last case packets will be routed only via specified device
and you need to setup corresponding classes only on it.
Though you have to pay for this convenience,
if routing will change, your tunnel will fail.
Suppose that CBQ class \verb|1:ABC| has been created on device \verb|eth0|
specially for tunnel \verb|Cisco| with endpoints \verb|S| and \verb|D|.
Now you can select IPIP packets with addresses \verb|S| and \verb|D|
with some classifier and map them to class \verb|1:ABC|. F.e.\
it is easy to make with \verb|rsvp| classifier:
\begin{verbatim}
tc filter add dev eth0 pref 100 proto ip rsvp \
session D ipproto ipip filter S \
classid 1:ABC
\end{verbatim}
If you want to make more detailed classification of sub-flows
transmitted via tunnel, you can build CBQ subtree,
rooted at \verb|1:ABC| and attach to subroot set of rules parsing
IPIP packets more deeply.
\end{document}

View File

@ -1,110 +0,0 @@
<!doctype linuxdoc system>
<article>
<title>NSTAT, IFSTAT and RTACCT Utilities
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/nstat/, <tt/ifstat/ and <tt/rtacct/ are simple tools helping
to monitor kernel snmp counters and network interface statistics.
</abstract>
<p> These utilities are very similar, so that I describe
them simultaneously, using name <tt/Xstat/ in the places which apply
to all of them.
<p>The format of the command is:
<tscreen><verb>
Xstat [ OPTIONS ] [ PATTERN [ PATTERN ... ] ]
</verb></tscreen>
<p>
<tt/PATTERN/ is shell style pattern, selecting identifier
of SNMP variables or interfaces to show. Variable is displayed
if one of patterns matches its name. If no patterns are given,
<tt/Xstat/ assumes that user wants to see all the variables.
<p> <tt/OPTIONS/ is list of single letter options, using common unix
conventions.
<itemize>
<item><tt/-h/ - show help page
<item><tt/-?/ - the same, of course
<item><tt/-v/, <tt/-V/ - print version of <tt/Xstat/ and exit
<item><tt/-z/ - dump zero counters too. By default they are not shown.
<item><tt/-a/ - dump absolute values of counters. By default <tt/Xstat/
calculates increments since the previous use.
<item><tt/-s/ - do not update history, so that the next time you will
see counters including values accumulated to the moment
of this measurement too.
<item><tt/-n/ - do not display anything, only update history.
<item><tt/-r/ - reset history.
<item><tt/-d INTERVAL/ - <tt/Xstat/ is run in daemon mode collecting
statistics. <tt/INTERVAL/ is interval between measurements
in seconds.
<item><tt/-t INTERVAL/ - time interval to average rates. Default value
is 60 seconds.
<item><tt/-e/ - display extended information about errors (<tt/ifstat/ only).
</itemize>
<p>
History is just dump saved in file <tt>/tmp/.Xstat.uUID</tt>
or in file given by environment variables <tt/NSTAT_HISTORY/,
<tt/IFSTAT_HISTORY/ and <tt/RTACCT_HISTORY/.
Each time when you use <tt/Xstat/ values there are updated.
If you use patterns, only the values which you _really_ see
are updated. If you want to skip an uninteresting period,
use option <tt/-n/, or just output to <tt>/dev/null</tt>.
<p>
<tt/Xstat/ understands when history is invalidated by system reboot
or source of information switched between different instances
of daemonic <tt/Xstat/ and kernel SNMP tables and does not
use invalid history.
<p> Beware, <tt/Xstat/ will not produce sane output,
when many processes use it simultaneously. If several processes
under single user need this utility they should use environment
variables to put their history in safe places
or to use it with options <tt/-a -s/.
<p>
Well, that's all. The utility is very simple, but nevertheless
very handy.
<p> <bf/Output of XSTAT/
<p> The first line of output is <tt/#/ followed by identifier
of source of information, it may be word <tt/kernel/, when <tt/Xstat/
gets information from kernel or some dotted decimal number followed
by parameters, when it obtains information from running <tt/Xstat/ daemon.
<p>In the case of <tt/nstat/ the rest of output consists of three columns:
SNMP MIB identifier,
its value (or increment since previous measurement) and average
rate of increase of the counter per second. <tt/ifstat/ outputs
interface name followed by pairs of counter and rate of its change.
<p> <bf/Daemonic Xstat/
<p> <tt/Xstat/ may be started as daemon by any user. This makes sense
to avoid wrapped counters and to obtain reasonable long counters
for large time. Also <tt/Xstat/ daemon calculates average rates.
For the first goal sampling interval (option <tt/-d/) may be large enough,
f.e. for gigabit rates byte counters overflow not more frequently than
each 40 seconds and you may select interval of 20 seconds.
On the other hand, when <tt/Xstat/ is used for estimating rates
interval should be less than averaging period (option <tt/-t/), otherwise
estimation loses in quality.
Client <tt/Xstat/, before trying to get information from the kernel,
contacts daemon started by this user, then it tries system wide
daemon, which is supposed to be started by superuser. And only if
none of them replied it gets information from kernel.
<p> <bf/Environment/
<p> <tt/NSTAT_HISTORY/ - name of history file for <tt/nstat/.
<p> <tt/IFSTAT_HISTORY/ - name of history file for <tt/ifstat/.
<p> <tt/RTACCT_HISTORY/ - name of history file for <tt/rtacct/.
</article>

View File

@ -1,26 +0,0 @@
\textwidth 6.0in
\textheight 8.5in
\input SNAPSHOT
\pagestyle{myheadings}
\markboth{\protect\TITLE}{}
\markright{{\protect\sc iproute2-ss\Draft}}
% To print it in compact form: both sides on one sheet (psnup -2)
\evensidemargin=\oddsidemargin
\newenvironment{NB}{\bgroup \vskip 1mm\leftskip 1cm \footnotesize \noindent NB.
}{\par\egroup \vskip 1mm}
\def\threeonly{[2.3.15+ only] }
\begin{document}
\makeatletter
\renewcommand{\@oddhead}{{\protect\sc iproute2-ss\Draft} \hfill \protect\arabic{page}}
\makeatother
\let\oldthefootnote\thefootnote
\def\thefootnote{}
\footnotetext{Copyright \copyright~1999 A.N.Kuznetsov}

View File

@ -1,52 +0,0 @@
<!doctype linuxdoc system>
<article>
<title>RTACCT Utility
<author>Robert Olsson
<date>some_negative_number, 20 Dec 2001
<p>
Here is some code for monitoring the route cache. For systems handling high
network load, servers, routers, firewalls etc the route cache and its garbage
collection is crucial. Linux has a solid implementation.
<p>
The kernel patch (not required since linux-2.4.7) adds statistics counters
from route cache process into
/proc/net/rt_cache_stat. A companion user mode program presents the statistics
in a vmstat or iostat manner. The ratio between cache hits and misses gives
the flow length.
<p>
Hopefully it can help understanding performance and DoS and other related
issues.
<p> An URL where newer versions of this utility can be (probably) found
is ftp://robur.slu.se/pub/Linux/net-development/rt_cache_stat/
<p><bf/Description/
<p>The format of the command is:
<tscreen><verb>
rtstat [ OPTIONS ]
</verb></tscreen>
<p> <tt/OPTIONS/ are:
<itemize>
<item><tt/-h/, <tt/-help/ - show help page and version of the utility.
<item><tt/-i INTERVAL/ - interval between snapshots, default value is
2 seconds.
<item><tt/-s NUMBER/ - whether to print header line. 0 inhibits header line,
1 prescribes to print it once and 2 (this is default setting) forces header
line each 20 lines.
</itemize>
</article>

View File

@ -1,525 +0,0 @@
<!doctype linuxdoc system>
<article>
<title>SS Utility: Quick Intro
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/ss/ is yet another utility to investigate sockets.
Functionally it is NOT better than <tt/netstat/ combined
with some perl/awk scripts and though it is surely faster
it is not enough to make it much better. :-)
So, stop reading this now and do not waste your time.
Well, certainly, it proposes some functionality, which current
netstat is still not able to do, but surely will soon.
</abstract>
<sect>Why?
<p> <tt>/proc</tt> interface is inadequate, unfortunately.
When amount of sockets is large enough, <tt/netstat/ or even
plain <tt>cat /proc/net/tcp/</tt> cause nothing but pains and curses.
In linux-2.4 the disease became worse: even if amount
of sockets is small reading <tt>/proc/net/tcp/</tt> is slow enough.
This utility presents a new approach, which is supposed to scale
well. I am not going to describe technical details here and
will concentrate on description of the command.
The only important thing to say is that it is not so bad idea
to load module <tt/tcp_diag/, which can be found in directory
<tt/Modules/ of <tt/iproute2/. If you do not make this <tt/ss/
will work, but it falls back to <tt>/proc</tt> and becomes slow
like <tt/netstat/, well, a bit faster yet (see section "Some numbers").
<sect>Old news
<p>
In the simplest form <tt/ss/ is equivalent to netstat
with some small deviations.
<itemize>
<item><tt/ss -t -a/ dumps all TCP sockets
<item><tt/ss -u -a/ dumps all UDP sockets
<item><tt/ss -w -a/ dumps all RAW sockets
<item><tt/ss -x -a/ dumps all UNIX sockets
</itemize>
<p>
Option <tt/-o/ shows TCP timers state.
Option <tt/-e/ shows some extended information.
Etc. etc. etc. Seems, all the options of netstat related to sockets
are supported. Though not AX.25 and other bizarres. :-)
If someone wants, he can make support for decnet and ipx.
Some rudimentary support for them is already present in iproute2 libutils,
and I will be glad to see these new members.
<p>
However, standard functionality is a bit different:
<p>
The first: without option <tt/-a/ sockets in states
<tt/TIME-WAIT/ and <tt/SYN-RECV/ are skipped too.
It is more reasonable default, I think.
<p>
The second: format of UNIX sockets is different. It coincides
with tcp/udp. Though standard kernel still does not allow to
see write/read queues and peer address of connected UNIX sockets,
the patch doing this exists.
<p>
The third: default is to dump only TCP sockets, rather than all of the types.
<p>
The next: by default it does not resolve numeric host addresses (like <tt/ip/)!
Resolving is enabled with option <tt/-r/. Service names, usually stored
in local files, are resolved by default. Also, if service database
does not contain references to a port, <tt/ss/ queries system
<tt/rpcbind/. RPC services are prefixed with <tt/rpc./
Resolution of services may be suppressed with option <tt/-n/.
<p>
It does not accept "long" options (I dislike them, sorry).
So, address family is given with family identifier following
option <tt/-f/ to be aligned to iproute2 conventions.
Mostly, it is to allow option parser to parse
addresses correctly, but as side effect it really limits dumping
to sockets supporting only given family. Option <tt/-A/ followed
by list of socket tables to dump is also supported.
Logically, id of socket table is different from _address_ family, which is
another point of incompatibility. So, id is one of
<tt/all/, <tt/tcp/, <tt/udp/,
<tt/raw/, <tt/inet/, <tt/unix/, <tt/packet/, <tt/netlink/. See?
Well, <tt/inet/ is just abbreviation for <tt/tcp|udp|raw/
and it is not difficult to guess that <tt/packet/ allows
to look at packet sockets. Actually, there are also some other abbreviations,
f.e. <tt/unix_dgram/ selects only datagram UNIX sockets.
<p>
The next: well, I still do not know. :-)
<sect>Time to talk about new functionality.
<p>It is builtin filtering of socket lists.
<sect1> Filtering by state.
<p>
<tt/ss/ allows to filter socket states, using keywords
<tt/state/ and <tt/exclude/, followed by some state
identifier.
<p>
State identifier are standard TCP state names (not listed,
they are useless for you if you already do not know them)
or abbreviations:
<itemize>
<item><tt/all/ - for all the states
<item><tt/bucket/ - for TCP minisockets (<tt/TIME-WAIT|SYN-RECV/)
<item><tt/big/ - all except for minisockets
<item><tt/connected/ - not closed and not listening
<item><tt/synchronized/ - connected and not <tt/SYN-SENT/
</itemize>
<p>
F.e. to dump all tcp sockets except <tt/SYN-RECV/:
<tscreen><verb>
ss exclude SYN-RECV
</verb></tscreen>
<p>
If neither <tt/state/ nor <tt/exclude/ directives
are present,
state filter defaults to <tt/all/ with option <tt/-a/
or to <tt/all/,
excluding listening, syn-recv, time-wait and closed sockets.
<sect1> Filtering by addresses and ports.
<p>
Option list may contain address/port filter.
It is boolean expression which consists of boolean operation
<tt/or/, <tt/and/, <tt/not/ and predicates.
Actually, all the flavors of names for boolean operations are eaten:
<tt/&amp/, <tt/&amp&amp/, <tt/|/, <tt/||/, <tt/!/, but do not forget
about special sense given to these symbols by unix shells and escape
them correctly, when used from command line.
<p>
Predicates may be of the following kinds:
<itemize>
<item>A. Address/port match, where address is checked against mask
and port is either wildcard or exact. It is one of:
<tscreen><verb>
dst prefix:port
src prefix:port
src unix:STRING
src link:protocol:ifindex
src nl:channel:pid
</verb></tscreen>
Both prefix and port may be absent or replaced with <tt/*/,
which means wildcard. UNIX socket use more powerful scheme
matching to socket names by shell wildcards. Also, prefixes
unix: and link: may be omitted, if address family is evident
from context (with option <tt/-x/ or with <tt/-f unix/
or with <tt/unix/ keyword)
<p>
F.e.
<tscreen><verb>
dst 10.0.0.1
dst 10.0.0.1:
dst 10.0.0.1/32:
dst 10.0.0.1:*
</verb></tscreen>
are equivalent and mean socket connected to
any port on host 10.0.0.1
<tscreen><verb>
dst 10.0.0.0/24:22
</verb></tscreen>
sockets connected to port 22 on network
10.0.0.0...255.
<p>
Note that port is separated from address with colon, which creates
troubles with IPv6 addresses. Generally, we interpret the last
colon as splitting port. To allow to give IPv6 addresses,
trick like used in IPv6 HTTP URLs may be used:
<tscreen><verb>
dst [::1]
</verb></tscreen>
are sockets connected to ::1 on any port
<p>
Another way is <tt/dst ::1/128/. / helps to understand that
colon is part of IPv6 address.
<p>
Now we can add another alias for <tt/dst 10.0.0.1/:
<tt/dst [10.0.0.1]/. :-)
<p> Address may be a DNS name. In this case all the addresses are looked
up (in all the address families, if it is not limited by option <tt/-f/
or special address prefix <tt/inet:/, <tt/inet6/) and resulting
expression is <tt/or/ over all of them.
<item> B. Port expressions:
<tscreen><verb>
dport &gt= :1024
dport != :22
sport &lt :32000
</verb></tscreen>
etc.
All the relations: <tt/&lt/, <tt/&gt/, <tt/=/, <tt/>=/, <tt/=/, <tt/==/,
<tt/!=/, <tt/eq/, <tt/ge/, <tt/lt/, <tt/ne/...
Use variant which you like more, but not forget to escape special
characters when typing them in command line. :-)
Note that port number syntactically coincides to the case A!
You may even add an IP address, but it will not participate
in comparison, except for <tt/==/ and <tt/!=/, which are equivalent
to corresponding predicates of type A. F.e.
<p>
<tt/dst 10.0.0.1:22/
is equivalent to <tt/dport eq 10.0.0.1:22/
and
<tt/not dst 10.0.0.1:22/ is equivalent to
<tt/dport neq 10.0.0.1:22/
<item>C. Keyword <tt/autobound/. It matches to sockets bound automatically
on local system.
</itemize>
<sect> Examples
<p>
<itemize>
<item>1. List all the tcp sockets in state <tt/FIN-WAIT-1/ for our apache
to network 193.233.7/24 and look at their timers:
<tscreen><verb>
ss -o state fin-wait-1 \( sport = :http or sport = :https \) \
dst 193.233.7/24
</verb></tscreen>
Oops, forgot to say that missing logical operation is
equivalent to <tt/and/.
<item> 2. Well, now look at the rest...
<tscreen><verb>
ss -o excl fin-wait-1
ss state fin-wait-1 \( sport neq :http and sport neq :https \) \
or not dst 193.233.7/24
</verb></tscreen>
Note that we have to do _two_ calls of ss to do this.
State match is always anded to address/port match.
The reason for this is purely technical: ss does fast skip of
not matching states before parsing addresses and I consider the
ability to skip fastly gobs of time-wait and syn-recv sockets
as more important than logical generality.
<item> 3. So, let's look at all our sockets using autobound ports:
<tscreen><verb>
ss -a -A all autobound
</verb></tscreen>
<item> 4. And eventually find all the local processes connected
to local X servers:
<tscreen><verb>
ss -xp dst "/tmp/.X11-unix/*"
</verb></tscreen>
Pardon, this does not work with current kernel, patching is required.
But we still can look at server side:
<tscreen><verb>
ss -x src "/tmp/.X11-unix/*"
</verb></tscreen>
</itemize>
<sect> Returning to ground: real manual
<p>
<sect1> Command arguments
<p> General format of arguments to <tt/ss/ is:
<tscreen><verb>
ss [ OPTIONS ] [ STATE-FILTER ] [ ADDRESS-FILTER ]
</verb></tscreen>
<sect2><tt/OPTIONS/
<p> <tt/OPTIONS/ is list of single letter options, using common unix
conventions.
<itemize>
<item><tt/-h/ - show help page
<item><tt/-?/ - the same, of course
<item><tt/-v/, <tt/-V/ - print version of <tt/ss/ and exit
<item><tt/-s/ - print summary statistics. This option does not parse
socket lists obtaining summary from various sources. It is useful
when amount of sockets is so huge that parsing <tt>/proc/net/tcp</tt>
is painful.
<item><tt/-D FILE/ - do not display anything, just dump raw information
about TCP sockets to <tt/FILE/ after applying filters. If <tt/FILE/ is <tt/-/
<tt/stdout/ is used.
<item><tt/-F FILE/ - read continuation of filter from <tt/FILE/.
Each line of <tt/FILE/ is interpreted like single command line option.
If <tt/FILE/ is <tt/-/ <tt/stdin/ is used.
<item><tt/-r/ - try to resolve numeric address/ports
<item><tt/-n/ - do not try to resolve ports
<item><tt/-o/ - show some optional information, f.e. TCP timers
<item><tt/-i/ - show some information specific to TCP (RTO, congestion
window, slow start threshold etc.)
<item><tt/-e/ - show even more optional information
<item><tt/-m/ - show extended information on memory used by the socket.
It is available only with <tt/tcp_diag/ enabled.
<item><tt/-p/ - show list of processes owning the socket
<item><tt/-f FAMILY/ - default address family used for parsing addresses.
Also this option limits listing to sockets supporting
given address family. Currently the following families
are supported: <tt/unix/, <tt/inet/, <tt/inet6/, <tt/link/,
<tt/netlink/.
<item><tt/-4/ - alias for <tt/-f inet/
<item><tt/-6/ - alias for <tt/-f inet6/
<item><tt/-0/ - alias for <tt/-f link/
<item><tt/-A LIST-OF-TABLES/ - list of socket tables to dump, separated
by commas. The following identifiers are understood:
<tt/all/, <tt/inet/, <tt/tcp/, <tt/udp/, <tt/raw/,
<tt/unix/, <tt/packet/, <tt/netlink/, <tt/unix_dgram/,
<tt/unix_stream/, <tt/packet_raw/, <tt/packet_dgram/.
<item><tt/-x/ - alias for <tt/-A unix/
<item><tt/-t/ - alias for <tt/-A tcp/
<item><tt/-u/ - alias for <tt/-A udp/
<item><tt/-w/ - alias for <tt/-A raw/
<item><tt/-a/ - show sockets of all the states. By default sockets
in states <tt/LISTEN/, <tt/TIME-WAIT/, <tt/SYN-RECV/
and <tt/CLOSE/ are skipped.
<item><tt/-l/ - show only sockets in state <tt/LISTEN/
</itemize>
<sect2><tt/STATE-FILTER/
<p><tt/STATE-FILTER/ allows to construct arbitrary set of
states to match. Its syntax is sequence of keywords <tt/state/
and <tt/exclude/ followed by identifier of state.
Available identifiers are:
<p>
<itemize>
<item> All standard TCP states: <tt/established/, <tt/syn-sent/,
<tt/syn-recv/, <tt/fin-wait-1/, <tt/fin-wait-2/, <tt/time-wait/,
<tt/closed/, <tt/close-wait/, <tt/last-ack/, <tt/listen/ and <tt/closing/.
<item><tt/all/ - for all the states
<item><tt/connected/ - all the states except for <tt/listen/ and <tt/closed/
<item><tt/synchronized/ - all the <tt/connected/ states except for
<tt/syn-sent/
<item><tt/bucket/ - states, which are maintained as minisockets, i.e.
<tt/time-wait/ and <tt/syn-recv/.
<item><tt/big/ - opposite to <tt/bucket/
</itemize>
<sect2><tt/ADDRESS_FILTER/
<p><tt/ADDRESS_FILTER/ is boolean expression with operations <tt/and/, <tt/or/
and <tt/not/, which can be abbreviated in C style f.e. as <tt/&amp/,
<tt/&amp&amp/.
<p>
Predicates check socket addresses, both local and remote.
There are the following kinds of predicates:
<itemize>
<item> <tt/dst ADDRESS_PATTERN/ - matches remote address and port
<item> <tt/src ADDRESS_PATTERN/ - matches local address and port
<item> <tt/dport RELOP PORT/ - compares remote port to a number
<item> <tt/sport RELOP PORT/ - compares local port to a number
<item> <tt/autobound/ - checks that socket is bound to an ephemeral
port
</itemize>
<p><tt/RELOP/ is some of <tt/&lt=/, <tt/&gt=/, <tt/==/ etc.
To make this more convenient for use in unix shell, alphabetic
FORTRAN-like notations <tt/le/, <tt/gt/ etc. are accepted as well.
<p>The format and semantics of <tt/ADDRESS_PATTERN/ depends on address
family.
<itemize>
<item><tt/inet/ - <tt/ADDRESS_PATTERN/ consists of IP prefix, optionally
followed by colon and port. If prefix or port part is absent or replaced
with <tt/*/, this means wildcard match.
<item><tt/inet6/ - The same as <tt/inet/, only prefix refers to an IPv6
address. Unlike <tt/inet/ colon becomes ambiguous, so that <tt/ss/ allows
to use scheme, like used in URLs, where address is surrounded with
<tt/[/ ... <tt/]/.
<item><tt/unix/ - <tt/ADDRESS_PATTERN/ is shell-style wildcard.
<item><tt/packet/ - format looks like <tt/inet/, only interface index
stays instead of port and link layer protocol id instead of address.
<item><tt/netlink/ - format looks like <tt/inet/, only socket pid
stays instead of port and netlink channel instead of address.
</itemize>
<p><tt/PORT/ is syntactically <tt/ADDRESS_PATTERN/ with wildcard
address part. Certainly, it is undefined for UNIX sockets.
<sect1> Environment variables
<p>
<tt/ss/ allows to change source of information using various
environment variables:
<p>
<itemize>
<item> <tt/PROC_SLABINFO/ to override <tt>/proc/slabinfo</tt>
<item> <tt/PROC_NET_TCP/ to override <tt>/proc/net/tcp</tt>
<item> <tt/PROC_NET_UDP/ to override <tt>/proc/net/udp</tt>
<item> etc.
</itemize>
<p>
Variable <tt/PROC_ROOT/ allows to change root of all the <tt>/proc/</tt>
hierarchy.
<p>
Variable <tt/TCPDIAG_FILE/ prescribes to open a file instead of
requesting kernel to dump information about TCP sockets.
<p> This option is used mainly to investigate bug reports,
when dumps of files usually found in <tt>/proc/</tt> are received
by e-mail.
<sect1> Output format
<p>Six columns. The first is <tt/Netid/, it denotes socket type and
transport protocol, when it is ambiguous: <tt/tcp/, <tt/udp/, <tt/raw/,
<tt/u_str/ is abbreviation for <tt/unix_stream/, <tt/u_dgr/ for UNIX
datagram sockets, <tt/nl/ for netlink, <tt/p_raw/ and <tt/p_dgr/ for
raw and datagram packet sockets. This column is optional, it will
be hidden, if filter selects an unique netid.
<p>
The second column is <tt/State/. Socket state is displayed here.
The names are standard TCP names, except for <tt/UNCONN/, which
cannot happen for TCP, but normal for not connected sockets
of another types. Again, this column can be hidden.
<p>
Then two columns (<tt/Recv-Q/ and <tt/Send-Q/) showing amount of data
queued for receive and transmit.
<p>
And the last two columns display local address and port of the socket
and its peer address, if the socket is connected.
<p>
If options <tt/-o/, <tt/-e/ or <tt/-p/ were given, options are
displayed not in fixed positions but separated by spaces pairs:
<tt/option:value/. If value is not a single number, it is presented
as list of values, enclosed to <tt/(/ ... <tt/)/ and separated with
commas. F.e.
<tscreen><verb>
timer:(keepalive,111min,0)
</verb></tscreen>
is typical format for TCP timer (option <tt/-o/).
<tscreen><verb>
users:((X,113,3))
</verb></tscreen>
is typical for list of users (option <tt/-p/).
<sect>Some numbers
<p>
Well, let us use <tt/pidentd/ and a tool <tt/ibench/ to measure
its performance. It is 30 requests per second here. Nothing to test,
it is too slow. OK, let us patch pidentd with patch from directory
Patches. After this it handles about 4300 requests per second
and becomes handy tool to pollute socket tables with lots of timewait
buckets.
<p>
So, each test starts from pollution tables with 30000 sockets
and then doing full dump of the table piped to wc and measuring
timings with time:
<p>Results:
<itemize>
<item> <tt/netstat -at/ - 15.6 seconds
<item> <tt/ss -atr/, but without <tt/tcp_diag/ - 5.4 seconds
<item> <tt/ss -atr/ with <tt/tcp_diag/ - 0.47 seconds
</itemize>
No comments. Though one comment is necessary, most of time
without <tt/tcp_diag/ is wasted inside kernel with completely
blocked networking. More than 10 seconds, yes. <tt/tcp_diag/
does the same work for 100 milliseconds of system time.
</article>

View File

@ -1,514 +0,0 @@
\documentclass[12pt,twoside]{article}
\usepackage[hidelinks]{hyperref} % \url
\usepackage{booktabs} % nicer tabulars
\usepackage{fancyvrb}
\usepackage{fullpage}
\usepackage{float}
\newcommand{\iface}{\textit}
\newcommand{\cmd}{\texttt}
\newcommand{\man}{\textit}
\newcommand{\qdisc}{\texttt}
\newcommand{\filter}{\texttt}
\begin{document}
\title{QoS in Linux with TC and Filters}
\author{Phil Sutter (phil@nwl.cc)}
\date{January 2016}
\maketitle
Standard practice when transmitting packets over a medium which may block (due
to congestion, e.g.) is to use a queue which temporarily holds these packets. In
Linux, this queueing approach is where QoS happens: A Queueing Discipline
(qdisc) holds multiple packet queues with different priorities for dequeueing to
the network driver. The classification (i.e. deciding which queue a packet
should go into) is typically done based on Type Of Service (IPv4) or Traffic
Class (IPv6) header fields but depending on qdisc implementation, might be
controlled by the user as well.
Qdiscs come in two flavors, classful or classless. While classless qdiscs are
not as flexible as classful ones, they also require much less customizing. Often
it is enough to just attach them to an interface, without exact knowledge of
what is done internally. Classful qdiscs are the exact opposite: flexible in
application, they are often not even usable without insightful configuration.
As the name implies, classful qdiscs provide configurable classes to sort
traffic into. In its basic form, this is not much different than, say, the
classless \qdisc{pfifo\_fast} which holds three queues and classifies per
packet upon priority field. Though typically classes go beyond that by
supporting nesting and additional characteristics like e.g. maximum traffic
rate or quantum.
When it comes to controlling the classification process, filters come into play.
They attach to the parent of a set of classes (i.e. either the qdisc itself or
a parent class) and specify how a packet (or its associated flow) has to look
like in order to suit a given class. To overcome this simplification, it is
possible to attach multiple filters to the same parent, which then consults each
of them in turn until the first one accepts the packet.
Before getting into detail about what filters there are and how to use them, a
simple setup of a qdisc with classes is necessary:
\begin{figure}[H]
\begin{Verbatim}
.-------------------------------------------------------.
| |
| HTB |
| |
| .----------------------------------------------------.|
| | ||
| | Class 1:1 ||
| | ||
| | .---------------..---------------..---------------.||
| | | || || |||
| | | Class 1:10 || Class 1:20 || Class 1:30 |||
| | | || || |||
| | | .------------.|| .------------.|| .------------.|||
| | | | ||| | ||| | ||||
| | | | fq_codel ||| | fq_codel ||| | fq_codel ||||
| | | | ||| | ||| | ||||
| | | '------------'|| '------------'|| '------------'|||
| | '---------------''---------------''---------------'||
| '----------------------------------------------------'|
'-------------------------------------------------------'
\end{Verbatim}
\end{figure}
\noindent
The following commands establish the basic setup shown:
\begin{Verbatim}
(1) # tc qdisc replace dev eth0 root handle 1: htb default 30
(2) # tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit
(3) # alias tclass='tc class add dev eth0 parent 1:1'
(4) # tclass classid 1:10 htb rate 1mbit ceil 20mbit prio 1
(4) # tclass classid 1:20 htb rate 90mbit ceil 95mbit prio 2
(4) # tclass classid 1:30 htb rate 1mbit ceil 95mbit prio 3
(5) # tc qdisc add dev eth0 parent 1:10 fq_codel
(5) # tc qdisc add dev eth0 parent 1:20 fq_codel
(5) # tc qdisc add dev eth0 parent 1:30 fq_codel
\end{Verbatim}
A little explanation for the unfamiliar reader:
\begin{enumerate}
\item Replace the root qdisc of \iface{eth0} by an instance of \qdisc{HTB}.
Specifying the handle is necessary so it can be referenced in consecutive
calls to \cmd{tc}. The default class for unclassified traffic is set to
30.
\item Create a single top-level class with handle 1:1 which limits the total
bandwidth allowed to 95mbit/s. It is assumed that \iface{eth0} is a 100mbit/s link,
staying a little below that helps to keep the main point of enqueueing in
the qdisc layer instead of the interface hardware queue or at another
bottleneck in the network.
\item Define an alias for the common part of the remaining three calls in order
to improve readability. This means all remaining classes are attached to the
common parent class from (2).
\item Create three child classes for different uses: Class 1:10 has highest
priority but is tightly limited in bandwidth - fine for interactive
connections. Class 1:20 has mid priority and high guaranteed bandwidth, for
high priority bulk traffic. Finally, there's the default class 1:30 with
lowest priority, low guaranteed bandwidth and the ability to use the full
link in case it's unused otherwise. This should be fine for uninteresting
traffic not explicitly taken care of.
\item Attach a leaf qdisc to each of the child classes created in (4). Since
\qdisc{HTB} by default attaches \qdisc{pfifo} as leaf qdisc, this step is optional. Still,
the fairness between different flows provided by the classless \qdisc{fq\_codel} is
worth the effort.
\end{enumerate}
More information about the qdiscs and fine-tuning parameters can be found in
\man{tc-htb(8)} and \man{tc-fq\_codel(8)}.
Without any additional setup done, now all traffic leaving \iface{eth0} is shaped to
95mbit/s and directed through class 1:30. This can be verified by looking at the
\texttt{Sent} field of the class statistics printed via \cmd{tc -s class show dev eth0}:
Only the root class 1:1 and its child 1:30 should show any traffic.
\section*{Finally time to start filtering!}
Let's begin with a simple one, i.e. reestablishing what \qdisc{pfifo\_fast} did
automatically based on TOS/Priority field. Linux internally translates the
header field into the priority field of struct skbuff, which
\qdisc{pfifo\_fast} uses for
classification. \man{tc-prio(8)} contains a table listing the priority (and
ultimately, \qdisc{pfifo\_fast} queue index) each TOS value is being translated into.
Here is a shorter version:
\begin{center}
\begin{tabular}{lll}
TOS Values & Linux Priority (Number) & Queue Index \\
\midrule
0x0 - 0x6 & Best Effort (0) & 1 \\
0x8 - 0xe & Bulk (2) & 2 \\
0x10 - 0x16 & Interactive (6) & 0 \\
0x18 - 0x1e & Interactive Bulk (4) & 1 \\
\end{tabular}
\end{center}
Using the \filter{basic} filter, it is possible to match packets based on that skbuff
field, which has the added benefit of being IP version agnostic. Since the
\qdisc{HTB} setup above defaults to class ID 1:30, the Bulk priority can be
ignored. The \filter{basic} filter allows to combine matches, therefore we get along
with only two filters:
\begin{Verbatim}
# tc filter add dev eth0 parent 1: basic \
match 'meta(priority eq 6)' classid 1:10
# tc filter add dev eth0 parent 1: basic \
match 'meta(priority eq 0)' \
or 'meta(priority eq 4)' classid 1:20
\end{Verbatim}
A detailed description of the \filter{basic} filter and the ematch syntax it uses can be
found in \man{tc-basic(8)} and \man{tc-ematch(8)}.
Obviously, this first example cries for optimization. A simple one would be to
just change the default class from 1:30 to 1:20, so filters are only needed for
Bulk and Interactive priorities:
\begin{Verbatim}
# tc filter add dev eth0 parent 1: basic \
match 'meta(priority eq 6)' classid 1:10
# tc filter add dev eth0 parent 1: basic \
match 'meta(priority eq 2)' classid 1:20
\end{Verbatim}
Given that class IDs can be chosen freely, choosing them wisely allows for a direct
mapping. So first, recreate the qdisc and classes configuration:
\begin{Verbatim}
# tc qdisc replace dev eth0 root handle 1: htb default 10
# tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit
# alias tclass='tc class add dev eth0 parent 1:1'
# tclass classid 1:16 htb rate 1mbit ceil 20mbit prio 1
# tclass classid 1:10 htb rate 90mbit ceil 95mbit prio 2
# tclass classid 1:12 htb rate 1mbit ceil 95mbit prio 3
# tc qdisc add dev eth0 parent 1:16 fq_codel
# tc qdisc add dev eth0 parent 1:10 fq_codel
# tc qdisc add dev eth0 parent 1:12 fq_codel
\end{Verbatim}
This is basically identical to above, but with changed leaf class IDs and the
second priority class being the default. Using the \filter{flow} filter with its \texttt{map}
functionality, a single filter command is enough:
\begin{Verbatim}
# tc filter add dev eth0 parent 1: handle 0x1337 flow \
map key priority baseclass 1:10
\end{Verbatim}
The \filter{flow} filter now uses the priority value to construct a destination class ID
by adding it to the value of \texttt{baseclass}. While this works for priority values of
0, 2 and 6, it will result in non-existent class ID 1:14 for Interactive Bulk
traffic. In that case, the \qdisc{HTB} default applies so that traffic goes into class
ID 1:10 just as intended. Please note that specifying a handle is a mandatory
requirement by the \filter{flow} filter, although I didn't see where one would use that
later. For more information about \filter{flow}, see \man{tc-flow(8)}.
While \filter{flow} and \filter{basic} filters are relatively easy to apply and understand, they
are as well quite limited to their intended purpose. A more flexible option is
the \filter{u32} filter, which allows to match on arbitrary parts of the packet data -
yet only on that, not any meta data associated to it by the kernel (with the
exception of firewall mark value). So in order to continue this little
exercise with \filter{u32}, we have to base classification directly upon the actual TOS
value. An intuitive attempt might look like this:
\begin{Verbatim}
# alias tcfilter='tc filter add dev eth0 parent 1:'
# tcfilter u32 match ip dsfield 0x10 0x1e classid 1:16
# tcfilter u32 match ip dsfield 0x12 0x1e classid 1:16
# tcfilter u32 match ip dsfield 0x14 0x1e classid 1:16
# tcfilter u32 match ip dsfield 0x16 0x1e classid 1:16
# tcfilter u32 match ip dsfield 0x8 0x1e classid 1:12
# tcfilter u32 match ip dsfield 0xa 0x1e classid 1:12
# tcfilter u32 match ip dsfield 0xc 0x1e classid 1:12
# tcfilter u32 match ip dsfield 0xe 0x1e classid 1:12
\end{Verbatim}
The obvious drawback here is the amount of filters needed. And without the
default class, eight more filters would be necessary. This also has performance
implications: A packet with TOS value 0xe will be checked eight times in total
in order to determine its destination class. While there's not much to be done
about the number of filters, at least the performance problem can be eliminated
by using \filter{u32}'s hash table support:
\begin{Verbatim}
# tc filter add dev eth0 parent 1: prio 99 handle 1: u32 divisor 16
\end{Verbatim}
This creates a hash table with 16 buckets. The table size is arbitrary, but not
random: Since the first bit of the TOS field is not interesting, it can be
ignored and therefore the range of values to consider is just [0;15], i.e. a
number of 16 different values. The next step is to populate the hash table:
\begin{Verbatim}
# alias tcfilter='tc filter add dev eth0 parent 1: prio 99'
# tcfilter u32 match u8 0 0 ht 1:0: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:1: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:2: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:3: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:4: classid 1:12
# tcfilter u32 match u8 0 0 ht 1:5: classid 1:12
# tcfilter u32 match u8 0 0 ht 1:6: classid 1:12
# tcfilter u32 match u8 0 0 ht 1:7: classid 1:12
# tcfilter u32 match u8 0 0 ht 1:8: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:9: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:a: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:b: classid 1:16
# tcfilter u32 match u8 0 0 ht 1:c: classid 1:10
# tcfilter u32 match u8 0 0 ht 1:d: classid 1:10
# tcfilter u32 match u8 0 0 ht 1:e: classid 1:10
# tcfilter u32 match u8 0 0 ht 1:f: classid 1:10
\end{Verbatim}
The parameter \texttt{ht} denotes the hash table and bucket the filter should be added
to. Since the first TOS bit is ignored, the TOS value has to be divided by two in
order to get to the bucket it maps to. E.g. a TOS value of 0x10 will therefore
map to bucket 0x8. For the sake of completeness, all possible values are mapped
and therefore a configurable default class is not required. Note that the
match expression used here serves no functional purpose, but is syntactically mandatory. Therefore anything that
matches any packet will suffice. Finally, a filter which links to the defined
hash table is needed:
\begin{Verbatim}
# tc filter add dev eth0 parent 1: prio 1 protocol ip u32 \
link 1: hashkey mask 0x001e0000 match u8 0 0
\end{Verbatim}
Here again, the actual match statement is not necessary, but syntactically
required. All the magic lies within the \texttt{hashkey} parameter, which defines which
part of the packet should be used directly as hash key. Here's a drawing of the
first four bytes of the IPv4 header, with the area selected by \texttt{hashkey mask}
highlighted:
\begin{figure}[H]
\begin{Verbatim}
0 1 2 3
.-----------------------------------------------------------------.
| | | ######## | | |
| Version| IHL | #DSCP### | ECN| Total Length |
| | | ######## | | |
'-----------------------------------------------------------------'
\end{Verbatim}
\end{figure}
\noindent
This may look confusing at first, but keep in mind that bit- as well as
byte-ordering here is LSB while the mask value is written in MSB we humans use.
Therefore reading the mask is done like so, starting from left:
\begin{enumerate}
\item Skip the first byte (which contains Version and IHL fields).
\item Skip the lowest bit of the second byte (0x1e is even).
\item Mark the four following bits (0x1e is 11110 in binary).
\item Skip the remaining three bits of the second byte as well as the remaining two
bytes.
\end{enumerate}
Before doing the lookup, the kernel right-shifts the masked value by the amount
of zero-bits in \texttt{mask}, which implicitly also does the division by two which the
hash table depends on. With this setup, every packet has to pass exactly two
filters to be classified. Note that this filter is limited to IPv4 packets: Due
to the related Traffic Class field being at a different offset in the packet, it
would not work for IPv6. To use the same setup for IPv6 as well, a second
entry-level filter is necessary:
\begin{Verbatim}
# tc filter add dev eth0 parent 1: prio 2 protocol ipv6 u32 \
link 1: hashkey mask 0x01e00000 match u8 0 0
\end{Verbatim}
For illustration purposes, here again is a drawing of the first four bytes of
the IPv6 header, again with masked area highlighted:
\begin{figure}[H]
\begin{Verbatim}
0 1 2 3
.-----------------------------------------------------------------.
| | ######## | |
| Version| #Traffic Class| Flow Label |
| | ######## | |
'-----------------------------------------------------------------'
\end{Verbatim}
\end{figure}
\noindent
Reading the mask value is analogous to IPv4 with the added complexity that
Traffic Class spans over two bytes. Yet, for comparison there's a simple trick:
IPv6 has the interesting field shifted by four bits to the left, and the new
mask's value is shifted by the same amount. For further information about
\filter{u32} and what can be done with it, consult its man page
\man{tc-u32(8)}.
Of course, the kernel provides many more filters than just \filter{basic},
\filter{flow} and \filter{u32} which have been presented above. As of now, the
remaining ones are:
\begin{description}
\item[bpf]
Filtering using Berkeley Packet Filter programs. The program's return
code determines the packet's destination class ID.
\item[cgroup]
Filter packets based on control groups. This is only useful for packets
originating from the local host, as control groups only exist in that
scope.
\item[flower]
An extended variant of the flow filter.
\item[fw]
Matches on firewall mark values previously assigned to the packet by
netfilter (or a filter action, see below for details). This allows to
export the classification algorithm into netfilter, which is very
convenient if appropriate rules exist on the same system in there
already.
\item[route]
Filter packets based on matching routing table entry. Basically
equivalent to the \texttt{fw} filter above, to make use of an already existing
extensive routing table setup.
\item[rsvp, rsvp6]
Implementation of the Resource Reservation Protocol in Linux, to react
upon requests sent by an RSVP daemon.
\item[tcindex]
Match packets based on tcindex value, which is usually set by the dsmark
qdisc. This is part of an approach to support Differentiated Services in
Linux, which is another topic on its own.
\end{description}
\section*{Filter Actions}
The tc filter framework provides the infrastructure to another extensible set of
tools as well, namely tc actions. As the name suggests, they allow to do things
with packets (or associated data). (The list of) Actions are part of a given
filter. If it matches, each action it contains is executed in order before
returning the classification result. Since the action has direct access to the
latter, it is in theory possible for an action to react upon or even change the
filtering result - as long as the packet matched, of course. Yet none of the
currently in-tree actions make use of this.
The Generic Actions framework originally evolved out of the filters' ability to
police traffic to a given maximum bandwidth. One common use case for that is to
limit ingress traffic, dropping packets which exceed the threshold. A classic
setup example is like so:
\begin{Verbatim}
# tc qdisc add dev eth0 handle ffff: ingress
# tc filter add dev eth0 parent ffff: u32 \
match u32 0 0 \
police rate 1mbit burst 100k
\end{Verbatim}
The ingress qdisc is not a real one, but merely a point of reference for filters
to attach to which should get applied to incoming traffic. The \filter{u32} filter added
above matches on any packet and therefore limits the total incoming bandwidth to
1mbit/s, allowing bursts of up to 100kbytes. Using the new syntax, the filter
command changes slightly:
\begin{Verbatim}
# tc filter add dev eth0 parent ffff: u32 \
match u32 0 0 \
action police rate 1mbit burst 100k
\end{Verbatim}
The important detail is that this syntax allows to define multiple actions.
E.g. for testing purposes, it is possible to redirect exceeding traffic to the
loopback interface instead of dropping it:
\begin{Verbatim}
# tc filter add dev eth0 parent ffff: u32 \
match u32 0 0 \
action police rate 1mbit burst 100k conform-exceed pipe \
action mirred egress redirect dev lo
\end{Verbatim}
The added parameter \texttt{conform-exceed pipe} tells the police action to allow for
further actions to handle the exceeding packet.
Apart from \texttt{police} and \texttt{mirred} actions, there are a few more. Here's a full
list of the currently implemented ones:
\begin{description}
\item[bpf]
Apply a Berkeley Packet Filter program to the packet.
\item[connmark]
Set the packet's firewall mark to that of its connection. This works by
searching the conntrack table for a matching entry. If found, the mark
is restored.
\item[csum]
Trigger recalculation of packet checksums. The supported protocols are:
IPv4, ICMP, IGMP, TCP, UDP and UDPLite.
\item[ipt]
Pass the packet to an iptables target. This allows to use iptables
extensions directly instead of having to go the extra mile via setting
an arbitrary firewall mark and matching on that from within netfilter.
\item[mirred]
Mirror or redirect packets. This is often combined with the ifb pseudo
device to share a common QoS setup between multiple interfaces or even
ingress traffic.
\item[nat]
Perform stateless Network Address Translation. This is certainly not
complete and therefore inferior to NAT using iptables: Although the
kernel module distinguishes between TCP, UDP and ICMP traffic, it does not
handle typical problematic protocols such as active FTP or SIP.
\item[pedit]
Generic packet editing. This allows to alter arbitrary bytes of the
packet, either by specifying an offset into the packet or by naming a
packet header and field name to change. Currently, the latter is
implemented only for IPv4.
\item[police]
Apply a bandwidth rate limiting policy. Packets exceeding it are dropped
by default, but may optionally be handled differently.
\item[simple]
This is more of an example than a real action. All it does is print a
user-defined string together with a packet counter. Useful maybe for
debugging when filter statistics are not available or too complicated.
\item[skbedit]
Edit associated packet data, supports changing queue mapping, priority
field and firewall mark value.
\item[vlan]
Add/remove a VLAN header to/from the packet. This might serve as
alternative to using 802.1Q pseudo-interfaces in combination with
routing rules when e.g. packets for a given destination need to be
encapsulated.
\end{description}
\section*{Intermediate Functional Block}
The Intermediate Functional Block (\texttt{ifb}) pseudo network interface acts as a QoS
concentrator for multiple different sources of traffic. Packets from or to other
interfaces have to be redirected to it using the \texttt{mirred} action in order to be
handled, regularly routed traffic will be dropped. This way, a single stack of
qdiscs, classes and filters can be shared between multiple interfaces.
Here's a simple example to feed incoming traffic from multiple interfaces
through a Stochastic Fairness Queue (\qdisc{sfq}):
\begin{Verbatim}
(1) # modprobe ifb
(2) # ip link set ifb0 up
(3) # tc qdisc add dev ifb0 root sfq
\end{Verbatim}
The first step is to load the \texttt{ifb} kernel module (1). By default, this will
create two ifb devices: \iface{ifb0} and \iface{ifb1}. After setting
\iface{ifb0} up in (2), the root
qdisc is replaced by \qdisc{sfq} in (3). Finally, one can start redirecting ingress
traffic to \iface{ifb0}, e.g. from \iface{eth0}:
\begin{Verbatim}
# tc qdisc add dev eth0 handle ffff: ingress
# tc filter add dev eth0 parent ffff: u32 \
match u32 0 0 \
action mirred egress redirect dev ifb0
\end{Verbatim}
The same can be done for other interfaces, just replacing \iface{eth0} in the two
commands above. One thing to keep in mind here is the asymmetrical routing this
creates within the host doing the QoS: Incoming packets enter the system via
\iface{ifb0}, while corresponding replies leave directly via \iface{eth0}. This can be observed
using \cmd{tcpdump} on \iface{ifb0}, which shows the input part of the traffic only. What's
more confusing is that \cmd{tcpdump} on \iface{eth0} shows both incoming and outgoing traffic,
but the redirection is still effective - a simple proof is setting
\iface{ifb0} down,
which will interrupt the communication. Obviously \cmd{tcpdump} catches the packets to
dump before they enter the ingress qdisc, which is why it sees them while the
kernel itself doesn't.
\section*{Conclusion}
Once the steep learning curve has been mastered, the conglomerate of (classful)
qdiscs, filters and actions provides a highly sophisticated and flexible
infrastructure to perform QoS, which plays nicely along with routing and
firewalling setups.
\section*{Further Reading}
A good starting point for novice users and experienced ones diving into unknown
areas is the extensive HOWTO at \url{http://lartc.org}. The iproute2 package ships
some examples (usually in /usr/share/doc/, depending on distribution) as well as
man pages for \cmd{tc} in general, qdiscs and filters. The latter have been added
just recently though, so if your distribution does not ship iproute2 version
4.3.0 yet, these are not in there. Apart from that, the internet is a spring of
HOWTOs and scripts people wrote - though these should be taken with a grain of
salt: The complexity of the matter often leads to copying others' solutions
without much validation, which allows for less optimal or even obsolete
implementations to survive much longer than desired.
\end{document}

View File

@ -29,13 +29,11 @@ enum output_type {
PRINT_ANY = 4,
};
void new_json_obj(int json, FILE *fp);
void new_json_obj(int json);
void delete_json_obj(void);
bool is_json_context(void);
void set_current_fp(FILE *fp);
void fflush_fp(void);
void open_json_object(const char *str);

View File

@ -1815,7 +1815,7 @@ static int ipaddr_showdump(void)
if (ipadd_dump_check_magic())
exit(-1);
new_json_obj(json, stdout);
new_json_obj(json);
open_json_object(NULL);
open_json_array(PRINT_JSON, "addr_info");
@ -2176,7 +2176,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action)
* Initialize a json_writer and open an array object
* if -json was specified.
*/
new_json_obj(json, stdout);
new_json_obj(json);
/*
* If only filter_dev present and none of the other

View File

@ -16,15 +16,14 @@
#include "json_print.h"
static json_writer_t *_jw;
static FILE *_fp;
#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw)
#define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY))
void new_json_obj(int json, FILE *fp)
void new_json_obj(int json)
{
if (json) {
_jw = jsonw_new(fp);
_jw = jsonw_new(stdout);
if (!_jw) {
perror("json object");
exit(1);
@ -32,7 +31,6 @@ void new_json_obj(int json, FILE *fp)
jsonw_pretty(_jw, true);
jsonw_start_array(_jw);
}
set_current_fp(fp);
}
void delete_json_obj(void)
@ -48,15 +46,6 @@ bool is_json_context(void)
return _jw != NULL;
}
void set_current_fp(FILE *fp)
{
if (!fp) {
fprintf(stderr, "Error: invalid file pointer.\n");
exit(1);
}
_fp = fp;
}
json_writer_t *get_json_writer(void)
{
return _jw;
@ -89,7 +78,7 @@ void open_json_array(enum output_type type, const char *str)
jsonw_name(_jw, str);
jsonw_start_array(_jw);
} else if (_IS_FP_CONTEXT(type)) {
fprintf(_fp, "%s", str);
printf("%s", str);
}
}
@ -103,7 +92,7 @@ void close_json_array(enum output_type type, const char *str)
jsonw_end_array(_jw);
jsonw_pretty(_jw, true);
} else if (_IS_FP_CONTEXT(type)) {
fprintf(_fp, "%s", str);
printf("%s", str);
}
}
@ -124,7 +113,7 @@ void close_json_array(enum output_type type, const char *str)
else \
jsonw_##type_name##_field(_jw, key, value); \
} else if (_IS_FP_CONTEXT(t)) { \
color_fprintf(_fp, color, fmt, value); \
color_fprintf(stdout, color, fmt, value); \
} \
}
_PRINT_FUNC(int, int);
@ -147,7 +136,7 @@ void print_color_string(enum output_type type,
else
jsonw_string_field(_jw, key, value);
} else if (_IS_FP_CONTEXT(type)) {
color_fprintf(_fp, color, fmt, value);
color_fprintf(stdout, color, fmt, value);
}
}
@ -168,7 +157,7 @@ void print_color_bool(enum output_type type,
else
jsonw_bool(_jw, value);
} else if (_IS_FP_CONTEXT(type)) {
color_fprintf(_fp, color, fmt, value ? "true" : "false");
color_fprintf(stdout, color, fmt, value ? "true" : "false");
}
}
@ -187,7 +176,7 @@ void print_color_0xhex(enum output_type type,
snprintf(b1, sizeof(b1), "%#x", hex);
print_string(PRINT_JSON, key, NULL, b1);
} else if (_IS_FP_CONTEXT(type)) {
color_fprintf(_fp, color, fmt, hex);
color_fprintf(stdout, color, fmt, hex);
}
}
@ -206,7 +195,7 @@ void print_color_hex(enum output_type type,
else
jsonw_string(_jw, b1);
} else if (_IS_FP_CONTEXT(type)) {
color_fprintf(_fp, color, fmt, hex);
color_fprintf(stdout, color, fmt, hex);
}
}
@ -226,6 +215,6 @@ void print_color_null(enum output_type type,
else
jsonw_null(_jw);
} else if (_IS_FP_CONTEXT(type)) {
color_fprintf(_fp, color, fmt, value);
color_fprintf(stdout, color, fmt, value);
}
}