mirror of
https://git.proxmox.com/git/mirror_iproute2
synced 2026-01-24 16:42:01 +00:00
Merge branch 'master' into net-next
This commit is contained in:
commit
ee7bfb52a7
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,5 @@
|
||||
# locally generated
|
||||
Config
|
||||
static-syms.h
|
||||
config.*
|
||||
*.o
|
||||
|
||||
4
Makefile
4
Makefile
@ -73,7 +73,7 @@ install: all
|
||||
$(DESTDIR)$(DOCDIR)/examples
|
||||
install -m 0644 $(shell find examples/diffserv -maxdepth 1 -type f) \
|
||||
$(DESTDIR)$(DOCDIR)/examples/diffserv
|
||||
@for i in $(SUBDIRS) doc; do $(MAKE) -C $$i install; done
|
||||
@for i in $(SUBDIRS); do $(MAKE) -C $$i install; done
|
||||
install -m 0644 $(shell find etc/iproute2 -maxdepth 1 -type f) $(DESTDIR)$(CONFDIR)
|
||||
install -m 0755 -d $(DESTDIR)$(BASH_COMPDIR)
|
||||
install -m 0644 bash-completion/tc $(DESTDIR)$(BASH_COMPDIR)
|
||||
@ -84,7 +84,7 @@ snapshot:
|
||||
> include/SNAPSHOT.h
|
||||
|
||||
clean:
|
||||
@for i in $(SUBDIRS) doc; \
|
||||
@for i in $(SUBDIRS); \
|
||||
do $(MAKE) $(MFLAGS) -C $$i clean; done
|
||||
|
||||
clobber:
|
||||
|
||||
73
doc/Makefile
73
doc/Makefile
@ -1,73 +0,0 @@
|
||||
PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps tc-filters.ps
|
||||
# tc-cref.ps
|
||||
# api-rtnl.tex api-pmtudisc.tex api-news.tex
|
||||
# iki-netdev.ps iki-neighdst.ps
|
||||
|
||||
|
||||
LATEX=latex
|
||||
DVIPS=dvips
|
||||
SGML2DVI=sgml2latex
|
||||
SGML2HTML=sgml2html -s 0
|
||||
LPR=lpr -Zsduplex
|
||||
SHELL=bash
|
||||
PAGESIZE=a4
|
||||
PAGESPERPAGE=2
|
||||
|
||||
HTMLFILES=$(subst .sgml,.html,$(shell echo *.sgml))
|
||||
DVIFILES=$(subst .ps,.dvi,$(PSFILES))
|
||||
PDFFILES=$(subst .ps,.pdf,$(PSFILES))
|
||||
|
||||
|
||||
all: pstwocol
|
||||
|
||||
pstwocol: $(PSFILES)
|
||||
|
||||
html: $(HTMLFILES)
|
||||
|
||||
dvi: $(DVIFILES)
|
||||
|
||||
pdf: $(PDFFILES)
|
||||
|
||||
print: $(PSFILES)
|
||||
$(LPR) $(PSFILES)
|
||||
|
||||
%.tex: %.sgml
|
||||
$(SGML2DVI) --output=tex $<
|
||||
|
||||
%.dvi: %.sgml
|
||||
$(SGML2DVI) --output=dvi $<
|
||||
|
||||
%.dvi: %.tex
|
||||
@set -e; pass=2; echo "Running LaTeX $<"; \
|
||||
while [ `$(LATEX) $< </dev/null 2>&1 | \
|
||||
grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
|
||||
if [ $$pass -gt 3 ]; then \
|
||||
echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
|
||||
fi; \
|
||||
echo "Re-running LaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \
|
||||
done
|
||||
|
||||
%.pdf: %.tex
|
||||
@set -e; pass=2; echo "Running pdfLaTeX $<"; \
|
||||
while [ `pdflatex $< </dev/null 2>&1 | \
|
||||
grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
|
||||
if [ $$pass -gt 3 ]; then \
|
||||
echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
|
||||
fi; \
|
||||
echo "Re-running pdfLaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \
|
||||
done
|
||||
#%.pdf: %.ps
|
||||
# ps2pdf $<
|
||||
|
||||
%.ps: %.dvi
|
||||
$(DVIPS) $< -o $@
|
||||
|
||||
%.html: %.sgml
|
||||
$(SGML2HTML) $<
|
||||
|
||||
install:
|
||||
install -m 0644 $(shell echo *.tex) $(DESTDIR)$(DOCDIR)
|
||||
install -m 0644 $(shell echo *.sgml) $(DESTDIR)$(DOCDIR)
|
||||
|
||||
clean:
|
||||
rm -f *.aux *.log *.toc $(PSFILES) $(DVIFILES) *.html *.pdf
|
||||
16
doc/Plan
16
doc/Plan
@ -1,16 +0,0 @@
|
||||
Partially finished work.
|
||||
|
||||
1. User Reference manuals.
|
||||
1.1 IP Command reference (ip-cref.tex, published)
|
||||
1.2 TC Command reference (tc-cref.tex)
|
||||
1.3 IP tunnels (ip-tunnels.tex, published)
|
||||
|
||||
2. Linux-2.2 Networking API
|
||||
2.1 RTNETLINK (api-rtnl.tex)
|
||||
2.2 Path MTU Discovery (api-pmtudisc.tex)
|
||||
2.3 IPv6 Flow Labels (api-ip6-flowlabels.tex, published)
|
||||
2.4 Miscellaneous extensions (api-misc.tex)
|
||||
|
||||
3. Linux-2.2 Networking Intra-Kernel Interfaces
|
||||
3.1 NetDev --- Networking Devices and netdev... (iki-netdev.tex)
|
||||
3.2 Neighbour cache and destination cache. (iki-neighdst.tex)
|
||||
@ -1 +0,0 @@
|
||||
\def\Draft{020116}
|
||||
@ -1,429 +0,0 @@
|
||||
\documentstyle[12pt,twoside]{article}
|
||||
\def\TITLE{IPv6 Flow Labels}
|
||||
\input preamble
|
||||
\begin{center}
|
||||
\Large\bf IPv6 Flow Labels in Linux-2.2.
|
||||
\end{center}
|
||||
|
||||
|
||||
\begin{center}
|
||||
{ \large Alexey~N.~Kuznetsov } \\
|
||||
\em Institute for Nuclear Research, Moscow \\
|
||||
\verb|kuznet@ms2.inr.ac.ru| \\
|
||||
\rm April 11, 1999
|
||||
\end{center}
|
||||
|
||||
\vspace{5mm}
|
||||
|
||||
\tableofcontents
|
||||
|
||||
\section{Introduction.}
|
||||
|
||||
Every IPv6 packet carries 28 bits of flow information. RFC2460 splits
|
||||
these bits to two fields: 8 bits of traffic class (or DS field, if you
|
||||
prefer this term) and 20 bits of flow label. Currently there exist
|
||||
no well-defined API to manage IPv6 flow information. In this document
|
||||
I describe an attempt to design the API for Linux-2.2 IPv6 stack.
|
||||
|
||||
\vskip 1mm
|
||||
|
||||
The API must solve the following tasks:
|
||||
|
||||
\begin{enumerate}
|
||||
|
||||
\item To allow user to set traffic class bits.
|
||||
|
||||
\item To allow user to read traffic class bits of received packets.
|
||||
This feature is not so useful as the first one, however it will be
|
||||
necessary f.e.\ to implement ECN [RFC2481] for datagram oriented services
|
||||
or to implement receiver side of SRP or another end-to-end protocol
|
||||
using traffic class bits.
|
||||
|
||||
\item To assign flow labels to packets sent by user.
|
||||
|
||||
\item To get flow labels of received packets. I do not know
|
||||
any applications of this feature, but it is possible that receiver will
|
||||
want to use flow labels to distinguish sub-flows.
|
||||
|
||||
\item To allocate flow labels in the way, compliant to RFC2460. Namely:
|
||||
|
||||
\begin{itemize}
|
||||
\item
|
||||
Flow labels must be uniformly distributed (pseudo-)random numbers,
|
||||
so that any subset of 20 bits can be used as hash key.
|
||||
|
||||
\item
|
||||
Flows with coinciding source address and flow label must have identical
|
||||
destination address and not-fragmentable extensions headers (i.e.\
|
||||
hop by hop options and all the headers up to and including routing header,
|
||||
if it is present.)
|
||||
|
||||
\begin{NB}
|
||||
There is a hole in specs: some hop-by-hop options can be
|
||||
defined only on per-packet base (f.e.\ jumbo payload option).
|
||||
Essentially, it means that such options cannot be present in packets
|
||||
with flow labels.
|
||||
\end{NB}
|
||||
\begin{NB}
|
||||
NB notes here and below reflect only my personal opinion,
|
||||
they should be read with smile or should not be read at all :-).
|
||||
\end{NB}
|
||||
|
||||
|
||||
\item
|
||||
Flow labels have finite lifetime and source is not allowed to reuse
|
||||
flow label for another flow until the maximal lifetime has expired,
|
||||
so that intermediate nodes will be able to invalidate flow state before
|
||||
the label is taken over by another flow.
|
||||
Flow state, including lifetime, is propagated along datagram path
|
||||
by some application specific methods
|
||||
(f.e.\ in RSVP PATH messages or in some hop-by-hop option).
|
||||
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\end{enumerate}
|
||||
|
||||
\section{Sending/receiving flow information.}
|
||||
|
||||
\paragraph{Discussion.}
|
||||
\addcontentsline{toc}{subsection}{Discussion}
|
||||
It was proposed (Where? I do not remember any explicit statement)
|
||||
to solve the first four tasks using
|
||||
\verb|sin6_flowinfo| field added to \verb|struct| \verb|sockaddr_in6|
|
||||
(see RFC2553).
|
||||
|
||||
\begin{NB}
|
||||
This method is difficult to consider as reasonable, because it
|
||||
puts additional overhead to all the services, despite of only
|
||||
very small subset of them (none, to be more exact) really use it.
|
||||
It contradicts both to IETF spirit and the letter. Before RFC2553
|
||||
one justification existed, IPv6 address alignment left 4 byte
|
||||
hole in \verb|sockaddr_in6| in any case. Now it has no justification.
|
||||
\end{NB}
|
||||
|
||||
We have two problems with this method. The first one is common for all OSes:
|
||||
if \verb|recvmsg()| initializes \verb|sin6_flowinfo| to flow info
|
||||
of received packet, we lose one very important property of BSD socket API,
|
||||
namely, we are not allowed to use received address for reply directly
|
||||
and have to mangle it, even if we are not interested in flowinfo subtleties.
|
||||
|
||||
\begin{NB}
|
||||
RFC2553 adds new requirement: to clear \verb|sin6_flowinfo|.
|
||||
Certainly, it is not solution but rather attempt to force applications
|
||||
to make unnecessary work. Well, as usually, one mistake in design
|
||||
is followed by attempts to patch the hole and more mistakes...
|
||||
\end{NB}
|
||||
|
||||
Another problem is Linux specific. Historically Linux IPv6 did not
|
||||
initialize \verb|sin6_flowinfo| at all, so that, if kernel does not
|
||||
support flow labels, this field is not zero, but a random number.
|
||||
Some applications also did not take care about it.
|
||||
|
||||
\begin{NB}
|
||||
Following RFC2553 such applications can be considered as broken,
|
||||
but I still think that they are right: clearing all the address
|
||||
before filling known fields is robust but stupid solution.
|
||||
Useless wasting CPU cycles and
|
||||
memory bandwidth is not a good idea. Such patches are acceptable
|
||||
as temporary hacks, but not as standard of the future.
|
||||
\end{NB}
|
||||
|
||||
|
||||
\paragraph{Implementation.}
|
||||
\addcontentsline{toc}{subsection}{Implementation}
|
||||
By default Linux IPv6 does not read \verb|sin6_flowinfo| field
|
||||
assuming that common applications are not obliged to initialize it
|
||||
and are permitted to consider it as pure alignment padding.
|
||||
In order to tell kernel that application
|
||||
is aware of this field, it is necessary to set socket option
|
||||
\verb|IPV6_FLOWINFO_SEND|.
|
||||
|
||||
\begin{verbatim}
|
||||
int on = 1;
|
||||
setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO_SEND,
|
||||
(void*)&on, sizeof(on));
|
||||
\end{verbatim}
|
||||
|
||||
Linux kernel never fills \verb|sin6_flowinfo| field, when passing
|
||||
message to user space, though the kernels which support flow labels
|
||||
initialize it to zero. If user wants to get received flowinfo, he
|
||||
will set option \verb|IPV6_FLOWINFO| and after this he will receive
|
||||
flowinfo as ancillary data object of type \verb|IPV6_FLOWINFO|
|
||||
(cf.\ RFC2292).
|
||||
|
||||
\begin{verbatim}
|
||||
int on = 1;
|
||||
setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO, (void*)&on, sizeof(on));
|
||||
\end{verbatim}
|
||||
|
||||
Flowinfo received and latched by a connected TCP socket also may be fetched
|
||||
with \verb|getsockopt()| \verb|IPV6_PKTOPTIONS| together with
|
||||
another optional information.
|
||||
|
||||
Besides that, in the spirit of RFC2292 the option \verb|IPV6_FLOWINFO|
|
||||
may be used as alternative way to send flowinfo with \verb|sendmsg()| or
|
||||
to latch it with \verb|IPV6_PKTOPTIONS|.
|
||||
|
||||
\paragraph{Note about IPv6 options and destination address.}
|
||||
\addcontentsline{toc}{subsection}{IPv6 options and destination address}
|
||||
If \verb|sin6_flowinfo| contains a non-zero flow label,
|
||||
destination address in \verb|sin6_addr| and non-fragmentable
|
||||
extension headers are ignored. Instead, kernel uses the values
|
||||
cached at flow setup (see below). However, for connected sockets
|
||||
kernel prefers the values set at connection time.
|
||||
|
||||
\paragraph{Example.}
|
||||
\addcontentsline{toc}{subsection}{Example}
|
||||
After setting socket option \verb|IPV6_FLOWINFO|
|
||||
flowlabel and DS field are received as ancillary data object
|
||||
of type \verb|IPV6_FLOWINFO| and level \verb|SOL_IPV6|.
|
||||
In the cases when it is convenient to use \verb|recvfrom(2)|,
|
||||
it is possible to replace library variant with your own one,
|
||||
sort of:
|
||||
|
||||
\begin{verbatim}
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in6.h>
|
||||
|
||||
size_t recvfrom(int fd, char *buf, size_t len, int flags,
|
||||
struct sockaddr *addr, int *addrlen)
|
||||
{
|
||||
size_t cc;
|
||||
char cbuf[128];
|
||||
struct cmsghdr *c;
|
||||
struct iovec iov = { buf, len };
|
||||
struct msghdr msg = { addr, *addrlen,
|
||||
&iov, 1,
|
||||
cbuf, sizeof(cbuf),
|
||||
0 };
|
||||
|
||||
cc = recvmsg(fd, &msg, flags);
|
||||
if (cc < 0)
|
||||
return cc;
|
||||
((struct sockaddr_in6*)addr)->sin6_flowinfo = 0;
|
||||
*addrlen = msg.msg_namelen;
|
||||
for (c=CMSG_FIRSTHDR(&msg); c; c = CMSG_NEXTHDR(&msg, c)) {
|
||||
if (c->cmsg_level != SOL_IPV6 ||
|
||||
c->cmsg_type != IPV6_FLOWINFO)
|
||||
continue;
|
||||
((struct sockaddr_in6*)addr)->sin6_flowinfo = *(__u32*)CMSG_DATA(c);
|
||||
}
|
||||
return cc;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
|
||||
|
||||
\section{Flow label management.}
|
||||
|
||||
\paragraph{Discussion.}
|
||||
\addcontentsline{toc}{subsection}{Discussion}
|
||||
Requirements of RFC2460 are pretty tough. Particularly, lifetimes
|
||||
longer than boot time require to store allocated labels at stable
|
||||
storage, so that the full implementation necessarily includes user space flow
|
||||
label manager. There are at least three different approaches:
|
||||
|
||||
\begin{enumerate}
|
||||
\item {\bf ``Cooperative''. } We could leave flow label allocation wholly
|
||||
to user space. When user needs label he requests manager directly. The approach
|
||||
is valid, but as any ``cooperative'' approach it suffers of security problems.
|
||||
|
||||
\begin{NB}
|
||||
One idea is to disallow not privileged user to allocate flow
|
||||
labels, but instead to pass the socket to manager via \verb|SCM_RIGHTS|
|
||||
control message, so that it will allocate label and assign it to socket
|
||||
itself. Hmm... the idea is interesting.
|
||||
\end{NB}
|
||||
|
||||
\item {\bf ``Indirect''.} Kernel redirects requests to user level daemon
|
||||
and does not install label until the daemon acknowledged the request.
|
||||
The approach is the most promising, it is especially pleasant to recognize
|
||||
parallel with IPsec API [RFC2367,Craig]. Actually, it may share API with
|
||||
IPsec.
|
||||
|
||||
\item {\bf ``Stupid''.} To allocate labels in kernel space. It is the simplest
|
||||
method, but it suffers of two serious flaws: the first,
|
||||
we cannot lease labels with lifetimes longer than boot time, the second,
|
||||
it is sensitive to DoS attacks. Kernel have to remember all the obsolete
|
||||
labels until their expiration and malicious user may quickly eat all the
|
||||
flow label space.
|
||||
|
||||
\end{enumerate}
|
||||
|
||||
Certainly, I choose the most ``stupid'' method. It is the cheapest one
|
||||
for implementor (i.e.\ me), and taking into account that flow labels
|
||||
still have no serious applications it is not useful to work on more
|
||||
advanced API, especially, taking into account that eventually we
|
||||
will get it for no fee together with IPsec.
|
||||
|
||||
|
||||
\paragraph{Implementation.}
|
||||
\addcontentsline{toc}{subsection}{Implementation}
|
||||
Socket option \verb|IPV6_FLOWLABEL_MGR| allows to
|
||||
request flow label manager to allocate new flow label, to reuse
|
||||
already allocated one or to delete old flow label.
|
||||
Its argument is \verb|struct| \verb|in6_flowlabel_req|:
|
||||
|
||||
\begin{verbatim}
|
||||
struct in6_flowlabel_req
|
||||
{
|
||||
struct in6_addr flr_dst;
|
||||
__u32 flr_label;
|
||||
__u8 flr_action;
|
||||
__u8 flr_share;
|
||||
__u16 flr_flags;
|
||||
__u16 flr_expires;
|
||||
__u16 flr_linger;
|
||||
__u32 __flr_reserved;
|
||||
/* Options in format of IPV6_PKTOPTIONS */
|
||||
};
|
||||
\end{verbatim}
|
||||
|
||||
\begin{itemize}
|
||||
|
||||
\item \verb|dst| is IPv6 destination address associated with the label.
|
||||
|
||||
\item \verb|label| is flow label value in network byte order. If it is zero,
|
||||
kernel will allocate new pseudo-random number. Otherwise, kernel will try
|
||||
to lease flow label ordered by user. In this case, it is user task to provide
|
||||
necessary flow label randomness.
|
||||
|
||||
\item \verb|action| is requested operation. Currently, only three operations
|
||||
are defined:
|
||||
|
||||
\begin{verbatim}
|
||||
#define IPV6_FL_A_GET 0 /* Get flow label */
|
||||
#define IPV6_FL_A_PUT 1 /* Release flow label */
|
||||
#define IPV6_FL_A_RENEW 2 /* Update expire time */
|
||||
\end{verbatim}
|
||||
|
||||
\item \verb|flags| are optional modifiers. Currently
|
||||
only \verb|IPV6_FL_A_GET| has modifiers:
|
||||
|
||||
\begin{verbatim}
|
||||
#define IPV6_FL_F_CREATE 1 /* Allowed to create new label */
|
||||
#define IPV6_FL_F_EXCL 2 /* Do not create new label */
|
||||
\end{verbatim}
|
||||
|
||||
|
||||
\item \verb|share| defines who is allowed to reuse the same flow label.
|
||||
|
||||
\begin{verbatim}
|
||||
#define IPV6_FL_S_NONE 0 /* Not defined */
|
||||
#define IPV6_FL_S_EXCL 1 /* Label is private */
|
||||
#define IPV6_FL_S_PROCESS 2 /* May be reused by this process */
|
||||
#define IPV6_FL_S_USER 3 /* May be reused by this user */
|
||||
#define IPV6_FL_S_ANY 255 /* Anyone may reuse it */
|
||||
\end{verbatim}
|
||||
|
||||
\item \verb|linger| is time in seconds. After the last user releases flow
|
||||
label, it will not be reused with different destination and options at least
|
||||
during this time. If \verb|share| is not \verb|IPV6_FL_S_EXCL| the label
|
||||
still can be shared by other sockets. Current implementation does not allow
|
||||
unprivileged user to set linger longer than 60 sec.
|
||||
|
||||
\item \verb|expires| is time in seconds. Flow label will be kept at least
|
||||
for this time, but it will not be destroyed before user released it explicitly
|
||||
or closed all the sockets using it. Current implementation does not allow
|
||||
unprivileged user to set timeout longer than 60 sec. Privileged applications
|
||||
MAY set longer lifetimes, but in this case they MUST save allocated
|
||||
labels at stable storage and restore them back after reboot before the first
|
||||
application allocates new flow.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
This structure is followed by optional extension headers associated
|
||||
with this flow label in format of \verb|IPV6_PKTOPTIONS|. Only
|
||||
\verb|IPV6_HOPOPTS|, \verb|IPV6_RTHDR| and, if \verb|IPV6_RTHDR| is present,
|
||||
\verb|IPV6_DSTOPTS| are allowed.
|
||||
|
||||
\paragraph{Example.}
|
||||
\addcontentsline{toc}{subsection}{Example}
|
||||
The function \verb|get_flow_label| allocates
|
||||
private flow label.
|
||||
|
||||
\begin{verbatim}
|
||||
int get_flow_label(int fd, struct sockaddr_in6 *dst, __u32 fl)
|
||||
{
|
||||
int on = 1;
|
||||
struct in6_flowlabel_req freq;
|
||||
|
||||
memset(&freq, 0, sizeof(freq));
|
||||
freq.flr_label = htonl(fl);
|
||||
freq.flr_action = IPV6_FL_A_GET;
|
||||
freq.flr_flags = IPV6_FL_F_CREATE | IPV6_FL_F_EXCL;
|
||||
freq.flr_share = IPV6_FL_S_EXCL;
|
||||
memcpy(&freq.flr_dst, &dst->sin6_addr, 16);
|
||||
if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
|
||||
&freq, sizeof(freq)) == -1) {
|
||||
perror ("can't lease flowlabel");
|
||||
return -1;
|
||||
}
|
||||
dst->sin6_flowinfo |= freq.flr_label;
|
||||
|
||||
if (setsockopt(fd, SOL_IPV6, IPV6_FLOWINFO_SEND,
|
||||
&on, sizeof(on)) == -1) {
|
||||
perror ("can't send flowinfo");
|
||||
|
||||
freq.flr_action = IPV6_FL_A_PUT;
|
||||
setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
|
||||
&freq, sizeof(freq));
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
\end{verbatim}
|
||||
|
||||
A bit more complicated example using routing header can be found
|
||||
in \verb|ping6| utility (\verb|iputils| package). Linux rsvpd backend
|
||||
contains an example of using operation \verb|IPV6_FL_A_RENEW|.
|
||||
|
||||
\paragraph{Listing flow labels.}
|
||||
\addcontentsline{toc}{subsection}{Listing flow labels}
|
||||
List of currently allocated
|
||||
flow labels may be read from \verb|/proc/net/ip6_flowlabel|.
|
||||
|
||||
\begin{verbatim}
|
||||
Label S Owner Users Linger Expires Dst Opt
|
||||
A1BE5 1 0 0 6 3 3ffe2400000000010a0020fffe71fb30 0
|
||||
\end{verbatim}
|
||||
|
||||
\begin{itemize}
|
||||
\item \verb|Label| is hexadecimal flow label value.
|
||||
\item \verb|S| is sharing style.
|
||||
\item \verb|Owner| is ID of creator, it is zero, pid or uid, depending on
|
||||
sharing style.
|
||||
\item \verb|Users| is number of applications using the label now.
|
||||
\item \verb|Linger| is \verb|linger| of this label in seconds.
|
||||
\item \verb|Expires| is time until expiration of the label in seconds. It may
|
||||
be negative, if the label is in use.
|
||||
\item \verb|Dst| is IPv6 destination address.
|
||||
\item \verb|Opt| is length of options, associated with the label. Option
|
||||
data are not accessible.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\paragraph{Flow labels and RSVP.}
|
||||
\addcontentsline{toc}{subsection}{Flow labels and RSVP}
|
||||
RSVP daemon supports IPv6 flow labels
|
||||
without any modifications to standard ISI RAPI. Sender must allocate
|
||||
flow label, fill corresponding sender template and submit it to local rsvp
|
||||
daemon. rsvpd will check the label and start to announce it in PATH
|
||||
messages. Rsvpd on sender node will renew the flow label, so that it will not
|
||||
be reused before path state expires and all the intermediate
|
||||
routers and receiver purge flow state.
|
||||
|
||||
\verb|rtap| utility is modified to parse flow labels. F.e.\ if user allocated
|
||||
flow label \verb|0xA1234|, he may write:
|
||||
|
||||
\begin{verbatim}
|
||||
RTAP> sender 3ffe:2400::1/FL0xA1234 <Tspec>
|
||||
\end{verbatim}
|
||||
|
||||
Receiver makes reservation with command:
|
||||
\begin{verbatim}
|
||||
RTAP> reserve ff 3ffe:2400::1/FL0xA1234 <Flowspec>
|
||||
\end{verbatim}
|
||||
|
||||
\end{document}
|
||||
130
doc/arpd.sgml
130
doc/arpd.sgml
@ -1,130 +0,0 @@
|
||||
<!doctype linuxdoc system>
|
||||
|
||||
<article>
|
||||
|
||||
<title>ARPD Daemon
|
||||
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
|
||||
<date>some_negative_number, 20 Sep 2001
|
||||
<abstract>
|
||||
<tt/arpd/ is daemon collecting gratuitous ARP information, saving
|
||||
it on local disk and feeding it to kernel on demand to avoid
|
||||
redundant broadcasting due to limited size of kernel ARP cache.
|
||||
</abstract>
|
||||
|
||||
|
||||
<p><bf/Description/
|
||||
|
||||
<p>The format of the command is:
|
||||
|
||||
<tscreen><verb>
|
||||
arpd OPTIONS [ INTERFACE [ INTERFACE ... ] ]
|
||||
</verb></tscreen>
|
||||
|
||||
<p> <tt/OPTIONS/ are:
|
||||
|
||||
<itemize>
|
||||
|
||||
<item><tt/-l/ - dump <tt/arpd/ database to stdout and exit. Output consists
|
||||
of three columns: interface index, IP address and MAC address.
|
||||
Negative entries for dead hosts are also shown, in this case MAC address
|
||||
is replaced by word <tt/FAILED/ followed by colon and time when the fact
|
||||
that host is dead was proven the last time.
|
||||
|
||||
<item><tt/-f FILE/ - read and load <tt/arpd/ database from <tt/FILE/
|
||||
in text format similar to that dumped by option <tt/-l/. Exit after load,
|
||||
probably listing resulting database, if option <tt/-l/ is also given.
|
||||
If <tt/FILE/ is <tt/-/, <tt/stdin/ is read to get ARP table.
|
||||
|
||||
<item><tt/-b DATABASE/ - location of database file. Default location is
|
||||
<tt>/var/lib/arpd/arpd.db</tt>.
|
||||
|
||||
<item><tt/-a NUMBER/ - <tt/arpd/ not only passively listens ARP on wire, but
|
||||
also sends broadcast queries itself. <tt/NUMBER/ is number of such queries
|
||||
to make before destination is considered as dead. When <tt/arpd/ is started
|
||||
as kernel helper (i.e. with <tt/app_solicit/ enabled in <tt/sysctl/
|
||||
or even with option <tt/-k/) without this option and still did not learn enough
|
||||
information, you can observe 1 second gaps in service. Not fatal, but
|
||||
not good.
|
||||
|
||||
<item><tt/-k/ - suppress sending broadcast queries by kernel. It makes
|
||||
sense together with option <tt/-a/.
|
||||
|
||||
<item><tt/-n TIME/ - timeout of negative cache. When resolution fails <tt/arpd/
|
||||
suppresses further attempts to resolve for this period. It makes sense
|
||||
only together with option <tt/-k/. This timeout should not be too much
|
||||
longer than boot time of a typical host not supporting gratuitous ARP.
|
||||
Default value is 60 seconds.
|
||||
|
||||
<item><tt/-R RATE/ - maximal steady rate of broadcasts sent by <tt/arpd/
|
||||
in packets per second. Default value is 1.
|
||||
|
||||
<item><tt/-B NUMBER/ - number of broadcasts sent by <tt/arpd/ back to back.
|
||||
Default value is 3. Together with option <tt/-R/ this option allows
|
||||
to police broadcasting not to exceed <tt/B+R*T/ over any interval
|
||||
of time <tt/T/.
|
||||
|
||||
</itemize>
|
||||
|
||||
<p><tt/INTERFACE/ is name of networking interface to watch.
|
||||
If no interfaces given, <tt/arpd/ monitors all the interfaces.
|
||||
In this case <tt/arpd/ does not adjust <tt/sysctl/ parameters,
|
||||
it is supposed user does this himself after <tt/arpd/ is started.
|
||||
|
||||
|
||||
<p> Signals
|
||||
|
||||
<p> <tt/arpd/ exits gracefully syncing database and restoring adjusted
|
||||
<tt/sysctl/ parameters, when receives <tt/SIGINT/ or <tt/SIGTERM/.
|
||||
<tt/SIGHUP/ syncs database to disk. <tt/SIGUSR1/ sends some statistics
|
||||
to <tt/syslog/. Effect of other signals is undefined, they may corrupt
|
||||
database and leave <tt/sysctl/ parameters in an unpredictable state.
|
||||
|
||||
<p> Note
|
||||
|
||||
<p> In order for <tt/arpd/ to be able to serve as ARP resolver, kernel must be
|
||||
compiled with the option <tt/CONFIG_ARPD/ and, in the case when interface list
|
||||
is not given on command line, variable <tt/app_solicit/
|
||||
on interfaces of interest should be set in <tt>/proc/sys/net/ipv4/neigh/*</tt>.
|
||||
If this is not made <tt/arpd/ still collects gratuitous ARP information
|
||||
in its database.
|
||||
|
||||
<p> Examples
|
||||
|
||||
<enum>
|
||||
<item> Start <tt/arpd/ to collect gratuitous ARP, but not messing
|
||||
with kernel functionality:
|
||||
|
||||
<tscreen><verb>
|
||||
arpd -b /var/tmp/arpd.db
|
||||
</verb></tscreen>
|
||||
|
||||
<item> Look at result after some time:
|
||||
|
||||
<tscreen><verb>
|
||||
killall arpd
|
||||
arpd -l -b /var/tmp/arpd.db
|
||||
</verb></tscreen>
|
||||
|
||||
<item> To enable kernel helper, leaving leading role to kernel:
|
||||
|
||||
<tscreen><verb>
|
||||
arpd -b /var/tmp/arpd.db -a 1 eth0 eth1
|
||||
</verb></tscreen>
|
||||
|
||||
<item> Completely replace kernel resolution on interfaces <tt/eth0/
|
||||
and <tt/eth1/. In this case kernel still does unicast probing to
|
||||
validate entries, but all the broadcast activity is suppressed
|
||||
and made under authority of <tt/arpd/:
|
||||
|
||||
<tscreen><verb>
|
||||
arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1
|
||||
</verb></tscreen>
|
||||
|
||||
This is mode which <tt/arpd/ is supposed to work normally.
|
||||
It is not default just to prevent occasional enabling of too aggressive
|
||||
mode.
|
||||
|
||||
</enum>
|
||||
|
||||
</article>
|
||||
|
||||
16
doc/do-psnup
16
doc/do-psnup
@ -1,16 +0,0 @@
|
||||
#! /bin/bash
|
||||
# $1 = Temporary file . "string"
|
||||
# $2 = File to process . "string"
|
||||
# $3 = Page size . ie: a4 , letter ... "string"
|
||||
# $4 = Number of pages to fit on a single sheet . "numeric"
|
||||
|
||||
if type psnup >&/dev/null; then
|
||||
echo "psnup -$4 -p$3 $1 $2"
|
||||
psnup -$4 -p$3 $1 $2
|
||||
elif type psmulti >&/dev/null; then
|
||||
echo "psmulti $1 > $2"
|
||||
psmulti $1 > $2
|
||||
else
|
||||
echo "cp $1 $2"
|
||||
cp $1 $2
|
||||
fi
|
||||
3453
doc/ip-cref.tex
3453
doc/ip-cref.tex
File diff suppressed because it is too large
Load Diff
@ -1,469 +0,0 @@
|
||||
\documentstyle[12pt,twoside]{article}
|
||||
\def\TITLE{Tunnels over IP}
|
||||
\input preamble
|
||||
\begin{center}
|
||||
\Large\bf Tunnels over IP in Linux-2.2
|
||||
\end{center}
|
||||
|
||||
|
||||
\begin{center}
|
||||
{ \large Alexey~N.~Kuznetsov } \\
|
||||
\em Institute for Nuclear Research, Moscow \\
|
||||
\verb|kuznet@ms2.inr.ac.ru| \\
|
||||
\rm March 17, 1999
|
||||
\end{center}
|
||||
|
||||
\vspace{5mm}
|
||||
|
||||
\tableofcontents
|
||||
|
||||
|
||||
\section{Instead of introduction: micro-FAQ.}
|
||||
|
||||
\begin{itemize}
|
||||
|
||||
\item
|
||||
Q: In linux-2.0.36 I used:
|
||||
\begin{verbatim}
|
||||
ifconfig tunl1 10.0.0.1 pointopoint 193.233.7.65
|
||||
\end{verbatim}
|
||||
to create tunnel. It does not work in 2.2.0!
|
||||
|
||||
A: You are right, it does not work. The command written above is split into two commands.
|
||||
\begin{verbatim}
|
||||
ip tunnel add MY-TUNNEL mode ipip remote 193.233.7.65
|
||||
\end{verbatim}
|
||||
will create tunnel device with name \verb|MY-TUNNEL|. Now you may configure
|
||||
it with:
|
||||
\begin{verbatim}
|
||||
ifconfig MY-TUNNEL 10.0.0.1
|
||||
\end{verbatim}
|
||||
Certainly, if you prefer name \verb|tunl1| to \verb|MY-TUNNEL|,
|
||||
you still may use it.
|
||||
|
||||
\item
|
||||
Q: In linux-2.0.36 I used:
|
||||
\begin{verbatim}
|
||||
ifconfig tunl0 10.0.0.1
|
||||
route add -net 10.0.0.0 gw 193.233.7.65 dev tunl0
|
||||
\end{verbatim}
|
||||
to tunnel net 10.0.0.0 via router 193.233.7.65. It does not
|
||||
work in 2.2.0! Moreover, \verb|route| prints a funny error sort of
|
||||
``network unreachable'' and after this I found a strange direct route
|
||||
to 10.0.0.0 via \verb|tunl0| in routing table.
|
||||
|
||||
A: Yes, in 2.2 the rule that {\em normal} gateway must reside on directly
|
||||
connected network has not any exceptions. You may tell kernel, that
|
||||
this particular route is {\em abnormal}:
|
||||
\begin{verbatim}
|
||||
ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
|
||||
ip route add 10.0.0.0/8 via 193.233.7.65 dev tunl0 onlink
|
||||
\end{verbatim}
|
||||
Note keyword \verb|onlink|, it is the magic key that orders kernel
|
||||
not to check for consistency of gateway address.
|
||||
Probably, after this explanation you have already guessed another method
|
||||
to cheat kernel:
|
||||
\begin{verbatim}
|
||||
ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
|
||||
route add -host 193.233.7.65 dev tunl0
|
||||
route add -net 10.0.0.0 netmask 255.0.0.0 gw 193.233.7.65
|
||||
route del -host 193.233.7.65 dev tunl0
|
||||
\end{verbatim}
|
||||
Well, if you like such tricks, nobody may prohibit you to use them.
|
||||
Only do not forget
|
||||
that between \verb|route add| and \verb|route del| host 193.233.7.65 is
|
||||
unreachable.
|
||||
|
||||
\item
|
||||
Q: In 2.0.36 I used to load \verb|tunnel| device module and \verb|ipip| module.
|
||||
I cannot find any \verb|tunnel| in 2.2!
|
||||
|
||||
A: Linux-2.2 has single module \verb|ipip| for both directions of tunneling
|
||||
and for all IPIP tunnel devices.
|
||||
|
||||
\item
|
||||
Q: \verb|traceroute| does not work over tunnel! Well, stop... It works,
|
||||
only skips some number of hops.
|
||||
|
||||
A: Yes. By default tunnel driver copies \verb|ttl| value from
|
||||
inner packet to outer one. It means that path traversed by tunneled
|
||||
packets to another endpoint is not hidden. If you dislike this, or if you
|
||||
are going to use some routing protocol expecting that packets
|
||||
with ttl 1 will reach peering host (f.e.\ RIP, OSPF or EBGP)
|
||||
and you are not afraid of
|
||||
tunnel loops, you may append option \verb|ttl 64|, when creating tunnel
|
||||
with \verb|ip tunnel add|.
|
||||
|
||||
\item
|
||||
Q: ... Well, list of things, which 2.0 was able to do finishes.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\paragraph{Summary of differences between 2.2 and 2.0.}
|
||||
|
||||
\begin{itemize}
|
||||
|
||||
\item {\bf In 2.0} you could compile tunnel device into kernel
|
||||
and got set of 4 devices \verb|tunl0| ... \verb|tunl3| or,
|
||||
alternatively, compile it as module and load new module
|
||||
for each new tunnel. Also, module \verb|ipip| was necessary
|
||||
to receive tunneled packets.
|
||||
|
||||
{\bf 2.2} has {\em one\/} module \verb|ipip|. Loading it you get base
|
||||
tunnel device \verb|tunl0| and other tunnels may be created with command
|
||||
\verb|ip tunnel add|. These new devices may have arbitrary names.
|
||||
|
||||
|
||||
\item {\bf In 2.0} you set remote tunnel endpoint address with
|
||||
the command \verb|ifconfig| ... \verb|pointopoint A|.
|
||||
|
||||
{\bf In 2.2} this command has the same semantics on all
|
||||
the interfaces, namely it sets not tunnel endpoint,
|
||||
but address of peering host, which is directly reachable
|
||||
via this tunnel,
|
||||
rather than via Internet. Actual tunnel endpoint address \verb|A|
|
||||
should be set with \verb|ip tunnel add ... remote A|.
|
||||
|
||||
\item {\bf In 2.0} you create tunnel routes with the command:
|
||||
\begin{verbatim}
|
||||
route add -net 10.0.0.0 gw A dev tunl0
|
||||
\end{verbatim}
|
||||
|
||||
{\bf 2.2} interprets this command equally for all device
|
||||
kinds and gateway is required to be directly reachable via this tunnel,
|
||||
rather than via Internet. You still may use \verb|ip route add ... onlink|
|
||||
to override this behaviour.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\section{Tunnel setup: basics}
|
||||
|
||||
Standard Linux-2.2 kernel supports three flavors of tunnels,
|
||||
listed in the following table:
|
||||
\vspace{2mm}
|
||||
|
||||
\begin{tabular}{lll}
|
||||
\vrule depth 0.8ex width 0pt\relax
|
||||
Mode & Description & Base device \\
|
||||
ipip & IP over IP & tunl0 \\
|
||||
sit & IPv6 over IP & sit0 \\
|
||||
gre & ANY over GRE over IP & gre0
|
||||
\end{tabular}
|
||||
|
||||
\vspace{2mm}
|
||||
|
||||
\noindent All the kinds of tunnels are created with one command:
|
||||
\begin{verbatim}
|
||||
ip tunnel add <NAME> mode <MODE> [ local <S> ] [ remote <D> ]
|
||||
\end{verbatim}
|
||||
|
||||
This command creates new tunnel device with name \verb|<NAME>|.
|
||||
The \verb|<NAME>| is an arbitrary string. Particularly,
|
||||
it may be even \verb|eth0|. The rest of parameters set
|
||||
different tunnel characteristics.
|
||||
|
||||
\begin{itemize}
|
||||
|
||||
\item
|
||||
\verb|mode <MODE>| sets tunnel mode. Three modes are available now
|
||||
\verb|ipip|, \verb|sit| and \verb|gre|.
|
||||
|
||||
\item
|
||||
\verb|remote <D>| sets remote endpoint of the tunnel to IP
|
||||
address \verb|<D>|.
|
||||
\item
|
||||
\verb|local <S>| sets fixed local address for tunneled
|
||||
packets. It must be an address on another interface of this host.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\let\thefootnote\oldthefootnote
|
||||
|
||||
Both \verb|remote| and \verb|local| may be omitted. In this case we
|
||||
say that they are zero or wildcard. Two tunnels of one mode cannot
|
||||
have the same \verb|remote| and \verb|local|. Particularly it means
|
||||
that base device or fallback tunnel cannot be replicated.\footnote{
|
||||
This restriction is relaxed for keyed GRE tunnels.}
|
||||
|
||||
Tunnels are divided to two classes: {\bf pointopoint} tunnels, which
|
||||
have some not wildcard \verb|remote| address and deliver all the packets
|
||||
to this destination, and {\bf NBMA} (i.e. Non-Broadcast Multi-Access) tunnels,
|
||||
which have no \verb|remote|. Particularly, base devices (f.e.\ \verb|tunl0|)
|
||||
are NBMA, because they have neither \verb|remote| nor
|
||||
\verb|local| addresses.
|
||||
|
||||
|
||||
After tunnel device is created you should configure it as you did
|
||||
it with other devices. Certainly, the configuration of tunnels has
|
||||
some features related to the fact that they work over existing Internet
|
||||
routing infrastructure and simultaneously create new virtual links,
|
||||
which changes this infrastructure. The danger that not enough careful
|
||||
tunnel setup will result in formation of tunnel loops,
|
||||
collapse of routing or flooding network with exponentially
|
||||
growing number of tunneled fragments is very real.
|
||||
|
||||
|
||||
Protocol setup on pointopoint tunnels does not differ from configuration
|
||||
of other devices. You should set a protocol address with \verb|ifconfig|
|
||||
and add routes with \verb|route| utility.
|
||||
|
||||
NBMA tunnels are different. To route something via NBMA tunnel
|
||||
you have to explain to driver, where it should deliver packets to.
|
||||
The only way to make it is to create special routes with gateway
|
||||
address pointing to desired endpoint. F.e.\
|
||||
\begin{verbatim}
|
||||
ip route add 10.0.0.0/24 via <A> dev tunl0 onlink
|
||||
\end{verbatim}
|
||||
It is important to use option \verb|onlink|, otherwise
|
||||
kernel will refuse request to create route via gateway not directly
|
||||
reachable over device \verb|tunl0|. With IPv6 the situation is much simpler:
|
||||
when you start device \verb|sit0|, it automatically configures itself
|
||||
with all IPv4 addresses mapped to IPv6 space, so that all IPv4
|
||||
Internet is {\em really reachable} via \verb|sit0|! Excellent, the command
|
||||
\begin{verbatim}
|
||||
ip route add 3FFE::/16 via ::193.233.7.65 dev sit0
|
||||
\end{verbatim}
|
||||
will route \verb|3FFE::/16| via \verb|sit0|, sending all the packets
|
||||
destined to this prefix to 193.233.7.65.
|
||||
|
||||
\section{Tunnel setup: options}
|
||||
|
||||
Command \verb|ip tunnel add| has several additional options.
|
||||
\begin{itemize}
|
||||
|
||||
\item \verb|ttl N| --- set fixed TTL \verb|N| on tunneled packets.
|
||||
\verb|N| is number in the range 1--255. 0 is special value,
|
||||
meaning that packets inherit TTL value.
|
||||
Default value is: \verb|inherit|.
|
||||
|
||||
\item \verb|tos T| --- set fixed tos \verb|T| on tunneled packets.
|
||||
Default value is: \verb|inherit|.
|
||||
|
||||
\item \verb|dev DEV| --- bind tunnel to device \verb|DEV|, so that
|
||||
tunneled packets will be routed only via this device and will
|
||||
not be able to escape to another device, when route to endpoint changes.
|
||||
|
||||
\item \verb|nopmtudisc| --- disable Path MTU Discovery on this tunnel.
|
||||
It is enabled by default. Note that fixed ttl is incompatible
|
||||
with this option: tunnels with fixed ttl always make pmtu discovery.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\verb|ipip| and \verb|sit| tunnels have no more options. \verb|gre|
|
||||
tunnels are more complicated:
|
||||
|
||||
\begin{itemize}
|
||||
|
||||
\item \verb|key K| --- use keyed GRE with key \verb|K|. \verb|K| is
|
||||
either number or IP address-like dotted quad.
|
||||
|
||||
\item \verb|csum| --- checksum tunneled packets.
|
||||
|
||||
\item \verb|seq| --- serialize packets.
|
||||
\begin{NB}
|
||||
I think this option does not
|
||||
work. At least, I did not test it, did not debug it and
|
||||
even do not understand, how it is supposed to work and for what
|
||||
purpose Cisco planned to use it.
|
||||
\end{NB}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
|
||||
Actually, these GRE options can be set separately for input and
|
||||
output directions by prefixing corresponding keywords with letter
|
||||
\verb|i| or \verb|o|. F.e.\ \verb|icsum| orders to accept only
|
||||
packets with correct checksum and \verb|ocsum| means, that
|
||||
our host will calculate and send checksum.
|
||||
|
||||
Command \verb|ip tunnel add| is not the only operation,
|
||||
which can be made with tunnels. Certainly, you may get short help page
|
||||
with:
|
||||
\begin{verbatim}
|
||||
ip tunnel help
|
||||
\end{verbatim}
|
||||
|
||||
Besides that, you may view list of installed tunnels with the help of command:
|
||||
\begin{verbatim}
|
||||
ip tunnel ls
|
||||
\end{verbatim}
|
||||
Also you may look at statistics:
|
||||
\begin{verbatim}
|
||||
ip -s tunnel ls Cisco
|
||||
\end{verbatim}
|
||||
where \verb|Cisco| is name of tunnel device. Command
|
||||
\begin{verbatim}
|
||||
ip tunnel del Cisco
|
||||
\end{verbatim}
|
||||
destroys tunnel \verb|Cisco|. And, finally,
|
||||
\begin{verbatim}
|
||||
ip tunnel change Cisco mode sit local ME remote HE ttl 32
|
||||
\end{verbatim}
|
||||
changes its parameters.
|
||||
|
||||
\section{Differences 2.2 and 2.0 tunnels revisited.}
|
||||
|
||||
Now we can discuss more subtle differences between tunneling in 2.0
|
||||
and 2.2.
|
||||
|
||||
\begin{itemize}
|
||||
|
||||
\item In 2.0 all tunneled packets were received promiscuously
|
||||
as soon as you loaded module \verb|ipip|. 2.2 tries to select the best
|
||||
tunnel device and packet looks as received on this. F.e.\ if host
|
||||
received \verb|ipip| packet from host \verb|D| destined to our
|
||||
local address \verb|S|, kernel searches for matching tunnels
|
||||
in order:
|
||||
|
||||
\begin{tabular}{ll}
|
||||
1 & \verb|remote| is \verb|D| and \verb|local| is \verb|S| \\
|
||||
2 & \verb|remote| is \verb|D| and \verb|local| is wildcard \\
|
||||
3 & \verb|remote| is wildcard and \verb|local| is \verb|S| \\
|
||||
4 & \verb|tunl0|
|
||||
\end{tabular}
|
||||
|
||||
If tunnel exists, but it is not in \verb|UP| state, the tunnel is ignored.
|
||||
Note, that if \verb|tunl0| is \verb|UP| it receives all the IPIP packets,
|
||||
not acknowledged by more specific tunnels.
|
||||
Be careful, it means that without carefully installed firewall rules
|
||||
anyone on the Internet may inject to your network any packets with
|
||||
source addresses indistinguishable from local ones. It is not so bad idea
|
||||
to design tunnels in the way enforcing maximal route symmetry
|
||||
and to enable reversed path filter (\verb|rp_filter| sysctl option) on
|
||||
tunnel devices.
|
||||
|
||||
\item In 2.2 you can monitor and debug tunnels with \verb|tcpdump|.
|
||||
F.e.\ \verb|tcpdump| \verb|-i Cisco| \verb|-nvv| will dump packets,
|
||||
which kernel outputs via tunnel \verb|Cisco|, and the packets received on it
|
||||
from kernel viewpoint.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\section{Linux and Cisco IOS tunnels.}
|
||||
|
||||
Among other tunnels, Cisco IOS supports IPIP and GRE.
|
||||
Essentially, Cisco setup is subset of options, available for Linux.
|
||||
Let us consider the simplest example:
|
||||
|
||||
\begin{verbatim}
|
||||
interface Tunnel0
|
||||
tunnel mode gre ip
|
||||
tunnel source 10.10.14.1
|
||||
tunnel destination 10.10.13.2
|
||||
\end{verbatim}
|
||||
|
||||
|
||||
This command set translates to:
|
||||
|
||||
\begin{verbatim}
|
||||
ip tunnel add Tunnel0 \
|
||||
mode gre \
|
||||
local 10.10.14.1 \
|
||||
remote 10.10.13.2
|
||||
\end{verbatim}
|
||||
|
||||
Any questions? No questions.
|
||||
|
||||
\section{Interaction IPIP tunnels and DVMRP.}
|
||||
|
||||
DVMRP exploits IPIP tunnels to route multicasts via Internet.
|
||||
\verb|mrouted| creates
|
||||
IPIP tunnels listed in its configuration file automatically.
|
||||
From kernel and user viewpoints there are no differences between
|
||||
tunnels, created in this way, and tunnels created by \verb|ip tunnel|.
|
||||
I.e.\ if \verb|mrouted| created some tunnel, it may be used to
|
||||
route unicast packets, provided appropriate routes are added.
|
||||
And vice versa, if administrator has already created a tunnel,
|
||||
it will be reused by \verb|mrouted|, if it requests DVMRP
|
||||
tunnel with the same local and remote addresses.
|
||||
|
||||
Do not wonder, if your manually configured tunnel is
|
||||
destroyed, when mrouted exits.
|
||||
|
||||
|
||||
\section{Broadcast GRE ``tunnels''.}
|
||||
|
||||
It is possible to set \verb|remote| for GRE tunnel to a multicast
|
||||
address. Such tunnel becomes {\bf broadcast} tunnel (though word
|
||||
tunnel is not quite appropriate in this case, it is rather virtual network).
|
||||
\begin{verbatim}
|
||||
ip tunnel add Universe local 193.233.7.65 \
|
||||
remote 224.66.66.66 ttl 16
|
||||
ip addr add 10.0.0.1/16 dev Universe
|
||||
ip link set Universe up
|
||||
\end{verbatim}
|
||||
This tunnel is true broadcast network and broadcast packets are
|
||||
sent to multicast group 224.66.66.66. By default such tunnel starts
|
||||
to resolve both IP and IPv6 addresses via ARP/NDISC, so that
|
||||
if multicast routing is supported in surrounding network, all GRE nodes
|
||||
will find one another automatically and will form virtual Ethernet-like
|
||||
broadcast network. If multicast routing does not work, it is unpleasant
|
||||
but not fatal flaw. The tunnel becomes NBMA rather than broadcast network.
|
||||
You may disable dynamic ARPing by:
|
||||
\begin{verbatim}
|
||||
echo 0 > /proc/sys/net/ipv4/neigh/Universe/mcast_solicit
|
||||
\end{verbatim}
|
||||
and add the required information to ARP tables manually:
|
||||
\begin{verbatim}
|
||||
ip neigh add 10.0.0.2 lladdr 128.6.190.2 dev Universe nud permanent
|
||||
\end{verbatim}
|
||||
In this case packets sent to 10.0.0.2 will be encapsulated in GRE
|
||||
and sent to 128.6.190.2. It is possible to facilitate address resolution
|
||||
using methods typical for other NBMA networks, f.e.\ to start user
|
||||
level \verb|arpd| daemon, which will maintain database of hosts attached
|
||||
to GRE virtual network or ask for information
|
||||
from dedicated ARP or NHRP server.
|
||||
|
||||
|
||||
Actually, such setup is the most natural for tunneling,
|
||||
it is really flexible, scalable and easily manageable, so that
|
||||
it is strongly recommended to be used with GRE tunnels instead of ugly
|
||||
hack with NBMA mode and \verb|onlink| modifier. Unfortunately,
|
||||
for historical reasons broadcast mode is not supported by IPIP tunnels,
|
||||
but this probably will change in future.
|
||||
|
||||
|
||||
|
||||
\section{Traffic control issues.}
|
||||
|
||||
Tunnels are devices, hence all the power of Linux traffic control
|
||||
applies to them. The simplest (and the most useful in practice)
|
||||
example is limiting tunnel bandwidth. The following command:
|
||||
\begin{verbatim}
|
||||
tc qdisc add dev tunl0 root tbf \
|
||||
rate 128Kbit burst 4K limit 10K
|
||||
\end{verbatim}
|
||||
will limit tunneled traffic to 128Kbit with maximal burst size of 4K
|
||||
and queuing not more than 10K.
|
||||
|
||||
However, you should remember, that tunnels are {\em virtual} devices
|
||||
implemented in software and true queue management is impossible for them
|
||||
just because they have no queues. Instead, it is better to create classes
|
||||
on real physical interfaces and to map tunneled packets to them.
|
||||
In general case of dynamic routing you should create such classes
|
||||
on all outgoing interfaces, or, alternatively,
|
||||
to use option \verb|dev DEV| to bind tunnel to a fixed physical device.
|
||||
In the last case packets will be routed only via specified device
|
||||
and you need to setup corresponding classes only on it.
|
||||
Though you have to pay for this convenience,
|
||||
if routing will change, your tunnel will fail.
|
||||
|
||||
Suppose that CBQ class \verb|1:ABC| has been created on device \verb|eth0|
|
||||
specially for tunnel \verb|Cisco| with endpoints \verb|S| and \verb|D|.
|
||||
Now you can select IPIP packets with addresses \verb|S| and \verb|D|
|
||||
with some classifier and map them to class \verb|1:ABC|. F.e.\
|
||||
it is easy to make with \verb|rsvp| classifier:
|
||||
\begin{verbatim}
|
||||
tc filter add dev eth0 pref 100 proto ip rsvp \
|
||||
session D ipproto ipip filter S \
|
||||
classid 1:ABC
|
||||
\end{verbatim}
|
||||
|
||||
If you want to make more detailed classification of sub-flows
|
||||
transmitted via tunnel, you can build CBQ subtree,
|
||||
rooted at \verb|1:ABC| and attach to subroot set of rules parsing
|
||||
IPIP packets more deeply.
|
||||
|
||||
\end{document}
|
||||
110
doc/nstat.sgml
110
doc/nstat.sgml
@ -1,110 +0,0 @@
|
||||
<!doctype linuxdoc system>
|
||||
|
||||
<article>
|
||||
|
||||
<title>NSTAT, IFSTAT and RTACCT Utilities
|
||||
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
|
||||
<date>some_negative_number, 20 Sep 2001
|
||||
<abstract>
|
||||
<tt/nstat/, <tt/ifstat/ and <tt/rtacct/ are simple tools helping
|
||||
to monitor kernel snmp counters and network interface statistics.
|
||||
</abstract>
|
||||
|
||||
<p> These utilities are very similar, so that I describe
|
||||
them simultaneously, using name <tt/Xstat/ in the places which apply
|
||||
to all of them.
|
||||
|
||||
<p>The format of the command is:
|
||||
|
||||
<tscreen><verb>
|
||||
Xstat [ OPTIONS ] [ PATTERN [ PATTERN ... ] ]
|
||||
</verb></tscreen>
|
||||
|
||||
<p>
|
||||
<tt/PATTERN/ is shell style pattern, selecting identifier
|
||||
of SNMP variables or interfaces to show. Variable is displayed
|
||||
if one of patterns matches its name. If no patterns are given,
|
||||
<tt/Xstat/ assumes that user wants to see all the variables.
|
||||
|
||||
<p> <tt/OPTIONS/ is list of single letter options, using common unix
|
||||
conventions.
|
||||
|
||||
<itemize>
|
||||
<item><tt/-h/ - show help page
|
||||
<item><tt/-?/ - the same, of course
|
||||
<item><tt/-v/, <tt/-V/ - print version of <tt/Xstat/ and exit
|
||||
<item><tt/-z/ - dump zero counters too. By default they are not shown.
|
||||
<item><tt/-a/ - dump absolute values of counters. By default <tt/Xstat/
|
||||
calculates increments since the previous use.
|
||||
<item><tt/-s/ - do not update history, so that the next time you will
|
||||
see counters including values accumulated to the moment
|
||||
of this measurement too.
|
||||
<item><tt/-n/ - do not display anything, only update history.
|
||||
<item><tt/-r/ - reset history.
|
||||
<item><tt/-d INTERVAL/ - <tt/Xstat/ is run in daemon mode collecting
|
||||
statistics. <tt/INTERVAL/ is interval between measurements
|
||||
in seconds.
|
||||
<item><tt/-t INTERVAL/ - time interval to average rates. Default value
|
||||
is 60 seconds.
|
||||
<item><tt/-e/ - display extended information about errors (<tt/ifstat/ only).
|
||||
</itemize>
|
||||
|
||||
<p>
|
||||
History is just dump saved in file <tt>/tmp/.Xstat.uUID</tt>
|
||||
or in file given by environment variables <tt/NSTAT_HISTORY/,
|
||||
<tt/IFSTAT_HISTORY/ and <tt/RTACCT_HISTORY/.
|
||||
Each time you use <tt/Xstat/, the values there are updated.
|
||||
If you use patterns, only the values which you _really_ see
|
||||
are updated. If you want to skip an uninteresting period,
|
||||
use option <tt/-n/, or just output to <tt>/dev/null</tt>.
|
||||
|
||||
<p>
|
||||
<tt/Xstat/ understands when history is invalidated by system reboot
|
||||
or source of information switched between different instances
|
||||
of daemonic <tt/Xstat/ and kernel SNMP tables and does not
|
||||
use invalid history.
|
||||
|
||||
<p> Beware, <tt/Xstat/ will not produce sane output,
|
||||
when many processes use it simultaneously. If several processes
|
||||
under single user need this utility they should use environment
|
||||
variables to put their history in safe places
|
||||
or to use it with options <tt/-a -s/.
|
||||
|
||||
<p>
|
||||
Well, that's all. The utility is very simple, but nevertheless
|
||||
very handy.
|
||||
|
||||
<p> <bf/Output of XSTAT/
|
||||
<p> The first line of output is <tt/#/ followed by identifier
|
||||
of source of information, it may be word <tt/kernel/, when <tt/Xstat/
|
||||
gets information from kernel or some dotted decimal number followed
|
||||
by parameters, when it obtains information from running <tt/Xstat/ daemon.
|
||||
|
||||
<p>In the case of <tt/nstat/ the rest of output consists of three columns:
|
||||
SNMP MIB identifier,
|
||||
its value (or increment since previous measurement) and average
|
||||
rate of increase of the counter per second. <tt/ifstat/ outputs
|
||||
interface name followed by pairs of counter and rate of its change.
|
||||
|
||||
<p> <bf/Daemonic Xstat/
|
||||
<p> <tt/Xstat/ may be started as daemon by any user. This makes sense
|
||||
to avoid wrapped counters and to obtain reasonable long counters
|
||||
for large time. Also <tt/Xstat/ daemon calculates average rates.
|
||||
For the first goal sampling interval (option <tt/-d/) may be large enough,
|
||||
f.e. for gigabit rates byte counters overflow not more frequently than
|
||||
each 40 seconds and you may select interval of 20 seconds.
|
||||
On the other hand, when <tt/Xstat/ is used for estimating rates
|
||||
interval should be less than averaging period (option <tt/-t/), otherwise
|
||||
estimation loses in quality.
|
||||
|
||||
Client <tt/Xstat/, before trying to get information from the kernel,
|
||||
contacts daemon started by this user, then it tries system wide
|
||||
daemon, which is supposed to be started by superuser. And only if
|
||||
none of them replied it gets information from kernel.
|
||||
|
||||
<p> <bf/Environment/
|
||||
<p> <tt/NSTAT_HISTORY/ - name of history file for <tt/nstat/.
|
||||
<p> <tt/IFSTAT_HISTORY/ - name of history file for <tt/ifstat/.
|
||||
<p> <tt/RTACCT_HISTORY/ - name of history file for <tt/rtacct/.
|
||||
|
||||
</article>
|
||||
@ -1,26 +0,0 @@
|
||||
\textwidth 6.0in
|
||||
\textheight 8.5in
|
||||
|
||||
\input SNAPSHOT
|
||||
|
||||
\pagestyle{myheadings}
|
||||
\markboth{\protect\TITLE}{}
|
||||
\markright{{\protect\sc iproute2-ss\Draft}}
|
||||
|
||||
% To print it in compact form: both sides on one sheet (psnup -2)
|
||||
\evensidemargin=\oddsidemargin
|
||||
|
||||
\newenvironment{NB}{\bgroup \vskip 1mm\leftskip 1cm \footnotesize \noindent NB.
|
||||
}{\par\egroup \vskip 1mm}
|
||||
|
||||
\def\threeonly{[2.3.15+ only] }
|
||||
|
||||
\begin{document}
|
||||
|
||||
\makeatletter
|
||||
\renewcommand{\@oddhead}{{\protect\sc iproute2-ss\Draft} \hfill \protect\arabic{page}}
|
||||
\makeatother
|
||||
\let\oldthefootnote\thefootnote
|
||||
\def\thefootnote{}
|
||||
\footnotetext{Copyright \copyright~1999 A.N.Kuznetsov}
|
||||
|
||||
@ -1,52 +0,0 @@
|
||||
<!doctype linuxdoc system>
|
||||
|
||||
<article>
|
||||
|
||||
<title>RTACCT Utility
|
||||
<author>Robert Olsson
|
||||
<date>some_negative_number, 20 Dec 2001
|
||||
|
||||
<p>
|
||||
Here is some code for monitoring the route cache. For systems handling high
|
||||
network load, servers, routers, firewalls etc the route cache and its garbage
|
||||
collection is crucial. Linux has a solid implementation.
|
||||
|
||||
<p>
|
||||
The kernel patch (not required since linux-2.4.7) adds statistics counters
|
||||
from route cache process into
|
||||
/proc/net/rt_cache_stat. A companion user mode program presents the statistics
|
||||
in a vmstat or iostat manner. The ratio between cache hits and misses gives
|
||||
the flow length.
|
||||
|
||||
<p>
|
||||
Hopefully it can help understanding performance and DoS and other related
|
||||
issues.
|
||||
|
||||
<p> A URL where newer versions of this utility can be (probably) found
|
||||
is ftp://robur.slu.se/pub/Linux/net-development/rt_cache_stat/
|
||||
|
||||
|
||||
<p><bf/Description/
|
||||
|
||||
<p>The format of the command is:
|
||||
|
||||
<tscreen><verb>
|
||||
rtstat [ OPTIONS ]
|
||||
</verb></tscreen>
|
||||
|
||||
<p> <tt/OPTIONS/ are:
|
||||
|
||||
<itemize>
|
||||
|
||||
<item><tt/-h/, <tt/-help/ - show help page and version of the utility.
|
||||
|
||||
<item><tt/-i INTERVAL/ - interval between snapshots, default value is
|
||||
2 seconds.
|
||||
|
||||
<item><tt/-s NUMBER/ - whether to print header line. 0 inhibits header line,
|
||||
1 prescribes to print it once and 2 (this is default setting) forces header
|
||||
line each 20 lines.
|
||||
|
||||
</itemize>
|
||||
|
||||
</article>
|
||||
525
doc/ss.sgml
525
doc/ss.sgml
@ -1,525 +0,0 @@
|
||||
<!doctype linuxdoc system>
|
||||
|
||||
<article>
|
||||
|
||||
<title>SS Utility: Quick Intro
|
||||
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
|
||||
<date>some_negative_number, 20 Sep 2001
|
||||
<abstract>
|
||||
<tt/ss/ is yet another utility to investigate sockets.
|
||||
Functionally it is NOT better than <tt/netstat/ combined
|
||||
with some perl/awk scripts and though it is surely faster
|
||||
it is not enough to make it much better. :-)
|
||||
So, stop reading this now and do not waste your time.
|
||||
Well, certainly, it proposes some functionality, which current
|
||||
netstat is still not able to do, but surely will soon.
|
||||
</abstract>
|
||||
|
||||
<sect>Why?
|
||||
|
||||
<p> <tt>/proc</tt> interface is inadequate, unfortunately.
|
||||
When the number of sockets is large enough, <tt/netstat/ or even
|
||||
plain <tt>cat /proc/net/tcp/</tt> cause nothing but pains and curses.
|
||||
In linux-2.4 the disease became worse: even if amount
|
||||
of sockets is small reading <tt>/proc/net/tcp/</tt> is slow enough.
|
||||
|
||||
This utility presents a new approach, which is supposed to scale
|
||||
well. I am not going to describe technical details here and
|
||||
will concentrate on description of the command.
|
||||
The only important thing to say is that it is not so bad idea
|
||||
to load module <tt/tcp_diag/, which can be found in directory
|
||||
<tt/Modules/ of <tt/iproute2/. If you do not do this, <tt/ss/
|
||||
will work, but it falls back to <tt>/proc</tt> and becomes slow
|
||||
like <tt/netstat/, well, a bit faster yet (see section "Some numbers").
|
||||
|
||||
<sect>Old news
|
||||
|
||||
<p>
|
||||
In the simplest form <tt/ss/ is equivalent to netstat
|
||||
with some small deviations.
|
||||
|
||||
<itemize>
|
||||
<item><tt/ss -t -a/ dumps all TCP sockets
|
||||
<item><tt/ss -u -a/ dumps all UDP sockets
|
||||
<item><tt/ss -w -a/ dumps all RAW sockets
|
||||
<item><tt/ss -x -a/ dumps all UNIX sockets
|
||||
</itemize>
|
||||
|
||||
<p>
|
||||
Option <tt/-o/ shows TCP timers state.
|
||||
Option <tt/-e/ shows some extended information.
|
||||
Etc. etc. etc. Seems, all the options of netstat related to sockets
|
||||
are supported. Though not AX.25 and other bizarres. :-)
|
||||
If someone wants, he can make support for decnet and ipx.
|
||||
Some rudimentary support for them is already present in iproute2 libutils,
|
||||
and I will be glad to see these new members.
|
||||
|
||||
<p>
|
||||
However, standard functionality is a bit different:
|
||||
|
||||
<p>
|
||||
The first: without option <tt/-a/ sockets in states
|
||||
<tt/TIME-WAIT/ and <tt/SYN-RECV/ are skipped too.
|
||||
It is more reasonable default, I think.
|
||||
|
||||
<p>
|
||||
The second: format of UNIX sockets is different. It coincides
|
||||
with tcp/udp. Though standard kernel still does not allow to
|
||||
see write/read queues and peer address of connected UNIX sockets,
|
||||
the patch doing this exists.
|
||||
|
||||
<p>
|
||||
The third: default is to dump only TCP sockets, rather than all of the types.
|
||||
|
||||
<p>
|
||||
The next: by default it does not resolve numeric host addresses (like <tt/ip/)!
|
||||
Resolving is enabled with option <tt/-r/. Service names, usually stored
|
||||
in local files, are resolved by default. Also, if service database
|
||||
does not contain references to a port, <tt/ss/ queries system
|
||||
<tt/rpcbind/. RPC services are prefixed with <tt/rpc./
|
||||
Resolution of services may be suppressed with option <tt/-n/.
|
||||
|
||||
<p>
|
||||
It does not accept "long" options (I dislike them, sorry).
|
||||
So, address family is given with family identifier following
|
||||
option <tt/-f/ to be aligned to iproute2 conventions.
|
||||
Mostly, it is to allow option parser to parse
|
||||
addresses correctly, but as side effect it really limits dumping
|
||||
to sockets supporting only given family. Option <tt/-A/ followed
|
||||
by list of socket tables to dump is also supported.
|
||||
Logically, id of socket table is different from _address_ family, which is
|
||||
another point of incompatibility. So, id is one of
|
||||
<tt/all/, <tt/tcp/, <tt/udp/,
|
||||
<tt/raw/, <tt/inet/, <tt/unix/, <tt/packet/, <tt/netlink/. See?
|
||||
Well, <tt/inet/ is just abbreviation for <tt/tcp|udp|raw/
|
||||
and it is not difficult to guess that <tt/packet/ allows
|
||||
to look at packet sockets. Actually, there are also some other abbreviations,
|
||||
f.e. <tt/unix_dgram/ selects only datagram UNIX sockets.
|
||||
|
||||
<p>
|
||||
The next: well, I still do not know. :-)
|
||||
|
||||
|
||||
|
||||
|
||||
<sect>Time to talk about new functionality.
|
||||
|
||||
<p>It is builtin filtering of socket lists.
|
||||
|
||||
<sect1> Filtering by state.
|
||||
|
||||
<p>
|
||||
<tt/ss/ allows to filter socket states, using keywords
|
||||
<tt/state/ and <tt/exclude/, followed by some state
|
||||
identifier.
|
||||
|
||||
<p>
|
||||
State identifiers are standard TCP state names (not listed,
|
||||
they are useless for you if you already do not know them)
|
||||
or abbreviations:
|
||||
|
||||
<itemize>
|
||||
<item><tt/all/ - for all the states
|
||||
<item><tt/bucket/ - for TCP minisockets (<tt/TIME-WAIT|SYN-RECV/)
|
||||
<item><tt/big/ - all except for minisockets
|
||||
<item><tt/connected/ - not closed and not listening
|
||||
<item><tt/synchronized/ - connected and not <tt/SYN-SENT/
|
||||
</itemize>
|
||||
|
||||
<p>
|
||||
F.e. to dump all tcp sockets except <tt/SYN-RECV/:
|
||||
|
||||
<tscreen><verb>
|
||||
ss exclude SYN-RECV
|
||||
</verb></tscreen>
|
||||
|
||||
<p>
|
||||
If neither <tt/state/ nor <tt/exclude/ directives
|
||||
are present,
|
||||
state filter defaults to <tt/all/ with option <tt/-a/
|
||||
or to <tt/all/,
|
||||
excluding listening, syn-recv, time-wait and closed sockets.
|
||||
|
||||
<sect1> Filtering by addresses and ports.
|
||||
|
||||
<p>
|
||||
Option list may contain address/port filter.
|
||||
It is boolean expression which consists of boolean operation
|
||||
<tt/or/, <tt/and/, <tt/not/ and predicates.
|
||||
Actually, all the flavors of names for boolean operations are eaten:
|
||||
<tt/&/, <tt/&&/, <tt/|/, <tt/||/, <tt/!/, but do not forget
|
||||
about special sense given to these symbols by unix shells and escape
|
||||
them correctly, when used from command line.
|
||||
|
||||
<p>
|
||||
Predicates may be of the following kinds:
|
||||
|
||||
<itemize>
|
||||
<item>A. Address/port match, where address is checked against mask
|
||||
and port is either wildcard or exact. It is one of:
|
||||
|
||||
<tscreen><verb>
|
||||
dst prefix:port
|
||||
src prefix:port
|
||||
src unix:STRING
|
||||
src link:protocol:ifindex
|
||||
src nl:channel:pid
|
||||
</verb></tscreen>
|
||||
|
||||
Both prefix and port may be absent or replaced with <tt/*/,
|
||||
which means wildcard. UNIX sockets use a more powerful scheme
|
||||
matching to socket names by shell wildcards. Also, prefixes
|
||||
unix: and link: may be omitted, if address family is evident
|
||||
from context (with option <tt/-x/ or with <tt/-f unix/
|
||||
or with <tt/unix/ keyword)
|
||||
|
||||
<p>
|
||||
F.e.
|
||||
|
||||
<tscreen><verb>
|
||||
dst 10.0.0.1
|
||||
dst 10.0.0.1:
|
||||
dst 10.0.0.1/32:
|
||||
dst 10.0.0.1:*
|
||||
</verb></tscreen>
|
||||
are equivalent and mean socket connected to
|
||||
any port on host 10.0.0.1
|
||||
|
||||
<tscreen><verb>
|
||||
dst 10.0.0.0/24:22
|
||||
</verb></tscreen>
|
||||
sockets connected to port 22 on network
|
||||
10.0.0.0...255.
|
||||
|
||||
<p>
|
||||
Note that port is separated from address with a colon, which creates
|
||||
troubles with IPv6 addresses. Generally, we interpret the last
|
||||
colon as splitting port. To allow to give IPv6 addresses,
|
||||
trick like used in IPv6 HTTP URLs may be used:
|
||||
|
||||
<tscreen><verb>
|
||||
dst [::1]
|
||||
</verb></tscreen>
|
||||
are sockets connected to ::1 on any port
|
||||
|
||||
<p>
|
||||
Another way is <tt/dst ::1/128/. / helps to understand that
|
||||
colon is part of IPv6 address.
|
||||
|
||||
<p>
|
||||
Now we can add another alias for <tt/dst 10.0.0.1/:
|
||||
<tt/dst [10.0.0.1]/. :-)
|
||||
|
||||
<p> Address may be a DNS name. In this case all the addresses are looked
|
||||
up (in all the address families, if it is not limited by option <tt/-f/
|
||||
or special address prefix <tt/inet:/, <tt/inet6:/) and resulting
|
||||
expression is <tt/or/ over all of them.
|
||||
|
||||
<item> B. Port expressions:
|
||||
<tscreen><verb>
|
||||
dport >= :1024
|
||||
dport != :22
|
||||
sport < :32000
|
||||
</verb></tscreen>
|
||||
etc.
|
||||
|
||||
All the relations: <tt/</, <tt/>/, <tt/=/, <tt/>=/, <tt/<=/, <tt/==/,
|
||||
<tt/!=/, <tt/eq/, <tt/ge/, <tt/lt/, <tt/ne/...
|
||||
Use the variant which you like more, but do not forget to escape special
|
||||
characters when typing them in command line. :-)
|
||||
|
||||
Note that port number syntactically coincides to the case A!
|
||||
You may even add an IP address, but it will not participate
|
||||
in comparison, except for <tt/==/ and <tt/!=/, which are equivalent
|
||||
to corresponding predicates of type A. F.e.
|
||||
<p>
|
||||
<tt/dst 10.0.0.1:22/
|
||||
is equivalent to <tt/dport eq 10.0.0.1:22/
|
||||
and
|
||||
<tt/not dst 10.0.0.1:22/ is equivalent to
|
||||
<tt/dport neq 10.0.0.1:22/
|
||||
|
||||
<item>C. Keyword <tt/autobound/. It matches to sockets bound automatically
|
||||
on local system.
|
||||
|
||||
</itemize>
|
||||
|
||||
|
||||
<sect> Examples
|
||||
|
||||
<p>
|
||||
<itemize>
|
||||
<item>1. List all the tcp sockets in state <tt/FIN-WAIT-1/ for our apache
|
||||
to network 193.233.7/24 and look at their timers:
|
||||
|
||||
<tscreen><verb>
|
||||
ss -o state fin-wait-1 \( sport = :http or sport = :https \) \
|
||||
dst 193.233.7/24
|
||||
</verb></tscreen>
|
||||
|
||||
Oops, forgot to say that missing logical operation is
|
||||
equivalent to <tt/and/.
|
||||
|
||||
<item> 2. Well, now look at the rest...
|
||||
|
||||
<tscreen><verb>
|
||||
ss -o excl fin-wait-1
|
||||
ss state fin-wait-1 \( sport neq :http and sport neq :https \) \
|
||||
or not dst 193.233.7/24
|
||||
</verb></tscreen>
|
||||
|
||||
Note that we have to do _two_ calls of ss to do this.
|
||||
State match is always anded to address/port match.
|
||||
The reason for this is purely technical: ss does fast skip of
|
||||
not matching states before parsing addresses and I consider the
|
||||
ability to quickly skip gobs of time-wait and syn-recv sockets
|
||||
as more important than logical generality.
|
||||
|
||||
<item> 3. So, let's look at all our sockets using autobound ports:
|
||||
|
||||
<tscreen><verb>
|
||||
ss -a -A all autobound
|
||||
</verb></tscreen>
|
||||
|
||||
|
||||
<item> 4. And eventually find all the local processes connected
|
||||
to local X servers:
|
||||
|
||||
<tscreen><verb>
|
||||
ss -xp dst "/tmp/.X11-unix/*"
|
||||
</verb></tscreen>
|
||||
|
||||
Pardon, this does not work with current kernel, patching is required.
|
||||
But we still can look at server side:
|
||||
|
||||
<tscreen><verb>
|
||||
ss -x src "/tmp/.X11-unix/*"
|
||||
</verb></tscreen>
|
||||
|
||||
</itemize>
|
||||
|
||||
|
||||
<sect> Returning to ground: real manual
|
||||
|
||||
<p>
|
||||
<sect1> Command arguments
|
||||
|
||||
<p> General format of arguments to <tt/ss/ is:
|
||||
|
||||
<tscreen><verb>
|
||||
ss [ OPTIONS ] [ STATE-FILTER ] [ ADDRESS-FILTER ]
|
||||
</verb></tscreen>
|
||||
|
||||
<sect2><tt/OPTIONS/
|
||||
<p> <tt/OPTIONS/ is list of single letter options, using common unix
|
||||
conventions.
|
||||
|
||||
<itemize>
|
||||
<item><tt/-h/ - show help page
|
||||
<item><tt/-?/ - the same, of course
|
||||
<item><tt/-v/, <tt/-V/ - print version of <tt/ss/ and exit
|
||||
<item><tt/-s/ - print summary statistics. This option does not parse
|
||||
socket lists obtaining summary from various sources. It is useful
|
||||
when amount of sockets is so huge that parsing <tt>/proc/net/tcp</tt>
|
||||
is painful.
|
||||
<item><tt/-D FILE/ - do not display anything, just dump raw information
|
||||
about TCP sockets to <tt/FILE/ after applying filters. If <tt/FILE/ is <tt/-/
|
||||
<tt/stdout/ is used.
|
||||
<item><tt/-F FILE/ - read continuation of filter from <tt/FILE/.
|
||||
Each line of <tt/FILE/ is interpreted like single command line option.
|
||||
If <tt/FILE/ is <tt/-/ <tt/stdin/ is used.
|
||||
<item><tt/-r/ - try to resolve numeric address/ports
|
||||
<item><tt/-n/ - do not try to resolve ports
|
||||
<item><tt/-o/ - show some optional information, f.e. TCP timers
|
||||
<item><tt/-i/ - show some information specific to TCP (RTO, congestion
|
||||
window, slow start threshold etc.)
|
||||
<item><tt/-e/ - show even more optional information
|
||||
<item><tt/-m/ - show extended information on memory used by the socket.
|
||||
It is available only with <tt/tcp_diag/ enabled.
|
||||
<item><tt/-p/ - show list of processes owning the socket
|
||||
<item><tt/-f FAMILY/ - default address family used for parsing addresses.
|
||||
Also this option limits listing to sockets supporting
|
||||
given address family. Currently the following families
|
||||
are supported: <tt/unix/, <tt/inet/, <tt/inet6/, <tt/link/,
|
||||
<tt/netlink/.
|
||||
<item><tt/-4/ - alias for <tt/-f inet/
|
||||
<item><tt/-6/ - alias for <tt/-f inet6/
|
||||
<item><tt/-0/ - alias for <tt/-f link/
|
||||
<item><tt/-A LIST-OF-TABLES/ - list of socket tables to dump, separated
|
||||
by commas. The following identifiers are understood:
|
||||
<tt/all/, <tt/inet/, <tt/tcp/, <tt/udp/, <tt/raw/,
|
||||
<tt/unix/, <tt/packet/, <tt/netlink/, <tt/unix_dgram/,
|
||||
<tt/unix_stream/, <tt/packet_raw/, <tt/packet_dgram/.
|
||||
<item><tt/-x/ - alias for <tt/-A unix/
|
||||
<item><tt/-t/ - alias for <tt/-A tcp/
|
||||
<item><tt/-u/ - alias for <tt/-A udp/
|
||||
<item><tt/-w/ - alias for <tt/-A raw/
|
||||
<item><tt/-a/ - show sockets of all the states. By default sockets
|
||||
in states <tt/LISTEN/, <tt/TIME-WAIT/, <tt/SYN-RECV/
|
||||
and <tt/CLOSE/ are skipped.
|
||||
<item><tt/-l/ - show only sockets in state <tt/LISTEN/
|
||||
</itemize>
|
||||
|
||||
<sect2><tt/STATE-FILTER/
|
||||
|
||||
<p><tt/STATE-FILTER/ allows to construct arbitrary set of
|
||||
states to match. Its syntax is sequence of keywords <tt/state/
|
||||
and <tt/exclude/ followed by identifier of state.
|
||||
Available identifiers are:
|
||||
|
||||
<p>
|
||||
<itemize>
|
||||
<item> All standard TCP states: <tt/established/, <tt/syn-sent/,
|
||||
<tt/syn-recv/, <tt/fin-wait-1/, <tt/fin-wait-2/, <tt/time-wait/,
|
||||
<tt/closed/, <tt/close-wait/, <tt/last-ack/, <tt/listen/ and <tt/closing/.
|
||||
|
||||
<item><tt/all/ - for all the states
|
||||
<item><tt/connected/ - all the states except for <tt/listen/ and <tt/closed/
|
||||
<item><tt/synchronized/ - all the <tt/connected/ states except for
|
||||
<tt/syn-sent/
|
||||
<item><tt/bucket/ - states, which are maintained as minisockets, i.e.
|
||||
<tt/time-wait/ and <tt/syn-recv/.
|
||||
<item><tt/big/ - opposite to <tt/bucket/
|
||||
</itemize>
|
||||
|
||||
<sect2><tt/ADDRESS_FILTER/
|
||||
|
||||
<p><tt/ADDRESS_FILTER/ is boolean expression with operations <tt/and/, <tt/or/
|
||||
and <tt/not/, which can be abbreviated in C style f.e. as <tt/&/,
|
||||
<tt/&&/.
|
||||
|
||||
<p>
|
||||
Predicates check socket addresses, both local and remote.
|
||||
There are the following kinds of predicates:
|
||||
|
||||
<itemize>
|
||||
<item> <tt/dst ADDRESS_PATTERN/ - matches remote address and port
|
||||
<item> <tt/src ADDRESS_PATTERN/ - matches local address and port
|
||||
<item> <tt/dport RELOP PORT/ - compares remote port to a number
|
||||
<item> <tt/sport RELOP PORT/ - compares local port to a number
|
||||
<item> <tt/autobound/ - checks that socket is bound to an ephemeral
|
||||
port
|
||||
</itemize>
|
||||
|
||||
<p><tt/RELOP/ is some of <tt/<=/, <tt/>=/, <tt/==/ etc.
|
||||
To make this more convenient for use in unix shell, alphabetic
|
||||
FORTRAN-like notations <tt/le/, <tt/gt/ etc. are accepted as well.
|
||||
|
||||
<p>The format and semantics of <tt/ADDRESS_PATTERN/ depends on address
|
||||
family.
|
||||
|
||||
<itemize>
|
||||
<item><tt/inet/ - <tt/ADDRESS_PATTERN/ consists of IP prefix, optionally
|
||||
followed by colon and port. If prefix or port part is absent or replaced
|
||||
with <tt/*/, this means wildcard match.
|
||||
<item><tt/inet6/ - The same as <tt/inet/, only prefix refers to an IPv6
|
||||
address. Unlike <tt/inet/ colon becomes ambiguous, so that <tt/ss/ allows
|
||||
to use scheme, like used in URLs, where address is surrounded with
|
||||
<tt/[/ ... <tt/]/.
|
||||
<item><tt/unix/ - <tt/ADDRESS_PATTERN/ is shell-style wildcard.
|
||||
<item><tt/packet/ - format looks like <tt/inet/, only interface index
|
||||
stays instead of port and link layer protocol id instead of address.
|
||||
<item><tt/netlink/ - format looks like <tt/inet/, only socket pid
|
||||
stays instead of port and netlink channel instead of address.
|
||||
</itemize>
|
||||
|
||||
<p><tt/PORT/ is syntactically <tt/ADDRESS_PATTERN/ with wildcard
|
||||
address part. Certainly, it is undefined for UNIX sockets.
|
||||
|
||||
<sect1> Environment variables
|
||||
|
||||
<p>
|
||||
<tt/ss/ allows to change source of information using various
|
||||
environment variables:
|
||||
|
||||
<p>
|
||||
<itemize>
|
||||
<item> <tt/PROC_SLABINFO/ to override <tt>/proc/slabinfo</tt>
|
||||
<item> <tt/PROC_NET_TCP/ to override <tt>/proc/net/tcp</tt>
|
||||
<item> <tt/PROC_NET_UDP/ to override <tt>/proc/net/udp</tt>
|
||||
<item> etc.
|
||||
</itemize>
|
||||
|
||||
<p>
|
||||
Variable <tt/PROC_ROOT/ allows to change root of all the <tt>/proc/</tt>
|
||||
hierarchy.
|
||||
|
||||
<p>
|
||||
Variable <tt/TCPDIAG_FILE/ prescribes to open a file instead of
|
||||
requesting kernel to dump information about TCP sockets.
|
||||
|
||||
|
||||
<p> This option is used mainly to investigate bug reports,
|
||||
when dumps of files usually found in <tt>/proc/</tt> are received
|
||||
by e-mail.
|
||||
|
||||
<sect1> Output format
|
||||
|
||||
<p>Six columns. The first is <tt/Netid/, it denotes socket type and
|
||||
transport protocol, when it is ambiguous: <tt/tcp/, <tt/udp/, <tt/raw/,
|
||||
<tt/u_str/ is abbreviation for <tt/unix_stream/, <tt/u_dgr/ for UNIX
|
||||
datagram sockets, <tt/nl/ for netlink, <tt/p_raw/ and <tt/p_dgr/ for
|
||||
raw and datagram packet sockets. This column is optional, it will
|
||||
be hidden, if filter selects a unique netid.
|
||||
|
||||
<p>
|
||||
The second column is <tt/State/. Socket state is displayed here.
|
||||
The names are standard TCP names, except for <tt/UNCONN/, which
|
||||
cannot happen for TCP, but normal for not connected sockets
|
||||
of other types. Again, this column can be hidden.
|
||||
|
||||
<p>
|
||||
Then two columns (<tt/Recv-Q/ and <tt/Send-Q/) showing amount of data
|
||||
queued for receive and transmit.
|
||||
|
||||
<p>
|
||||
And the last two columns display local address and port of the socket
|
||||
and its peer address, if the socket is connected.
|
||||
|
||||
<p>
|
||||
If options <tt/-o/, <tt/-e/ or <tt/-p/ were given, options are
|
||||
displayed not in fixed positions but as space-separated pairs:
|
||||
<tt/option:value/. If value is not a single number, it is presented
|
||||
as list of values, enclosed in <tt/(/ ... <tt/)/ and separated with
|
||||
commas. F.e.
|
||||
|
||||
<tscreen><verb>
|
||||
timer:(keepalive,111min,0)
|
||||
</verb></tscreen>
|
||||
is typical format for TCP timer (option <tt/-o/).
|
||||
|
||||
<tscreen><verb>
|
||||
users:((X,113,3))
|
||||
</verb></tscreen>
|
||||
is typical for list of users (option <tt/-p/).
|
||||
|
||||
|
||||
<sect>Some numbers
|
||||
|
||||
<p>
|
||||
Well, let us use <tt/pidentd/ and a tool <tt/ibench/ to measure
|
||||
its performance. It is 30 requests per second here. Nothing to test,
|
||||
it is too slow. OK, let us patch pidentd with patch from directory
|
||||
Patches. After this it handles about 4300 requests per second
|
||||
and becomes handy tool to pollute socket tables with lots of timewait
|
||||
buckets.
|
||||
|
||||
<p>
|
||||
So, each test starts by polluting the tables with 30000 sockets
|
||||
and then doing full dump of the table piped to wc and measuring
|
||||
timings with time:
|
||||
|
||||
<p>Results:
|
||||
|
||||
<itemize>
|
||||
<item> <tt/netstat -at/ - 15.6 seconds
|
||||
<item> <tt/ss -atr/, but without <tt/tcp_diag/ - 5.4 seconds
|
||||
<item> <tt/ss -atr/ with <tt/tcp_diag/ - 0.47 seconds
|
||||
</itemize>
|
||||
|
||||
No comments. Though one comment is necessary, most of time
|
||||
without <tt/tcp_diag/ is wasted inside kernel with completely
|
||||
blocked networking. More than 10 seconds, yes. <tt/tcp_diag/
|
||||
does the same work for 100 milliseconds of system time.
|
||||
|
||||
</article>
|
||||
@ -1,514 +0,0 @@
|
||||
\documentclass[12pt,twoside]{article}
|
||||
|
||||
\usepackage[hidelinks]{hyperref} % \url
|
||||
\usepackage{booktabs} % nicer tabulars
|
||||
\usepackage{fancyvrb}
|
||||
\usepackage{fullpage}
|
||||
\usepackage{float}
|
||||
|
||||
\newcommand{\iface}{\textit}
|
||||
\newcommand{\cmd}{\texttt}
|
||||
\newcommand{\man}{\textit}
|
||||
\newcommand{\qdisc}{\texttt}
|
||||
\newcommand{\filter}{\texttt}
|
||||
|
||||
\begin{document}
|
||||
\title{QoS in Linux with TC and Filters}
|
||||
\author{Phil Sutter (phil@nwl.cc)}
|
||||
\date{January 2016}
|
||||
\maketitle
|
||||
|
||||
Standard practice when transmitting packets over a medium which may block (due
|
||||
to congestion, e.g.) is to use a queue which temporarily holds these packets. In
|
||||
Linux, this queueing approach is where QoS happens: A Queueing Discipline
|
||||
(qdisc) holds multiple packet queues with different priorities for dequeueing to
|
||||
the network driver. The classification (i.e. deciding which queue a packet
|
||||
should go into) is typically done based on Type Of Service (IPv4) or Traffic
|
||||
Class (IPv6) header fields but depending on qdisc implementation, might be
|
||||
controlled by the user as well.
|
||||
|
||||
Qdiscs come in two flavors, classful or classless. While classless qdiscs are
|
||||
not as flexible as classful ones, they also require much less customizing. Often
|
||||
it is enough to just attach them to an interface, without exact knowledge of
|
||||
what is done internally. Classful qdiscs are the exact opposite: flexible in
|
||||
application, they are often not even usable without insightful configuration.
|
||||
|
||||
As the name implies, classful qdiscs provide configurable classes to sort
|
||||
traffic into. In its basic form, this is not much different than, say, the
|
||||
classless \qdisc{pfifo\_fast} which holds three queues and classifies per
|
||||
packet upon priority field. Though typically classes go beyond that by
|
||||
supporting nesting and additional characteristics like e.g. maximum traffic
|
||||
rate or quantum.
|
||||
|
||||
When it comes to controlling the classification process, filters come into play.
|
||||
They attach to the parent of a set of classes (i.e. either the qdisc itself or
|
||||
a parent class) and specify how a packet (or its associated flow) has to look
|
||||
like in order to suit a given class. To overcome this simplification, it is
|
||||
possible to attach multiple filters to the same parent, which then consults each
|
||||
of them in turn until the first one accepts the packet.
|
||||
|
||||
Before getting into detail about what filters there are and how to use them, a
|
||||
simple setup of a qdisc with classes is necessary:
|
||||
\begin{figure}[H]
|
||||
\begin{Verbatim}
|
||||
.-------------------------------------------------------.
|
||||
| |
|
||||
| HTB |
|
||||
| |
|
||||
| .----------------------------------------------------.|
|
||||
| | ||
|
||||
| | Class 1:1 ||
|
||||
| | ||
|
||||
| | .---------------..---------------..---------------.||
|
||||
| | | || || |||
|
||||
| | | Class 1:10 || Class 1:20 || Class 1:30 |||
|
||||
| | | || || |||
|
||||
| | | .------------.|| .------------.|| .------------.|||
|
||||
| | | | ||| | ||| | ||||
|
||||
| | | | fq_codel ||| | fq_codel ||| | fq_codel ||||
|
||||
| | | | ||| | ||| | ||||
|
||||
| | | '------------'|| '------------'|| '------------'|||
|
||||
| | '---------------''---------------''---------------'||
|
||||
| '----------------------------------------------------'|
|
||||
'-------------------------------------------------------'
|
||||
\end{Verbatim}
|
||||
\end{figure}
|
||||
\noindent
|
||||
The following commands establish the basic setup shown:
|
||||
\begin{Verbatim}
|
||||
(1) # tc qdisc replace dev eth0 root handle 1: htb default 30
|
||||
(2) # tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit
|
||||
(3) # alias tclass='tc class add dev eth0 parent 1:1'
|
||||
(4) # tclass classid 1:10 htb rate 1mbit ceil 20mbit prio 1
|
||||
(4) # tclass classid 1:20 htb rate 90mbit ceil 95mbit prio 2
|
||||
(4) # tclass classid 1:30 htb rate 1mbit ceil 95mbit prio 3
|
||||
(5) # tc qdisc add dev eth0 parent 1:10 fq_codel
|
||||
(5) # tc qdisc add dev eth0 parent 1:20 fq_codel
|
||||
(5) # tc qdisc add dev eth0 parent 1:30 fq_codel
|
||||
\end{Verbatim}
|
||||
A little explanation for the unfamiliar reader:
|
||||
\begin{enumerate}
|
||||
\item Replace the root qdisc of \iface{eth0} by an instance of \qdisc{HTB}.
|
||||
Specifying the handle is necessary so it can be referenced in consecutive
|
||||
calls to \cmd{tc}. The default class for unclassified traffic is set to
|
||||
30.
|
||||
\item Create a single top-level class with handle 1:1 which limits the total
|
||||
bandwidth allowed to 95mbit/s. It is assumed that \iface{eth0} is a 100mbit/s link,
|
||||
staying a little below that helps to keep the main point of enqueueing in
|
||||
the qdisc layer instead of the interface hardware queue or at another
|
||||
bottleneck in the network.
|
||||
\item Define an alias for the common part of the remaining three calls in order
|
||||
to improve readability. This means all remaining classes are attached to the
|
||||
common parent class from (2).
|
||||
\item Create three child classes for different uses: Class 1:10 has highest
|
||||
priority but is tightly limited in bandwidth - fine for interactive
|
||||
connections. Class 1:20 has mid priority and high guaranteed bandwidth, for
|
||||
high priority bulk traffic. Finally, there's the default class 1:30 with
|
||||
lowest priority, low guaranteed bandwidth and the ability to use the full
|
||||
link in case it's unused otherwise. This should be fine for uninteresting
|
||||
traffic not explicitly taken care of.
|
||||
\item Attach a leaf qdisc to each of the child classes created in (4). Since
|
||||
\qdisc{HTB} by default attaches \qdisc{pfifo} as leaf qdisc, this step is optional. Still,
|
||||
the fairness between different flows provided by the classless \qdisc{fq\_codel} is
|
||||
worth the effort.
|
||||
\end{enumerate}
|
||||
More information about the qdiscs and fine-tuning parameters can be found in
|
||||
\man{tc-htb(8)} and \man{tc-fq\_codel(8)}.
|
||||
|
||||
Without any additional setup done, now all traffic leaving \iface{eth0} is shaped to
|
||||
95mbit/s and directed through class 1:30. This can be verified by looking at the
|
||||
\texttt{Sent} field of the class statistics printed via \cmd{tc -s class show dev eth0}:
|
||||
Only the root class 1:1 and its child 1:30 should show any traffic.
|
||||
|
||||
|
||||
\section*{Finally time to start filtering!}
|
||||
|
||||
Let's begin with a simple one, i.e. reestablishing what \qdisc{pfifo\_fast} did
|
||||
automatically based on TOS/Priority field. Linux internally translates the
|
||||
header field into the priority field of struct skbuff, which
|
||||
\qdisc{pfifo\_fast} uses for
|
||||
classification. \man{tc-prio(8)} contains a table listing the priority (and
|
||||
ultimately, \qdisc{pfifo\_fast} queue index) each TOS value is being translated into.
|
||||
Here is a shorter version:
|
||||
\begin{center}
|
||||
\begin{tabular}{lll}
|
||||
TOS Values & Linux Priority (Number) & Queue Index \\
|
||||
\midrule
|
||||
0x0 - 0x6 & Best Effort (0) & 1 \\
|
||||
0x8 - 0xe & Bulk (2) & 2 \\
|
||||
0x10 - 0x16 & Interactive (6) & 0 \\
|
||||
0x18 - 0x1e & Interactive Bulk (4) & 1 \\
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
Using the \filter{basic} filter, it is possible to match packets based on that skbuff
|
||||
field, which has the added benefit of being IP version agnostic. Since the
|
||||
\qdisc{HTB} setup above defaults to class ID 1:30, the Bulk priority can be
|
||||
ignored. The \filter{basic} filter allows to combine matches, therefore we get along
|
||||
with only two filters:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent 1: basic \
|
||||
match 'meta(priority eq 6)' classid 1:10
|
||||
# tc filter add dev eth0 parent 1: basic \
|
||||
match 'meta(priority eq 0)' \
|
||||
or 'meta(priority eq 4)' classid 1:20
|
||||
\end{Verbatim}
|
||||
A detailed description of the \filter{basic} filter and the ematch syntax it uses can be
|
||||
found in \man{tc-basic(8)} and \man{tc-ematch(8)}.
|
||||
|
||||
Obviously, this first example cries for optimization. A simple one would be to
|
||||
just change the default class from 1:30 to 1:20, so filters are only needed for
|
||||
Bulk and Interactive priorities:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent 1: basic \
|
||||
match 'meta(priority eq 6)' classid 1:10
|
||||
# tc filter add dev eth0 parent 1: basic \
|
||||
match 'meta(priority eq 2)' classid 1:30
|
||||
\end{Verbatim}
|
||||
Given that class IDs are arbitrary, choosing them wisely allows for a direct
|
||||
mapping. So first, recreate the qdisc and classes configuration:
|
||||
\begin{Verbatim}
|
||||
# tc qdisc replace dev eth0 root handle 1: htb default 10
|
||||
# tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit
|
||||
# alias tclass='tc class add dev eth0 parent 1:1'
|
||||
# tclass classid 1:16 htb rate 1mbit ceil 20mbit prio 1
|
||||
# tclass classid 1:10 htb rate 90mbit ceil 95mbit prio 2
|
||||
# tclass classid 1:12 htb rate 1mbit ceil 95mbit prio 3
|
||||
# tc qdisc add dev eth0 parent 1:16 fq_codel
|
||||
# tc qdisc add dev eth0 parent 1:10 fq_codel
|
||||
# tc qdisc add dev eth0 parent 1:12 fq_codel
|
||||
\end{Verbatim}
|
||||
This is basically identical to above, but with changed leaf class IDs and the
|
||||
second priority class being the default. Using the \filter{flow} filter with its \texttt{map}
|
||||
functionality, a single filter command is enough:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent 1: handle 0x1337 flow \
|
||||
map key priority baseclass 1:10
|
||||
\end{Verbatim}
|
||||
The \filter{flow} filter now uses the priority value to construct a destination class ID
|
||||
by adding it to the value of \texttt{baseclass}. While this works for priority values of
|
||||
0, 2 and 6, it will result in non-existent class ID 1:14 for Interactive Bulk
|
||||
traffic. In that case, the \qdisc{HTB} default applies so that traffic goes into class
|
||||
ID 1:10 just as intended. Please note that specifying a handle is a mandatory
|
||||
requirement by the \filter{flow} filter, although I didn't see where one would use that
|
||||
later. For more information about \filter{flow}, see \man{tc-flow(8)}.
|
||||
|
||||
While \filter{flow} and \filter{basic} filters are relatively easy to apply and understand, they
|
||||
are as well quite limited to their intended purpose. A more flexible option is
|
||||
the \filter{u32} filter, which allows to match on arbitrary parts of the packet data -
|
||||
yet only on that, not any meta data associated to it by the kernel (with the
|
||||
exception of firewall mark value). So in order to continue this little
|
||||
exercise with \filter{u32}, we have to base classification directly upon the actual TOS
|
||||
value. An intuitive attempt might look like this:
|
||||
\begin{Verbatim}
|
||||
# alias tcfilter='tc filter add dev eth0 parent 1:'
|
||||
# tcfilter u32 match ip dsfield 0x10 0x1e classid 1:16
|
||||
# tcfilter u32 match ip dsfield 0x12 0x1e classid 1:16
|
||||
# tcfilter u32 match ip dsfield 0x14 0x1e classid 1:16
|
||||
# tcfilter u32 match ip dsfield 0x16 0x1e classid 1:16
|
||||
# tcfilter u32 match ip dsfield 0x8 0x1e classid 1:12
|
||||
# tcfilter u32 match ip dsfield 0xa 0x1e classid 1:12
|
||||
# tcfilter u32 match ip dsfield 0xc 0x1e classid 1:12
|
||||
# tcfilter u32 match ip dsfield 0xe 0x1e classid 1:12
|
||||
\end{Verbatim}
|
||||
The obvious drawback here is the amount of filters needed. And without the
|
||||
default class, eight more filters would be necessary. This also has performance
|
||||
implications: A packet with TOS value 0xe will be checked eight times in total
|
||||
in order to determine its destination class. While there's not much to be done
|
||||
about the number of filters, at least the performance problem can be eliminated
|
||||
by using \filter{u32}'s hash table support:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent 1: prio 99 handle 1: u32 divisor 16
|
||||
\end{Verbatim}
|
||||
This creates a hash table with 16 buckets. The table size is arbitrary, but not
|
||||
random: Since the first bit of the TOS field is not interesting, it can be
|
||||
ignored and therefore the range of values to consider is just [0;15], i.e. a
|
||||
number of 16 different values. The next step is to populate the hash table:
|
||||
\begin{Verbatim}
|
||||
# alias tcfilter='tc filter add dev eth0 parent 1: prio 99'
|
||||
# tcfilter u32 match u8 0 0 ht 1:0: classid 1:10
|
||||
# tcfilter u32 match u8 0 0 ht 1:1: classid 1:10
|
||||
# tcfilter u32 match u8 0 0 ht 1:2: classid 1:10
|
||||
# tcfilter u32 match u8 0 0 ht 1:3: classid 1:10
|
||||
# tcfilter u32 match u8 0 0 ht 1:4: classid 1:12
|
||||
# tcfilter u32 match u8 0 0 ht 1:5: classid 1:12
|
||||
# tcfilter u32 match u8 0 0 ht 1:6: classid 1:12
|
||||
# tcfilter u32 match u8 0 0 ht 1:7: classid 1:12
|
||||
# tcfilter u32 match u8 0 0 ht 1:8: classid 1:16
|
||||
# tcfilter u32 match u8 0 0 ht 1:9: classid 1:16
|
||||
# tcfilter u32 match u8 0 0 ht 1:a: classid 1:16
|
||||
# tcfilter u32 match u8 0 0 ht 1:b: classid 1:16
|
||||
# tcfilter u32 match u8 0 0 ht 1:c: classid 1:10
|
||||
# tcfilter u32 match u8 0 0 ht 1:d: classid 1:10
|
||||
# tcfilter u32 match u8 0 0 ht 1:e: classid 1:10
|
||||
# tcfilter u32 match u8 0 0 ht 1:f: classid 1:10
|
||||
\end{Verbatim}
|
||||
The parameter \texttt{ht} denotes the hash table and bucket the filter should be added
|
||||
to. Since the first TOS bit is ignored, its value has to be divided by two in
|
||||
order to get to the bucket it maps to. E.g. a TOS value of 0x10 will therefore
|
||||
map to bucket 0x8. For the sake of completeness, all possible values are mapped
|
||||
and therefore a configurable default class is not required. Note that the used
|
||||
match expression is not functionally necessary, but syntactically mandatory. Therefore anything that
|
||||
matches any packet will suffice. Finally, a filter which links to the defined
|
||||
hash table is needed:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent 1: prio 1 protocol ip u32 \
|
||||
link 1: hashkey mask 0x001e0000 match u8 0 0
|
||||
\end{Verbatim}
|
||||
Here again, the actual match statement is not necessary, but syntactically
|
||||
required. All the magic lies within the \texttt{hashkey} parameter, which defines which
|
||||
part of the packet should be used directly as hash key. Here's a drawing of the
|
||||
first four bytes of the IPv4 header, with the area selected by \texttt{hashkey mask}
|
||||
highlighted:
|
||||
\begin{figure}[H]
|
||||
\begin{Verbatim}
|
||||
0 1 2 3
|
||||
.-----------------------------------------------------------------.
|
||||
| | | ######## | | |
|
||||
| Version| IHL | #DSCP### | ECN| Total Length |
|
||||
| | | ######## | | |
|
||||
'-----------------------------------------------------------------'
|
||||
\end{Verbatim}
|
||||
\end{figure}
|
||||
\noindent
|
||||
This may look confusing at first, but keep in mind that bit- as well as
|
||||
byte-ordering here is LSB while the mask value is written in MSB we humans use.
|
||||
Therefore reading the mask is done like so, starting from left:
|
||||
\begin{enumerate}
|
||||
\item Skip the first byte (which contains Version and IHL fields).
|
||||
\item Skip the lowest bit of the second byte (0x1e is even).
|
||||
\item Mark the four following bits (0x1e is 11110 in binary).
|
||||
\item Skip the remaining three bits of the second byte as well as the remaining two
|
||||
bytes.
|
||||
\end{enumerate}
|
||||
Before doing the lookup, the kernel right-shifts the masked value by the amount
|
||||
of zero-bits in \texttt{mask}, which implicitly also does the division by two which the
|
||||
hash table depends on. With this setup, every packet has to pass exactly two
|
||||
filters to be classified. Note that this filter is limited to IPv4 packets: Due
|
||||
to the related Traffic Class field being at a different offset in the packet, it
|
||||
would not work for IPv6. To use the same setup for IPv6 as well, a second
|
||||
entry-level filter is necessary:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent 1: prio 2 protocol ipv6 u32 \
|
||||
link 1: hashkey mask 0x01e00000 match u8 0 0
|
||||
\end{Verbatim}
|
||||
For illustration purposes, here again is a drawing of the first four bytes of
|
||||
the IPv6 header, again with masked area highlighted:
|
||||
\begin{figure}[H]
|
||||
\begin{Verbatim}
|
||||
0 1 2 3
|
||||
.-----------------------------------------------------------------.
|
||||
| | ######## | |
|
||||
| Version| #Traffic Class| Flow Label |
|
||||
| | ######## | |
|
||||
'-----------------------------------------------------------------'
|
||||
\end{Verbatim}
|
||||
\end{figure}
|
||||
\noindent
|
||||
Reading the mask value is analogous to IPv4 with the added complexity that
|
||||
Traffic Class spans over two bytes. Yet, for comparison there's a simple trick:
|
||||
IPv6 has the interesting field shifted by four bits to the left, and the new
|
||||
mask's value is shifted by the same amount. For further information about
|
||||
\filter{u32} and what can be done with it, consult its man page
|
||||
\man{tc-u32(8)}.
|
||||
|
||||
Of course, the kernel provides many more filters than just \filter{basic},
|
||||
\filter{flow} and \filter{u32} which have been presented above. As of now, the
|
||||
remaining ones are:
|
||||
\begin{description}
|
||||
\item[bpf]
|
||||
Filtering using Berkeley Packet Filter programs. The program's return
|
||||
code determines the packet's destination class ID.
|
||||
|
||||
\item[cgroup]
|
||||
Filter packets based on control groups. This is only useful for packets
|
||||
originating from the local host, as control groups only exist in that
|
||||
scope.
|
||||
|
||||
\item[flower]
|
||||
An extended variant of the flow filter.
|
||||
|
||||
\item[fw]
|
||||
Matches on firewall mark values previously assigned to the packet by
|
||||
netfilter (or a filter action, see below for details). This allows to
|
||||
export the classification algorithm into netfilter, which is very
|
||||
convenient if appropriate rules exist on the same system in there
|
||||
already.
|
||||
|
||||
\item[route]
|
||||
Filter packets based on the matching routing table entry. Basically
|
||||
equivalent to the \texttt{fw} filter above, to make use of an already existing
|
||||
extensive routing table setup.
|
||||
|
||||
\item[rsvp, rsvp6]
|
||||
Implementation of the Resource Reservation Protocol in Linux, to react
|
||||
upon requests sent by an RSVP daemon.
|
||||
|
||||
\item[tcindex]
|
||||
Match packets based on tcindex value, which is usually set by the dsmark
|
||||
qdisc. This is part of an approach to support Differentiated Services in
|
||||
Linux, which is another topic on its own.
|
||||
\end{description}
|
||||
|
||||
|
||||
\section*{Filter Actions}
|
||||
|
||||
The tc filter framework provides the infrastructure to another extensible set of
|
||||
tools as well, namely tc actions. As the name suggests, they allow one to do things
|
||||
with packets (or associated data). (The list of) Actions are part of a given
|
||||
filter. If it matches, each action it contains is executed in order before
|
||||
returning the classification result. Since the action has direct access to the
|
||||
latter, it is in theory possible for an action to react upon or even change the
|
||||
filtering result - as long as the packet matched, of course. Yet none of the
|
||||
currently in-tree actions make use of this.
|
||||
|
||||
The Generic Actions framework originally evolved out of the filters' ability to
|
||||
police traffic to a given maximum bandwidth. One common use case for that is to
|
||||
limit ingress traffic, dropping packets which exceed the threshold. A classic
|
||||
setup example is like so:
|
||||
\begin{Verbatim}
|
||||
# tc qdisc add dev eth0 handle ffff: ingress
|
||||
# tc filter add dev eth0 parent ffff: u32 \
|
||||
match u32 0 0 \
|
||||
police rate 1mbit burst 100k
|
||||
\end{Verbatim}
|
||||
The ingress qdisc is not a real one, but merely a point of reference for filters
|
||||
to attach to which should get applied to incoming traffic. The \filter{u32} filter added
|
||||
above matches on any packet and therefore limits the total incoming bandwidth to
|
||||
1mbit/s, allowing bursts of up to 100kbytes. Using the new syntax, the filter
|
||||
command changes slightly:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent ffff: u32 \
|
||||
match u32 0 0 \
|
||||
action police rate 1mbit burst 100k
|
||||
\end{Verbatim}
|
||||
The important detail is that this syntax allows one to define multiple actions.
|
||||
E.g. for testing purposes, it is possible to redirect exceeding traffic to the
|
||||
loopback interface instead of dropping it:
|
||||
\begin{Verbatim}
|
||||
# tc filter add dev eth0 parent ffff: u32 \
|
||||
match u32 0 0 \
|
||||
action police rate 1mbit burst 100k conform-exceed pipe \
|
||||
action mirred egress redirect dev lo
|
||||
\end{Verbatim}
|
||||
The added parameter \texttt{conform-exceed pipe} tells the police action to allow for
|
||||
further actions to handle the exceeding packet.
|
||||
|
||||
Apart from \texttt{police} and \texttt{mirred} actions, there are a few more. Here's a full
|
||||
list of the currently implemented ones:
|
||||
\begin{description}
|
||||
\item[bpf]
|
||||
Apply a Berkeley Packet Filter program to the packet.
|
||||
|
||||
\item[connmark]
|
||||
Set the packet's firewall mark to that of its connection. This works by
|
||||
searching the conntrack table for a matching entry. If found, the mark
|
||||
is restored.
|
||||
|
||||
\item[csum]
|
||||
Trigger recalculation of packet checksums. The supported protocols are:
|
||||
IPv4, ICMP, IGMP, TCP, UDP and UDPLite.
|
||||
|
||||
\item[ipt]
|
||||
Pass the packet to an iptables target. This allows one to use iptables
|
||||
extensions directly instead of having to go the extra mile via setting
|
||||
an arbitrary firewall mark and matching on that from within netfilter.
|
||||
|
||||
\item[mirred]
|
||||
Mirror or redirect packets. This is often combined with the ifb pseudo
|
||||
device to share a common QoS setup between multiple interfaces or even
|
||||
ingress traffic.
|
||||
|
||||
\item[nat]
|
||||
Perform stateless Network Address Translation. This is certainly not
|
||||
complete and therefore inferior to NAT using iptables: Although the
|
||||
kernel module distinguishes between TCP, UDP and ICMP traffic, it does not
|
||||
handle typical problematic protocols such as active FTP or SIP.
|
||||
|
||||
\item[pedit]
|
||||
Generic packet editing. This allows one to alter arbitrary bytes of the
|
||||
packet, either by specifying an offset into the packet or by naming a
|
||||
packet header and field name to change. Currently, the latter is
|
||||
implemented only for IPv4.
|
||||
|
||||
\item[police]
|
||||
Apply a bandwidth rate limiting policy. Packets exceeding it are dropped
|
||||
by default, but may optionally be handled differently.
|
||||
|
||||
\item[simple]
|
||||
This is more of an example than a real action. All it does is print a
|
||||
user-defined string together with a packet counter. Useful maybe for
|
||||
debugging when filter statistics are not available or too complicated.
|
||||
|
||||
\item[skbedit]
|
||||
Edit associated packet data, supports changing queue mapping, priority
|
||||
field and firewall mark value.
|
||||
|
||||
\item[vlan]
|
||||
Add/remove a VLAN header to/from the packet. This might serve as
|
||||
alternative to using 802.1Q pseudo-interfaces in combination with
|
||||
routing rules when e.g. packets for a given destination need to be
|
||||
encapsulated.
|
||||
\end{description}
|
||||
|
||||
|
||||
\section*{Intermediate Functional Block}
|
||||
|
||||
The Intermediate Functional Block (\texttt{ifb}) pseudo network interface acts as a QoS
|
||||
concentrator for multiple different sources of traffic. Packets from or to other
|
||||
interfaces have to be redirected to it using the \texttt{mirred} action in order to be
|
||||
handled; regularly routed traffic will be dropped. This way, a single stack of
|
||||
qdiscs, classes and filters can be shared between multiple interfaces.
|
||||
|
||||
Here's a simple example to feed incoming traffic from multiple interfaces
|
||||
through a Stochastic Fairness Queue (\qdisc{sfq}):
|
||||
\begin{Verbatim}
|
||||
(1) # modprobe ifb
|
||||
(2) # ip link set ifb0 up
|
||||
(3) # tc qdisc add dev ifb0 root sfq
|
||||
\end{Verbatim}
|
||||
The first step is to load the \texttt{ifb} kernel module (1). By default, this will
|
||||
create two ifb devices: \iface{ifb0} and \iface{ifb1}. After setting
|
||||
\iface{ifb0} up in (2), the root
|
||||
qdisc is replaced by \qdisc{sfq} in (3). Finally, one can start redirecting ingress
|
||||
traffic to \iface{ifb0}, e.g. from \iface{eth0}:
|
||||
\begin{Verbatim}
|
||||
# tc qdisc add dev eth0 handle ffff: ingress
|
||||
# tc filter add dev eth0 parent ffff: u32 \
|
||||
match u32 0 0 \
|
||||
action mirred egress redirect dev ifb0
|
||||
\end{Verbatim}
|
||||
The same can be done for other interfaces, just replacing \iface{eth0} in the two
|
||||
commands above. One thing to keep in mind here is the asymmetrical routing this
|
||||
creates within the host doing the QoS: Incoming packets enter the system via
|
||||
\iface{ifb0}, while corresponding replies leave directly via \iface{eth0}. This can be observed
|
||||
using \cmd{tcpdump} on \iface{ifb0}, which shows the input part of the traffic only. What's
|
||||
more confusing is that \cmd{tcpdump} on \iface{eth0} shows both incoming and outgoing traffic,
|
||||
but the redirection is still effective - a simple proof is setting
|
||||
\iface{ifb0} down,
|
||||
which will interrupt the communication. Obviously \cmd{tcpdump} catches the packets to
|
||||
dump before they enter the ingress qdisc, which is why it sees them while the
|
||||
kernel itself doesn't.
|
||||
|
||||
|
||||
\section*{Conclusion}
|
||||
|
||||
Once the steep learning curve has been mastered, the conglomerate of (classful)
|
||||
qdiscs, filters and actions provides a highly sophisticated and flexible
|
||||
infrastructure to perform QoS, which plays nicely along with routing and
|
||||
firewalling setups.
|
||||
|
||||
|
||||
\section*{Further Reading}
|
||||
|
||||
A good starting point for novice users and experienced ones diving into unknown
|
||||
areas is the extensive HOWTO at \url{http://lartc.org}. The iproute2 package ships
|
||||
some examples (usually in /usr/share/doc/, depending on distribution) as well as
|
||||
man pages for \cmd{tc} in general, qdiscs and filters. The latter have been added
|
||||
just recently though, so if your distribution does not ship iproute2 version
|
||||
4.3.0 yet, these are not in there. Apart from that, the internet is a spring of
|
||||
HOWTOs and scripts people wrote - though these should be taken with a grain of
|
||||
salt: The complexity of the matter often leads to copying others' solutions
|
||||
without much validation, which allows for less optimal or even obsolete
|
||||
implementations to survive much longer than desired.
|
||||
|
||||
\end{document}
|
||||
@ -29,13 +29,11 @@ enum output_type {
|
||||
PRINT_ANY = 4,
|
||||
};
|
||||
|
||||
void new_json_obj(int json, FILE *fp);
|
||||
void new_json_obj(int json);
|
||||
void delete_json_obj(void);
|
||||
|
||||
bool is_json_context(void);
|
||||
|
||||
void set_current_fp(FILE *fp);
|
||||
|
||||
void fflush_fp(void);
|
||||
|
||||
void open_json_object(const char *str);
|
||||
|
||||
@ -1815,7 +1815,7 @@ static int ipaddr_showdump(void)
|
||||
if (ipadd_dump_check_magic())
|
||||
exit(-1);
|
||||
|
||||
new_json_obj(json, stdout);
|
||||
new_json_obj(json);
|
||||
open_json_object(NULL);
|
||||
open_json_array(PRINT_JSON, "addr_info");
|
||||
|
||||
@ -2176,7 +2176,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action)
|
||||
* Initialize a json_writer and open an array object
|
||||
* if -json was specified.
|
||||
*/
|
||||
new_json_obj(json, stdout);
|
||||
new_json_obj(json);
|
||||
|
||||
/*
|
||||
* If only filter_dev present and none of the other
|
||||
|
||||
@ -16,15 +16,14 @@
|
||||
#include "json_print.h"
|
||||
|
||||
static json_writer_t *_jw;
|
||||
static FILE *_fp;
|
||||
|
||||
#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw)
|
||||
#define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY))
|
||||
|
||||
void new_json_obj(int json, FILE *fp)
|
||||
void new_json_obj(int json)
|
||||
{
|
||||
if (json) {
|
||||
_jw = jsonw_new(fp);
|
||||
_jw = jsonw_new(stdout);
|
||||
if (!_jw) {
|
||||
perror("json object");
|
||||
exit(1);
|
||||
@ -32,7 +31,6 @@ void new_json_obj(int json, FILE *fp)
|
||||
jsonw_pretty(_jw, true);
|
||||
jsonw_start_array(_jw);
|
||||
}
|
||||
set_current_fp(fp);
|
||||
}
|
||||
|
||||
void delete_json_obj(void)
|
||||
@ -48,15 +46,6 @@ bool is_json_context(void)
|
||||
return _jw != NULL;
|
||||
}
|
||||
|
||||
void set_current_fp(FILE *fp)
|
||||
{
|
||||
if (!fp) {
|
||||
fprintf(stderr, "Error: invalid file pointer.\n");
|
||||
exit(1);
|
||||
}
|
||||
_fp = fp;
|
||||
}
|
||||
|
||||
json_writer_t *get_json_writer(void)
|
||||
{
|
||||
return _jw;
|
||||
@ -89,7 +78,7 @@ void open_json_array(enum output_type type, const char *str)
|
||||
jsonw_name(_jw, str);
|
||||
jsonw_start_array(_jw);
|
||||
} else if (_IS_FP_CONTEXT(type)) {
|
||||
fprintf(_fp, "%s", str);
|
||||
printf("%s", str);
|
||||
}
|
||||
}
|
||||
|
||||
@ -103,7 +92,7 @@ void close_json_array(enum output_type type, const char *str)
|
||||
jsonw_end_array(_jw);
|
||||
jsonw_pretty(_jw, true);
|
||||
} else if (_IS_FP_CONTEXT(type)) {
|
||||
fprintf(_fp, "%s", str);
|
||||
printf("%s", str);
|
||||
}
|
||||
}
|
||||
|
||||
@ -124,7 +113,7 @@ void close_json_array(enum output_type type, const char *str)
|
||||
else \
|
||||
jsonw_##type_name##_field(_jw, key, value); \
|
||||
} else if (_IS_FP_CONTEXT(t)) { \
|
||||
color_fprintf(_fp, color, fmt, value); \
|
||||
color_fprintf(stdout, color, fmt, value); \
|
||||
} \
|
||||
}
|
||||
_PRINT_FUNC(int, int);
|
||||
@ -147,7 +136,7 @@ void print_color_string(enum output_type type,
|
||||
else
|
||||
jsonw_string_field(_jw, key, value);
|
||||
} else if (_IS_FP_CONTEXT(type)) {
|
||||
color_fprintf(_fp, color, fmt, value);
|
||||
color_fprintf(stdout, color, fmt, value);
|
||||
}
|
||||
}
|
||||
|
||||
@ -168,7 +157,7 @@ void print_color_bool(enum output_type type,
|
||||
else
|
||||
jsonw_bool(_jw, value);
|
||||
} else if (_IS_FP_CONTEXT(type)) {
|
||||
color_fprintf(_fp, color, fmt, value ? "true" : "false");
|
||||
color_fprintf(stdout, color, fmt, value ? "true" : "false");
|
||||
}
|
||||
}
|
||||
|
||||
@ -187,7 +176,7 @@ void print_color_0xhex(enum output_type type,
|
||||
snprintf(b1, sizeof(b1), "%#x", hex);
|
||||
print_string(PRINT_JSON, key, NULL, b1);
|
||||
} else if (_IS_FP_CONTEXT(type)) {
|
||||
color_fprintf(_fp, color, fmt, hex);
|
||||
color_fprintf(stdout, color, fmt, hex);
|
||||
}
|
||||
}
|
||||
|
||||
@ -206,7 +195,7 @@ void print_color_hex(enum output_type type,
|
||||
else
|
||||
jsonw_string(_jw, b1);
|
||||
} else if (_IS_FP_CONTEXT(type)) {
|
||||
color_fprintf(_fp, color, fmt, hex);
|
||||
color_fprintf(stdout, color, fmt, hex);
|
||||
}
|
||||
}
|
||||
|
||||
@ -226,6 +215,6 @@ void print_color_null(enum output_type type,
|
||||
else
|
||||
jsonw_null(_jw);
|
||||
} else if (_IS_FP_CONTEXT(type)) {
|
||||
color_fprintf(_fp, color, fmt, value);
|
||||
color_fprintf(stdout, color, fmt, value);
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user