I’ve spent a lot of time trying to get our logging to behave sensibly with all the microservices we have running in Kubernetes. Some of our apps log in JSON, some log with positional parameters, some log with key=value pairs, and some use a mixed format where part of the message is positional and the rest is JSON (who does that?). Below is the configuration I’ve come up with, and have refined over a few months, for parsing Kubernetes log messages with fluentd.
I’ve found that the multi_format plugin for fluentd is a great way to parse these logs into a sensible format that Splunk can then ingest. Here are the config files I’ve been using so far. This first one is mostly just copied from the kubernetes_metadata plugin’s documentation. The meat of the format parsing is in the parsers.conf snippet.
#Silently drop fluentd's own internal events so they don't loop back through the pipeline
<match fluent.**>
@type null
</match>
#I observed journal logs going to both /run/log/journal and /var/log/journal
#Read the persistent systemd journal
<source>
@type systemd
path /var/log/journal
#Don't replay history on first start; read position is persisted below
read_from_head false
<storage>
type local
persistent true
#Cursor file so restarts resume where we left off
path /var/log/fluent/journal.pos
</storage>
tag journal
</source>
#Read the volatile (runtime) systemd journal; same handling as the persistent one,
#but with its own cursor file. Both sources share the "journal" tag.
<source>
@type systemd
path /run/log/journal
read_from_head false
<storage>
type local
persistent true
path /var/log/fluent/runjournal.pos
</storage>
tag journal
</source>
#Tag Kubernetes containers
#Split journal records: Kubernetes container logs get a per-container tag,
#everything else is tagged by its syslog identifier.
<match journal>
@type rewrite_tag_filter
<rule>
#Docker names k8s containers "k8s_<container>_<pod>_..."; capture the container
#name (stops at the first "." or "_" after the k8s_ prefix)
key CONTAINER_NAME
pattern ^k8s_([\w-]*?)[\._]
tag kubernetes.journal.container.$1
</rule>
<rule>
#Fallback: tag non-container records as journal.<identifier>, e.g. journal.sshd
key SYSLOG_IDENTIFIER
pattern ^(.*)
tag journal.$1
</rule>
</match>
#Tail the salt minion log; extracts time, "[severity]" bracket, and message
<source>
@type tail
format /^(?<time>[^ ]* [^ ,]*)[^\[]*\[[^\]]*\]\[(?<severity>[^ \]]*) *\] (?<msg>.*)$/
time_format %Y-%m-%d %H:%M:%S
path /var/log/salt/minion
pos_file /var/log/fluent/fluentd-salt.pos
tag salt
</source>
#Tail the node startup-script log using the built-in syslog parser
<source>
@type tail
format syslog
path /var/log/startupscript.log
pos_file /var/log/fluent/fluentd-startupscript.log.pos
tag startupscript
</source>
#Tail the Docker daemon log (logrus-style: time="..." level=... msg="...").
#Fixes from review:
# - the optional statusCode group used $<status_code>, which is not valid
#   Ruby/Onigmo named-capture syntax (must be (?<name>...)) and would make
#   this format string fail to compile;
# - the time capture used [^)]* which does not stop at the closing quote of
#   time="..."; [^"]* matches exactly the quoted timestamp.
<source>
@type tail
format /^time="(?<time>[^"]*)" level=(?<severity>[^ ]*) msg="(?<msg>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=(?<status_code>\d+))?/
path /var/log/docker.log
pos_file /var/log/fluent/fluentd-docker.log.pos
tag docker
</source>
#Tail the etcd log as raw unparsed lines (format none keeps the whole line)
<source>
@type tail
format none
path /var/log/etcd.log
pos_file /var/log/fluent/fluentd-etcd.log.pos
tag etcd
</source>
#Pull k8s deployment annotations in
#Enrich container records with pod/namespace metadata; only pod annotations
#matching the company prefix are copied into the record
<filter kubernetes.journal.container.**>
@type kubernetes_metadata
annotation_match /^exampleCompany\..*$/
</filter>
#parse out the various log formats
#Parse the journal MESSAGE field through the multi_format pattern list
#(patterns live in parsers.conf; first matching pattern wins).
#reserve_data keeps the original journal fields alongside the parsed ones.
<filter kubernetes.journal.container.**>
@type parser
key_name MESSAGE
reserve_data true
<parse>
@type multi_format
@include parsers.conf
</parse>
</filter>
#Some record munging to remove duplicate keys, and have one single key for the log message itself, and a plethora of log levels or severities
#Normalize the record: one "message" key and one "level" key, mapping both
#bunyan numeric levels (10..60) and journald PRIORITY (0..7) to names.
#FIX from review: the filter tag was "kubernetes.journal.container" without
#the ".**" suffix, but the rewrite_tag_filter above emits tags of the form
#"kubernetes.journal.container.<name>", so this filter never matched anything.
#The wildcard makes it consistent with the two filters above.
<filter kubernetes.journal.container.**>
@type record_transformer
enable_ruby true
#remove_keys runs after <record> is evaluated, so msg is still readable below
remove_keys hostname,name,msg
<record>
message ${record["msg"] ? record["msg"] : record["message"] ? record["message"] : record["MESSAGE"]}
level ${record["level"] ? record["level"] == 10 ? "TRACE" : record["level"] == 20 ? "DEBUG" : record["level"] == 30 ? "INFO" : record["level"] == 40 ? "WARN" : record["level"] == 50 ? "ERROR" : record["level"] == 60 ? "FATAL" : record["level"] : (record["PRIORITY"] ? record["PRIORITY"] == "0" ? "EMERG" : record["PRIORITY"] == "1" ? "ALERT" : record["PRIORITY"] == "2" ? "CRIT" : record["PRIORITY"] == "3" ? "ERROR" : record["PRIORITY"] == "4" ? "WARN" : record["PRIORITY"] == "5" ? "NOTICE" : record["PRIORITY"] == "6" ? "INFO" : record["PRIORITY"] == "7" ? "DEBUG" : record["PRIORITY"] : record["severity"])}
</record>
</filter>
Next, here are some of the various patterns that we have to deal with.
#Pattern list for the multi_format parser (included above as parsers.conf).
#First matching pattern wins, so order runs from most specific to most generic.
<pattern>
format regexp
#Catalina.out logs: ISO timestamp with comma millis plus a "[catalina]" marker
expression /(?<time>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2},\d{3}Z) \[catalina\] (?<level>\w*) +\[(?<thread>.*?)\] (?<logger>.*?) - (?<msg>.*)$/
time_format "%FT%T,%L%Z"
</pattern>
<pattern>
format regexp
#Logback with MDC logs ("." as the millisecond separator in the timestamp)
expression /^(?<time>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z) \[(?<thread>.*)\] (?<level>\w*) *(?<logger>.*?) \[(?<mdc>[^\]]*)\] : (?<msg>.*)$/
time_format "%FT%T.%L%Z"
</pattern>
<pattern>
format regexp
#Logback with MDC logs ("," as the millisecond separator in the timestamp)
expression /^(?<time>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2},\d{3}Z) \[(?<thread>.*)\] (?<level>\w*) (?<logger>.*?) \[(?<mdc>[^\]]*)\] : (?<msg>.*)$/
time_format "%FT%T,%L%Z"
</pattern>
<pattern>
format regexp
#logback framework/status output: "HH:MM:SS,mmm |-LEVEL in logger - message"
expression /(?<time>\d{2}:\d{2}:\d{2},\d{3}) \|-(?<level>\w*) in (?<logger>[^ ]*) - (?<msg>.*)$/
time_format "%T,%L"
</pattern>
<pattern>
format regexp
#shortDate level [thread] logger - message
expression /^(?<time>\d{2}:\d{2}:\d{2},\d{3}) *(?<level>\w*) \[(?<thread>[^\]]*)\] (?<logger>[^ ]*) - (?<msg>.*)$/
time_format "%T,%L"
</pattern>
<pattern>
format regexp
#JVM -XX:+PrintCommandLineFlags output: a whole line of -XX flags
expression /^(?<msg>(?:(?:-XX:\+(?<logger>PrintCommandLineFlags) )|(?:-XX:[+\w]*(?:=\d*)? *))*)$/
</pattern>
<pattern>
#NOTE(review): "multi_time_json" is not a stock fluentd parser name —
#presumably a custom/vendored JSON parser that tries multiple time formats;
#confirm the plugin providing it is installed on the aggregator
format multi_time_json
time_format "%FT%T.%L%Z"
</pattern>
<pattern>
#Final fallback: keep the raw line unparsed so no record is ever dropped
format none
</pattern>
These patterns are “executed” in order, from top to bottom, and the first one that matches is used, so order your patterns from most specific to most generic. These patterns are just to pull out the interesting metadata from the log records for better categorization and searching, or just to get the time format interpreted correctly.
Nice post. Thanks for the detailed sharing!