Commit 81e637c0 authored by Jérome Perrin's avatar Jérome Perrin

software/grafana: version up, split servers and agent

Split the instances in two:
 - "default" instance is grafana, loki (for logs) and influxdb (for
metrics)
 - "agent" instance is telegraf collecting metrics and logs and sending
 it the the "default" instance.

Next steps will be that the agent becomes not used, instead the
slapos instances will be able to push metrics or logs directly, probably
using fluentbit and sending to either loki/influxdb or wendelin.
parent 5efd3eb8
Pipeline #37486 failed with stage
in 0 seconds
......@@ -15,28 +15,27 @@
[instance-profile]
filename = instance.cfg.in
md5sum = 8c9dc41c176ba01116de5b71aaa704de
md5sum = 32c772c593d2c3c38c26186b91b78cf8
[instance-default]
filename = instance-default.cfg.in
md5sum = b4330fbe0c9c3631f4f477c06d3460b3
[instance-agent]
filename = instance-agent.cfg.in
md5sum = 6bbc97cf8e752d22773d5f23ecdda37d
[influxdb-config-file]
filename = influxdb-config-file.cfg.in
md5sum = a28972ced3e0f4aa776e43a9c44717c0
[telegraf-config-file]
filename = telegraf-config-file.cfg.in
md5sum = a1a9c22c2a7829c66a49fc2504604d21
[grafana-config-file]
filename = grafana-config-file.cfg.in
md5sum = e255dcca466f5de51698d24cbd114577
md5sum = 2b75d6b1984d9d154303ec773aa88474
[grafana-provisioning-config-file]
filename = grafana-provisioning-config-file.cfg.in
md5sum = 3aa0f1ed752b2a59ea2b5e7c1733daf3
[grafana-provisioning-dashboards-config-file]
filename = grafana-provisioning-dashboards-config-file.cfg.in
md5sum = 5616679a9c5c2757540175ead3f5500a
[loki-config-file]
filename = loki-config-file.cfg.in
md5sum = ad2baf4599a937d7352034a41fa24814
[promtail-config-file]
filename = promtail-config-file.cfg.in
md5sum = 5f1b3a1a3d3f98daeab4780106452d71
......@@ -154,7 +154,7 @@ reporting_enabled = true
# in some UI views to notify that grafana or plugin update exists
# This option does not cause any auto updates, nor send any information
# only a GET request to https://grafana.com to get latest versions
check_for_updates = true
check_for_updates = false
# Google Analytics universal tracking code, only enabled if you specify an id here
google_analytics_ua_id =
......@@ -334,23 +334,21 @@ allow_sign_up = true
#################################### SMTP / Emailing #####################
[smtp]
{% set email = slapparameter_dict.get('email', {}) %}
#enabled = false
enabled = {{ slapparameter_dict.get('smtp-server') and 'true' or 'false' }}
enabled = {{ email.get('smtp-server') and 'true' or 'false' }}
#host = locahost:25
host = {{ slapparameter_dict.get('smtp-server', '') }}
host = {{ email.get('smtp-server', '') }}
#user =
user = {{ slapparameter_dict.get('smtp-username', '') }}
user = {{ email.get('smtp-username', '') }}
# If the password contains # or ; you have to wrap it with trippel quotes. Ex """#password;"""
#password =
password = {{ slapparameter_dict.get('smtp-password', '') and '"""%s"""' % slapparameter_dict['smtp-password'] or ""}}
password = {{ email.get('smtp-password', '') and '"""%s"""' % email['smtp-password'] or ""}}
cert_file =
key_file =
#skip_verify = false
skip_verify = {{ slapparameter_dict.get('smtp-verify-ssl', 'true').lower() == 'true' and 'false' or 'true' }}
#from_address = admin@grafana.localhost
from_address = {{ slapparameter_dict.get('email-from-address', '') }}
#from_name = Grafana
from_name = {{ slapparameter_dict.get('email-from-name', 'Grafana') }}
skip_verify = {{ email.get('smtp-verify-ssl') and 'false' or 'true' }}
from_address = {{ email.get('email-from-address', '') }}
from_name = {{ email.get('email-from-name', 'Grafana') }}
ehlo_identity =
[emails]
......
# https://grafana.com/docs/administration/provisioning/#example-datasource-config-file
apiVersion: 1
datasources:
- name: telegraf
type: influxdb
access: proxy
url: {{ influxdb['url'] }}
user: {{ influxdb['auth-username'] }}
database: telegraf
isDefault: true
jsonData:
tlsSkipVerify: true
secureJsonData:
password: {{ influxdb['auth-password'] }}
version: 1
editable: false
- name: loki
type: loki
access: proxy
url: {{ loki['url'] }}
version: 1
editable: false
# https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
apiVersion: 1
providers:
- name: SlapOS
folder: ''
updateIntervalSeconds: 10
allowUiUpdates: false
options:
path: {{ dashboards_dir }}
{
"$schema": "https://json-schema.org/draft/2019-09/schema",
"description": "Parameters to instantiate an agent collecting logs and metrics",
"type": "object",
"additionalProperties": false,
"unevaluatedProperties": false,
"$defs": {
"type": {
"description": "Type of the application. With `SlapOS` type, some metrics are collected from supervisor and from some known partition types (for example: ERP5's mariadb or ERP5's zopes). With `system` type, only log files are ingested.",
"type": "string",
"default": "SlapOS",
"enum": [
"SlapOS",
"system"
]
},
"name": {
"description": "Name of this application",
"type": "string"
},
"urls": {
"description": "URLs to monitor for availability and certificate lifetime",
"type": "array",
"items": {
"type": "string"
}
},
"log-file-patterns": {
"type": "array",
"items": {
"type": "string"
},
"description": "Glob patterns for watched log files."
},
"static-tags": {
"type": "object",
"description": "Static tags for this partition",
"examples": [
{
"service-level": "production",
"data-center": "abc123"
}
]
}
},
"required": [
"applications",
"influxdb",
"loki"
],
"properties": {
"applications": {
"description": "Applications to monitor",
"type": "array",
"items": {
"oneOf": [
{
"type": "object",
"additionalProperties": false,
"description": "Configuration for SlapOS type application",
"required": [
"type",
"name",
"instance-root",
"partitions"
],
"properties": {
"type": {
"$ref": "#/$defs/type",
"const": "SlapOS"
},
"name": {
"$ref": "#/$defs/name"
},
"urls": {
"$ref": "#/$defs/urls"
},
"instance-root": {
"description": "Directory containing SlapOS partitions.",
"type": "string",
"examples": [
"/srv/slapgrid/",
"/srv/slapgrid/slappart30/srv/runner/instance/"
]
},
"partitions": {
"description": "SlapOS partitions to monitor",
"type": "array",
"items": {
"type": "object",
"required": [
"name",
"reference"
],
"unevaluatedProperties": false,
"properties": {
"name": {
"type": "string",
"description": "Friendly name of the partition",
"examples": [
"mariadb",
"zope-activity"
]
},
"reference": {
"type": "string",
"description": "Reference of the partition",
"examples": [
"slappart1",
"slappart2"
]
},
"type": {
"type": "string",
"description": "Type of the partition. Known types have metrics and logs collected",
"enum": [
"erp5/mariadb",
"erp5/balancer",
"erp5/zope-activity",
"erp5/zope-front",
"erp5/zeo",
"mariadb",
"default"
],
"default": "default"
},
"log-file-patterns": {
"$ref": "#/$defs/log-file-patterns",
"description": "Glob pattern for log files to watch. This mostly makes sense for `default` partition type. `{partition_root_directory}` python `.format`-style substitution variable is supported."
},
"static-tags": {
"$ref": "#/$defs/static-tags"
}
},
"allOf": [
{
"if": {
"properties": {
"type": {
"enum": [
"mariadb",
"erp5/mariadb"
]
}
}
},
"then": {
"properties": {
"dbname": {
"type": "string",
"description": "Database name"
},
"username": {
"type": "string",
"description": "Username to connect to database"
}
}
}
}
],
"examples": [
{
"name": "zope-backoffice",
"type": "erp5/zope-front",
"reference": "slappart1",
"static-tags": {
"instance": "instance-name"
}
},
{
"name": "mariadb",
"type": "erp5/mariadb",
"reference": "slappart2"
},
{
"name": "Theia",
"type": "default",
"log-file-patterns": [
"{partition_root_directory}/.slappart*log"
]
}
]
}
}
}
},
{
"type": "object",
"additionalProperties": false,
"description": "Configuration for `system` type application",
"required": [
"type",
"name"
],
"properties": {
"type": {
"$ref": "#/$defs/type",
"const": "system"
},
"name": {
"$ref": "#/$defs/name"
},
"urls": {
"$ref": "#/$defs/urls"
},
"partitions": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"properties": {
"name": {
"type": "string",
"description": "Friendly name of the partition",
"examples": [
"syslog",
"email"
]
},
"log-file-patterns": {
"$ref": "#/$defs/log-file-patterns"
},
"static-tags": {
"$ref": "#/$defs/static-tags"
}
},
"examples": [
{
"name": "syslog",
"log-file-patterns": [
"/var/log/syslog"
]
},
{
"name": "kernel",
"log-file-patterns": [
"/var/log/kern.log",
"/var/log/messages"
]
},
{
"name": "re6stnet",
"log-file-patterns": [
"/var/log/re6stnet/*.log"
]
}
]
}
}
}
}
]
}
},
"influxdb": {
"description": "Connection information for influxdb",
"type": "object",
"additionalProperties": false,
"required": [
"url",
"database",
"username",
"password"
],
"properties": {
"url": {
"description": "IPv6 URL of influxdb HTTP endpoint",
"format": "uri",
"type": "string"
},
"database": {
"description": "database created in influxdb",
"type": "string"
},
"username": {
"description": "username for influxdb",
"type": "string"
},
"password": {
"description": "password for influxdb user",
"type": "string"
}
}
},
"loki": {
"description": "Connection information for loki",
"type": "object",
"additionalProperties": false,
"required": [
"url",
"caucase-url"
],
"properties": {
"url": {
"description": "Base URL of Loki",
"format": "uri",
"type": "string"
},
"caucase-url": {
"description": "URL caucase service used by Loki",
"format": "uri",
"type": "string"
}
}
}
}
}
{
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by agent instantiation",
"additionalProperties": false,
"properties": {
"telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern",
"type": "string"
},
"promtail-url": {
"description": "URL of embedded server from promtail",
"format": "uri",
"type": "string"
},
"facl-script": {
"description": "Path of a generated script to set ACL for the agent to access files and sockets. This might be needed depending on how slapos partitions were formatted",
"type": "string"
}
},
"type": "object"
}
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
publish-connection-parameter
eggs-directory = {{ buildout_eggs_directory }}
develop-eggs-directory = {{ buildout_develop_eggs_directory }}
offline = true
[instance-parameter]
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[slap-configuration]
# apache-frontend reads from from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
caucase-updater-loki-promtail-client = ${:srv}/caucase-updater/loki-client-promtail
promtail-dir = ${:srv}/promtail
# macros
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout_parts_directory }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb-server]
recipe = slapos.recipe.build
slapparameter-dict = ${slap-configuration:configuration}
init =
import urllib.parse
influxdb = options['slapparameter-dict']['influxdb']
options['url'] = influxdb['url']
options['database'] = influxdb['database']
options['auth-username'] = influxdb['username']
options['auth-password'] = influxdb['password']
parsed_url = urllib.parse.urlparse(options['url'])
options['hostname'] = parsed_url.hostname
options['port'] = str(parsed_url.port)
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb-server:hostname}
port = ${influxdb-server:port}
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
# telegraf needs influxdb to be already listening before starting
command-line =
bash -c '${influxdb-listen-promise:path} && ${:nice} {{ telegraf_bin }} --config ${telegraf-config-file:output} --config-directory ${:extra-config-dir}'
wrapper-path = ${directory:service}/telegraf
hash-files = ${telegraf-config-file:output}
# TODO: control nice of the agent ?
{% if 0 %}
nice = nice -19 chrt --idle 0 ionice -c3
{% else %}
nice =
{% endif %}
[telegraf-config-file]
recipe = slapos.recipe.build
output = ${directory:etc}/${:_buildout_section_name_}.toml
telegraf-input-slapos-bin = {{ telegraf_input_slapos_bin }}
slapparameter-dict = ${slap-configuration:configuration}
input-socket = ${directory:var}/tg.sock
init =
import zc.buildout
import pkg_resources
buildout_options = self.buildout["buildout"]
zc.buildout.easy_install.install(
["toml"],
dest=None,
working_set=pkg_resources.working_set,
path=[
buildout_options["develop-eggs-directory"],
buildout_options["eggs-directory"]])
import collections
import pathlib
import urllib.parse
import toml
slapparameter_dict = self.options["slapparameter-dict"]
slap_connection = self.buildout["slap-connection"]
influxdb = self.buildout['influxdb-server']
self._config_files = {} # files to create during install step
access_path_dict = {}
inputs = collections.defaultdict(list)
processors = collections.defaultdict(list)
config = {
"agent": {
"debug": False,
"flush_interval": "10s",
"flush_jitter": "0s",
"hostname": "",
"interval": "10s",
"round_interval": True,
},
"tags": {
"computer_id": slap_connection['computer-id'],
},
# built-in inputs
"cpu": {
"drop": ["cpu_time"],
"percpu": True,
"totalcpu": True,
},
"disk": {},
"io": {},
"mem": {},
"system": {},
"inputs": inputs,
"processors": processors,
"outputs": {
"influxdb": {
"database": influxdb["database"],
"insecure_skip_verify": True, # TODO
"username": influxdb["auth-username"],
"password": influxdb["auth-password"],
"precision": "s",
"urls": [
influxdb["url"],
],
},
},
}
for application in slapparameter_dict.get("applications", []):
partition_mapping = {}
partition_root_directory = ''
for partition in application.get("partitions", []):
partition.setdefault("type", "default")
if "reference" in partition:
partition_mapping[partition["reference"]] = partition["name"]
if application.get("instance-root"):
partition_root_directory = pathlib.Path(application["instance-root"]) / partition['reference']
if partition["type"] in ("erp5/mariadb", "mariadb"):
partition.setdefault("username", "root")
partition.setdefault("dbname", "erp5")
mariadb_socket = f"{partition_root_directory}/var/run/mariadb.sock"
dsn = f"{partition['username']}@unix({mariadb_socket})/{partition['dbname']}"
access_path_dict[mariadb_socket] = 'rw'
inputs["mysql"].append(
{
"name_override": "mariadb",
"servers": [dsn],
"gather_innodb_metrics": True,
"gather_slave_status": True,
"mariadb_dialect": True,
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/mariadb":
inputs["sql"].append(
{
"name_override": "mariadb_activities",
"driver": "mysql",
"dsn": dsn,
"query": [
{
"query": """
select 'message' as cmf_activity_queue, count(*) as message_count from message
union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
""",
"field_columns_include": ["message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select 'message' as cmf_activity_queue, count(*) as failed_message_count
from message where processing_node between -2 and -10
union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
from message_queue where processing_node between -2 and -10
""",
"field_columns_include": ["failed_message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as waiting_time, 'message' as cmf_activity_queue
from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
union all
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
as waiting_time, 'message_queue' as cmf_activity_queue
from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
""",
"field_columns_include": ["waiting_time"],
"tag_columns_include": ["cmf_activity_queue"],
},
],
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/balancer":
# XXX this produces many measurements
haproxy_socket = f"{partition_root_directory}/var/run/ha.sock"
access_path_dict[haproxy_socket] = 'rw'
inputs["haproxy"].append(
{
"servers": [haproxy_socket],
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
})
urls = application.get("urls", [])
if urls:
inputs["http_response"].append({
"interval": "5m",
"urls": urls,
"tags": {"app": application["name"]},
})
for url in urls:
x509_url = url
parsed_url = urllib.parse.urlparse(url)
if parsed_url.scheme == 'https':
# x509_cert wants a port
if not parsed_url.port:
x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
inputs["x509_cert"].append({
"sources": [x509_url],
"tags": {"url": url},
"interval": "5h",
"tags": {"app": application["name"]},
})
if application.get("type") == "SlapOS":
telegraf_slapos_input_config_file = str(
pathlib.Path(self.options['location'])
/ f"telegraf-input-slapos-{application['name']}.cfg"
)
self._config_files[telegraf_slapos_input_config_file] = toml.dumps({
"inputs": {
"slapos": [{
"instance_root": application["instance-root"]}]}})
access_path_dict[f"{application['instance-root']}/sv.sock"] = 'rw'
telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
inputs["execd"].append({
"name_override": "slapos_services",
"command": [telegraf_slapos_input_command, '-config', telegraf_slapos_input_config_file],
"tags": {"app": application["name"]},
})
# drop measurements for not monitored partitions.
processors["starlark"].append({
"namepass": ["slapos_services"],
"tagpass": {"app": [application["name"]]},
"order": 1,
"source": f'''
def apply(metric):
if metric.tags.get('reference') in {list(partition_mapping)!r}:
return metric
'''
})
# telegraf-input-slapos outputs the process name as "name", but we rename
# this to "process_name", so that it is more understandable in a global
# context and because we use the name of the partition as "name" everywhere
# else.
processors["rename"].append({
"namepass": ["slapos_services"],
"tagpass": {"app": [application["name"]]},
"order": 2,
"replace": [{
"tag": "name",
"dest": "process_name",
}]})
# "normalize" slapos process names, remove hash from hash-files and -on-watch suffix
processors["regex"].append({
"namepass": ["slapos_services"],
"tagpass": {"app": [application["name"]]},
"order": 3,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-on-watch$",
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": ["slapos_services"],
"tagpass": {"app": [application["name"]]},
"order": 4,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-\\w{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
# use consistent `partition_reference` for slappart
processors["rename"].append({
"namepass": ["slapos_services"],
"tagpass": {"app": [application["name"]]},
"order": 5,
"replace": [{
"tag": "reference",
"dest": "partition_reference",
}]})
processors["enum"].append({
"namepass": ["slapos_services"],
"tagpass": {"app": [application["name"]]},
"order": 6,
"mapping": [{
"tag": "partition_reference",
"dest": "name",
"value_mappings": partition_mapping,
}]})
# add a socket input so that we can have a promise verifying that telegraf is listening
inputs['socket_listener'].append({"service_address": f"unix://{self.options['input-socket']}"})
options['access-path-dict'] = access_path_dict
self._config_files[options['output']] = toml.dumps(config)
install =
import os
os.mkdir(self.options['location'])
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
[loki-server]
recipe = slapos.recipe.build
slapparameter-dict = ${slap-configuration:configuration}
init =
loki = options['slapparameter-dict']['loki']
options['url'] = loki['url']
options['caucase-url'] = loki['caucase-url']
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
distinguished_name = dn
[ dn ]
CN = ${:cn}
L = ${slap-connection:computer-id}
O = ${slap-connection:partition-id}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promtail-client-certificate]
<= loki-client-certificate
[loki-promtail-client-certificate-csr-config]
<= loki-client-certificate-csr-config
cn = loki ${slap-connection:partition-id}@${slap-connection:computer-id}
[loki-promtail-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promtail-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promtail-client-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-promtail-client-certificate-updater',
url='${loki-server:caucase-url}',
data_dir='${directory:caucase-updater-loki-promtail-client}',
crt_path='${loki-promtail-client-certificate:cert-file}',
ca_path='${loki-promtail-client-certificate:ca-file}',
crl_path='${loki-promtail-client-certificate:crl-file}',
key_path='${loki-promtail-client-certificate:key-file}',
template_csr='${loki-promtail-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[promtail]
recipe = slapos.cookbook:wrapper
command-line = ${:nice} {{ promtail_bin }} -config.file=${promtail-config-file:location}
wrapper-path = ${directory:service}/promtail
hash-files =
${promtail-config-file:location}
# TODO: control nice of the agent ?
{% if 0 %}
nice = nice -19 chrt --idle 0 ionice -c3
{% else %}
nice =
{% endif %}
dir = ${directory:promtail-dir}
http-port = 19080
grpc-port = 19095
ip = ${instance-parameter:ipv4-random}
url = http://${:ip}:${:http-port}
[promtail-config-file]
recipe = slapos.recipe.build
location = ${directory:etc}/${:_buildout_section_name_}.yaml
slapparameter-dict = ${slap-configuration:configuration}
depends = ${loki-promtail-client-certificate:recipe}
{% raw %}
init =
import pathlib
import json
slapparameter_dict = self.options['slapparameter-dict']
slap_connection = self.buildout["slap-connection"]
loki_certificate = self.buildout['loki-promtail-client-certificate']
self._config_files = {} # files to create during install step
access_path_dict = {}
cfg = {
"server": {
"http_listen_address": self.buildout['promtail']['ip'],
"http_listen_port": int(self.buildout['promtail']['http-port']),
"grpc_listen_address": self.buildout['promtail']['ip'],
"grpc_listen_port": int(self.buildout['promtail']['grpc-port']),
"graceful_shutdown_timeout": 5,
"external_url": self.buildout['promtail']['url'],
},
"positions": {
"filename": "{}/positions.yaml".format(self.buildout['promtail']['dir']),
},
"clients": [
{
"url": "{}/loki/api/v1/push".format(self.buildout['loki-server']['url']),
"tls_config": {
"ca_file": loki_certificate['ca-file'],
"cert_file": loki_certificate['cert-file'],
"key_file": loki_certificate['key-file'],
},
# this might not be good for copytruncate option of logrotate
# see https://grafana.com/docs/loki/latest/send-data/promtail/logrotation/
"batchwait": "5s"
}
],
"scrape_configs": []
}
def get_job_selector(partition, job_name, application_name):
# make a selector in LogQL, like '{job="job_name",key="value"}'
selector_parts = []
for k, v in dict(
partition.get('static-tags', {}),
app=application_name,
job=job_name
).items():
selector_parts.append(f'{k}="{v}"')
return "{%s}" % ",".join(selector_parts)
def get_static_configs(partition, job_name, path_list, application):
if not isinstance(path_list, list):
raise ValueError(f'{path_list!r} is not a list')
partition_root_directory = ''
if partition.get('reference') and 'instance-root' in application:
instance_root = pathlib.Path(application['instance-root'])
partition_root_directory = instance_root / partition['reference']
path_list = [path.format(partition_root_directory=partition_root_directory) for path in path_list]
for path in path_list:
access_path_dict[path] = 'r'
partition_kw = {}
if partition.get('reference'):
partition_kw['partition_reference'] = partition['reference']
return [
{
"targets": [
"localhost"
],
"labels": dict(
partition.get('static-tags', {}),
job=job_name,
app=application['name'],
name=partition['name'],
computer_id=slap_connection['computer-id'],
__path__=path,
**partition_kw
)
} for path in path_list
]
for application in slapparameter_dict.get('applications', []):
for partition in application.get('partitions', []):
partition.setdefault("type", "default")
if partition['type'] in ('erp5/zope-activity', 'erp5/zope-front'):
# job name include the app name because they need to be unique
job_name = f"{application['name']}-{partition['name']}-event-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# TODO this does not seem to work well
"firstline": "^------",
"max_wait_time": "5s"
}
},
{
"regex": {
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/zope-*-event.log"],
application,
)})
if partition['type'] == 'erp5/zope-front':
job_name = f"{application['name']}-{partition['name']}-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
# drop requests for haproxy health check
"pipeline_stages": [
{
"drop": {
"expression": '.* "GET / HTTP/1.0" 200 .*'
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/zope-*-Z2.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-long-request-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}",
"max_wait_time": "5s"
}
},
{
"regex": {
"expression": "^(?P<timestamp>.*) .*"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/longrequest_logger_zope-*.log"],
application,
)})
if partition['type'] in ('erp5/mariadb', 'mariadb'):
job_name = f"{application['name']}-{partition['name']}-mariadb-slow-queries"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# between each slow query, slow query log has a first line like:
# # Time: 231008 16:29:01
# and then a second like:
# # User@Host: user[user] @ [10.0.71.207]
# but the first line is not repeated for subsequent queries that happens
# at the same second
"firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
"max_wait_time": "5s"
}
},
{
"regex": {
"expression": ".*SET timestamp=(?P<timestamp>\\d+);.*"
}
},
{
"timestamp": {
"format": "Unix",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/mariadb_slowquery.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-mariadb-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"timestamp": {
"format": "2021-06-05 3:55:31",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/mariadb_error.log"],
application,
)})
if partition['type'] == 'erp5/zeo':
job_name = f"{application['name']}-{partition['name']}-zeo-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^------",
"max_wait_time": "5s"
}
},
{
"regex": {
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None,
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/zeo-*.log"],
application,
)})
if partition['type'] == 'erp5/balancer':
job_name = f"{application['name']}-{partition['name']}-balancer-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/apache-access.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-balancer-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
["{partition_root_directory}/var/log/apache-error.log"],
application,
)})
if partition.get('log-file-patterns'):
job_name = f"{application['name']}-{partition['name']}"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
partition['log-file-patterns'],
application,
)})
self._config_files[options['location']] = json.dumps(cfg, indent=2)
options['access-path-dict'] = access_path_dict
install =
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
{% endraw %}
[promtail-listen-promise]
<= check-port-listening-promise
hostname = ${promtail:ip}
port = ${promtail:http-port}
[telegraf-listen-promise]
recipe = slapos.cookbook:wrapper
command-line =
test -S ${telegraf-config-file:input-socket}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[facl-script]
recipe = slapos.recipe.build
promtail-access-path-dict = ${promtail-config-file:access-path-dict}
telegraf-access-path-dict = ${telegraf-config-file:access-path-dict}
install =
import itertools
import os
import pathlib
import pwd
import shlex
user = pwd.getpwuid(os.getuid()).pw_name
script_code = ''
def quote_path(p):
# quote, but preserve *
p = str(p)
assert '__STAR__' not in p
p = p.replace('*', '__STAR__')
p = shlex.quote(p)
p = p.replace('__STAR__', '*')
return p
# make sure we can access the parents folders
parent_access = {}
def check_parent_access(path):
parent = path.parent
if parent != path:
parent_access[str(parent)] = 'x'
check_parent_access(parent)
for path_spec, access in itertools.chain(
options['promtail-access-path-dict'].items(),
options['telegraf-access-path-dict'].items()):
path = pathlib.Path(path_spec)
check_parent_access(path)
for path_spec, access in sorted(itertools.chain(
options['promtail-access-path-dict'].items(),
options['telegraf-access-path-dict'].items(),
parent_access.items())):
path = pathlib.Path(path_spec)
if '*' in path_spec:
script_code += f'setfacl --modify=u:{user}:rx {quote_path(path.parent)}\n'
script_code += f'setfacl --modify=u:{user}:{access} {quote_path(path)}\n'
pathlib.Path(location).write_text(script_code)
[promises]
recipe =
instance-promises =
${promtail-listen-promise:path}
${telegraf-listen-promise:wrapper-path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish.serialised
telegraf-extra-config-dir = ${telegraf:extra-config-dir}
facl-script = ${facl-script:location}
promtail-url = ${promtail:url}
{
"$schema": "https://json-schema.org/draft/2019-09/schema",
"description": "Parameters to instantiate Grafana",
"type": "object",
"additionalProperties": false,
"properties": {
"email": {
"type": "object",
"description": "Email configuration",
"additionalProperties": false,
"properties": {
"smtp-server": {
"description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
"type": "string"
},
"smtp-username": {
"description": "Username to connect to SMTP server",
"type": "string"
},
"smtp-password": {
"description": "Password to connect to SMTP server",
"type": "string"
},
"smtp-verify-ssl": {
"description": "Verify certificate of SMTP server",
"type": "boolean",
"default": true
},
"email-from-address": {
"description": "Email address used in `From:` header of emails",
"type": "string"
},
"email-from-name": {
"description": "Name used in `From:` header of emails",
"default": "Grafana",
"type": "string"
}
}
},
"frontend": {
"type": "object",
"additionalProperties": false,
"properties": {
"custom-domain": {
"description": "Custom domain to use when requesting a rapid-cdn frontend",
"type": "string",
"format": "hostname"
}
}
},
"caucase": {
"type": "object",
"description": "Caucase configuration. To connect external agents, it's required to approve their client certificates, either using an external caucase referenced as `external-caucase-url` or registering a user with `user-auto-approve-count`",
"additionalProperties": false,
"properties": {
"external-caucase-url": {
"description": "URL of a caucase instance to manage all server and clients certificates, to use instead of embedding caucase",
"type": "string",
"format": "uri"
},
"user-auto-approve-count": {
"description": "Number of users to automatically approve in the embedded caucase",
"type": "integer",
"default": 0
}
}
},
"influxdb": {
"description": "Fine tuning influxdb parameters",
"type": "object",
"additionalProperties": false,
"properties": {
"default-retention-policy-days": {
"description": "Number of days to keep metrics data",
"default": 720,
"type": "integer"
}
}
},
"loki": {
"description": "Fine tuning loki parameters",
"type": "object",
"additionalProperties": false,
"properties": {
"retention-period-days": {
"description": "Number of days to keep log data",
"default": 60,
"type": "integer"
}
}
},
"agent": {
"type": "object",
"properties": {
"applications": {
"$ref": "./instance-agent-input-schema.json#properties/applications"
}
}
}
}
}
{
"$schema": "http://json-schema.org/draft-04/schema#",
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by Grafana instantiation",
"additionalProperties": false,
"properties": {
"url": {
"description": "Shared frontend for this Grafana instance",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"grafana-username": {
......@@ -18,12 +17,12 @@
},
"grafana-url": {
"description": "IPv6 URL to access grafana",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"influxdb-url": {
"description": "IPv6 URL of influxdb HTTP endpoint",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"influxdb-database": {
......@@ -38,8 +37,23 @@
"description": "password for influxdb user",
"type": "string"
},
"telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern",
"loki-url": {
"description": "Base URL of Loki",
"format": "uri",
"type": "string"
},
"loki-caucase-url": {
"description": "URL caucase service used by Loki",
"format": "uri",
"type": "string"
},
"agent-promtail-url": {
"description": "URL of embedded server from promtail",
"format": "uri",
"type": "string"
},
"agent-facl-script": {
"description": "Path of a generated script to set ACL for the agent to access files and sockets. This might be needed depending on how slapos partitions were formatted",
"type": "string"
}
},
......
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
publish-connection-parameter
eggs-directory = {{ buildout_eggs_directory }}
develop-eggs-directory = {{ buildout_develop_eggs_directory }}
offline = true
[instance-parameter]
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[slap-configuration]
# apache-frontend reads from from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
influxdb-data-dir = ${:srv}/influxdb
grafana-dir = ${:srv}/grafana
grafana-data-dir = ${:grafana-dir}/data
grafana-logs-dir = ${:var}/log
grafana-plugins-dir = ${:grafana-dir}/plugins
grafana-provisioning-config-dir = ${:grafana-dir}/provisioning-config
grafana-provisioning-datasources-dir = ${:grafana-provisioning-config-dir}/datasources
grafana-provisioning-dashboards-dir = ${:grafana-provisioning-config-dir}/dashboards
grafana-dashboards-dir = ${:grafana-dir}/dashboards
loki-dir = ${:srv}/loki
loki-storage-filesystem-directory = ${:loki-dir}/chunks
loki-compactor-working-directory = ${:loki-dir}/compactor
srv-caucased-loki = ${:srv}/caucased/loki
backup-caucased-loki = ${:srv}/backup/caucased/loki
caucase-updater-loki-server = ${:srv}/caucase-updater/loki-server
caucase-updater-loki-promise-client = ${:srv}/caucase-updater/loki-client-promise
caucase-updater-loki-grafana-client = ${:srv}/caucase-updater/loki-client-grafana
# macros
[generate-insecure-self-signed-certificate]
# TODO: stop using this, use caucase
recipe = plone.recipe.command
command =
if [ ! -e ${:key-file} ]
then
{{ openssl_bin }} req -x509 -nodes -sha256 -days 3650 \
-subj "/C=AA/ST=X/L=X/O=Dis/CN=${:common-name}" \
-newkey rsa -keyout ${:key-file} \
-out ${:cert-file}
fi
update-command = ${:command}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout_parts_directory }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb]
ipv6 = ${instance-parameter:ipv6-random}
ipv4 = ${instance-parameter:ipv4-random}
host = ${:ipv6}
local-host = ${:ipv4}
rpc-port = 8088
http-port = 8086
url = https://[${:host}]:${:http-port}
data-dir = ${directory:influxdb-data-dir}
auth-username = ${influxdb-password:username}
auth-password = ${influxdb-password:passwd}
unix-socket = ${directory:var}/influxdb.socket
ssl-cert-file = ${influxdb-certificate:cert-file}
ssl-key-file = ${influxdb-certificate:key-file}
database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
{{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb
[influxdb-config-file]
<= config-file
context =
section influxdb influxdb
[influxdb-password]
recipe = slapos.cookbook:generate.password
username = influxdb
[influxdb-certificate]
<= generate-insecure-self-signed-certificate
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb:ipv6}
port = ${influxdb:http-port}
[influxdb-password-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ influx_bin }} -username ${influxdb:auth-username} -password ${influxdb:auth-password} -socket ${influxdb:unix-socket} -execute "CREATE USER ${influxdb:auth-username} WITH PASSWORD '${influxdb:auth-password}' WITH ALL PRIVILEGES"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-database-ready-promise]
recipe = slapos.cookbook:wrapper
command-line =
bash -c "{{ influx_bin }} \
-username ${influxdb:auth-username} \
-password ${influxdb:auth-password} \
-host [${influxdb:host}] \
-port ${influxdb:http-port} \
-unsafeSsl \
-ssl \
-execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-create-defaul-data-retention-policy-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ influx_bin }}
-username ${influxdb:auth-username}
-password ${influxdb:auth-password}
-socket ${influxdb:unix-socket}
-execute 'CREATE RETENTION POLICY "slapos-default-policy" ON "${influxdb:database}" DURATION {{ slapparameter_dict.get('influxdb', {}).get('default-retention-policy-days', 720) }}d REPLICATION 1 DEFAULT'
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[grafana]
ipv6 = ${instance-parameter:ipv6-random}
port = 8180
url = https://[${:ipv6}]:${:port}
data-dir = ${directory:grafana-data-dir}
logs-dir = ${directory:grafana-logs-dir}
plugins-dir = ${directory:grafana-plugins-dir}
provisioning-config-dir = ${directory:grafana-provisioning-config-dir}
provisioning-datasources-dir = ${directory:grafana-provisioning-datasources-dir}
provisioning-dashboards-dir = ${directory:grafana-provisioning-dashboards-dir}
admin-user = ${grafana-password:username}
admin-password = ${grafana-password:passwd}
secret-key = ${grafana-secret-key:passwd}
ssl-key-file = ${grafana-certificate:key-file}
ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
{{ grafana_bin }}
server
-config ${grafana-config-file:output}
-homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
hash-files =
${grafana-config-file:output}
hash-existing-files =
${grafana-provisioning-datasources-config-file:location}
[grafana-certificate]
<= generate-insecure-self-signed-certificate
[grafana-password]
recipe = slapos.cookbook:generate.password
username = admin
[grafana-secret-key]
recipe = slapos.cookbook:generate.password
[grafana-config-file]
<= config-file
context =
section grafana grafana
section apache_frontend apache-frontend
key slapparameter_dict slap-configuration:configuration
depends =
${grafana-provisioning-datasources-config-file:location}
${grafana-provisioning-dashboards-config-file:output}
[grafana-provisioning-datasources-config-file]
recipe = slapos.recipe.build
init =
# pre-create location, so that we can use hash-existing-files
import pathlib
datasource_file = pathlib.Path(location)
if not datasource_file.parent.exists():
datasource_file.parent.mkdir(parents=True)
if not datasource_file.exists():
datasource_file.touch()
# make sure this part is reinstalled when certificate is updated
import os
cert_mtime = -1
try:
cert_mtime = (
os.stat(options['loki-grafana-client-certificate-cert-file']).st_mtime
+ os.stat(options['loki-server-certificate-ca-file']).st_mtime
)
except FileNotFoundError:
pass
options['loki-grafana-client-certificate-cert-mtime'] = str(int(cert_mtime))
install =
import json
import os
def safe_read_file(path):
if os.path.exists(path):
with open(path) as f:
return f.read()
influxdb_data_source = {
"name": "telegraf",
"type": "influxdb",
"access": "proxy",
"url": options['influxdb-url'],
"user": options['influxdb-auth-username'],
"database": options['influxdb-database'],
"isDefault": True,
"jsonData": {
"tlsSkipVerify": True # TODO
},
"secureJsonData": {
"password": options['influxdb-auth-password'],
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False
}
loki_data_source = {
"name": "loki",
"type": "loki",
"access": "proxy",
"url": options['loki-server-url'],
"jsonData": {
"tlsAuth": True,
"tlsAuthWithCACert": True,
"maxLines": 5000, # XXX see max_entries_limit_per_query in loki config
},
"secureJsonData": {
# XXX maybe we can use file directly ?
# see https://github.com/grafana/grafana/discussions/44296#discussioncomment-2515929
"tlsCACert": safe_read_file(options['loki-server-certificate-ca-file']),
"tlsClientCert": safe_read_file(options['loki-grafana-client-certificate-cert-file']),
"tlsClientKey": safe_read_file(options['loki-grafana-client-certificate-key-file']),
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False,
}
config = {
"apiVersion": 1,
"datasources": [
influxdb_data_source,
loki_data_source,
],
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
location = ${grafana:provisioning-datasources-dir}/datasources.yaml
loki-server-url = ${loki-server:url}
loki-server-certificate-ca-file = ${loki-server-certificate:ca-file}
loki-grafana-client-certificate-cert-file = ${loki-grafana-client-certificate:cert-file}
loki-grafana-client-certificate-key-file = ${loki-grafana-client-certificate:key-file}
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-auth-username = ${influxdb:auth-username}
influxdb-auth-password = ${influxdb:auth-password}
[grafana-provisioning-dashboards-config-file]
<= config-file
rendered = ${grafana:provisioning-dashboards-dir}/dashboard.yaml
context =
key dashboards_dir directory:grafana-dashboards-dir
[grafana-listen-promise]
<= check-port-listening-promise
hostname= ${grafana:ipv6}
port = ${grafana:port}
[grafana-provisioning-datasources-config-file-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ jq_bin }} -e
"if .datasources[1].secureJsonData.tlsClientCert != null and .datasources[1].secureJsonData.tlsCACert != null then true else false end"
${grafana-provisioning-datasources-config-file:location}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[loki-server]
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
compactor-working-directory = ${directory:loki-compactor-working-directory}
path-prefix = ${directory:loki-dir}
http-port = 3100
url = https://[${:ipv6}]:${:http-port}
ipv4 = ${instance-parameter:ipv4-random}
ipv6 = ${instance-parameter:ipv6-random}
ca-file = ${loki-server-certificate:ca-file}
cert-file = ${loki-server-certificate:cert-file}
key-file = ${loki-server-certificate:key-file}
# TODO: CRL
[loki-service]
recipe = slapos.cookbook:wrapper
command-line =
{{ loki_bin }} -config.file=${loki-server-config-file:location}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
ready-url = ${loki-server:url}/ready
hash-files =
${loki-server-config-file:location}
hash-existing-files =
${loki-server-certificate:cert-file}
[loki-server-config-file]
location = ${directory:etc}/${:_buildout_section_name_}.yaml
recipe = slapos.recipe.build
install =
import json
loki_server = self.buildout['loki-server']
slapparameter_dict = self.buildout['slap-configuration']['configuration']
config = {
"auth_enabled": False,
"server": {
"http_listen_address": loki_server['ipv6'],
"http_listen_port": int(loki_server['http-port']),
"http_tls_config": {
"client_ca_file": loki_server['ca-file'],
"cert_file": loki_server['cert-file'],
"key_file": loki_server['key-file'],
"client_auth_type": "RequireAndVerifyClientCert",
},
"grpc_listen_address": loki_server['ipv4'],
"grpc_server_max_recv_msg_size": 104857600,
"grpc_server_max_send_msg_size": 104857600
},
"common": {
"instance_addr": loki_server['ipv4'],
"replication_factor": 1,
"ring": {
"instance_addr": loki_server['ipv4'],
"kvstore": {
"store": "inmemory"
}
},
"path_prefix": loki_server['path-prefix'],
},
"schema_config": {
"configs": [
{
"from": "2020-05-15",
"store": "tsdb",
"object_store": "filesystem",
"schema": "v13",
"index": {
"prefix": "index_",
"period": "24h"
}
}
]
},
"storage_config": {
"filesystem": {
"directory": loki_server['storage-filesystem-directory'],
}
},
"limits_config": {
"ingestion_rate_mb": 1024,
"ingestion_burst_size_mb": 1024,
# TODO: do we want this ? too large queries make the browser slow.
# default is 1000, but it seems we can at least raise to 5000
"max_entries_limit_per_query": 5001,
"reject_old_samples": False,
"retention_period": '{}d'.format(
slapparameter_dict.get('loki', {}).get('retention-period-days', 60))
},
"frontend_worker": {
"grpc_client_config": {
# TODO check needed
# https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
"max_send_msg_size": 268435456
}
},
"compactor": {
"working_directory": loki_server['compactor-working-directory'],
"delete_request_store": "filesystem",
"retention_enabled": True,
"retention_delete_delay": "2h",
}
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
[loki-server-certificate-init-certificate]
recipe = slapos.recipe.build
init =
# pre-create a file at the path of the certificate,
# so that we can use hash-existing-files options
import pathlib
cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
if not cert_file.parent.exists():
cert_file.parent.mkdir()
if not cert_file.exists():
cert_file.touch()
[loki-server-certificate]
init = ${loki-server-certificate-init-certificate:init}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
{{
caucase.updater(
prefix='loki-server-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-server-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-server}',
crt_path='${loki-server-certificate:cert-file}',
ca_path='${loki-server-certificate:ca-file}',
crl_path='${loki-server-certificate:crl-file}',
key_path='${loki-server-certificate:key-file}',
template_csr='${loki-server-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-server-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = loki-server
[ req_ext ]
subjectAltName = @alt_names
[ alt_names ]
IP.1 = ${loki-server:ipv4}
IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-server-certificate-prepare-csr]
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${loki-server-certificate-csr-config:output}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-server-listen-promise]
<= check-url-available-promise
url = ${loki-service:ready-url}
ca-cert-file = ${loki-server:ca-file}
cert-file = ${loki-promise-client-certificate:cert-file}
key-file = ${loki-promise-client-certificate:key-file}
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
distinguished_name = dn
[ dn ]
CN = ${:_buildout_section_name_}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promise-client-certificate]
<= loki-client-certificate
[loki-promise-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promise-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promise-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promise-client-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-promise-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promise-client}',
crt_path='${loki-promise-client-certificate:cert-file}',
ca_path='${loki-promise-client-certificate:ca-file}',
crl_path='${loki-promise-client-certificate:crl-file}',
key_path='${loki-promise-client-certificate:key-file}',
template_csr='${loki-promise-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-grafana-client-certificate]
<= loki-client-certificate
[loki-grafana-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-grafana-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-grafana-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-grafana-client-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-grafana-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-grafana-client}',
crt_path='${loki-grafana-client-certificate:cert-file}',
ca_path='${loki-grafana-client-certificate:ca-file}',
crl_path='${loki-grafana-client-certificate:crl-file}',
key_path='${loki-grafana-client-certificate:key-file}',
template_csr='${loki-grafana-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
{% if slapparameter_dict.get('caucase', {}).get('external-caucase-url') %}
[loki-caucased]
url = {{ slapparameter_dict.get('caucase', {}).get('external-caucase-url') }}
{% else %}
[loki-caucased]
port = 18080
ip = ${instance-parameter:ipv6-random}
netloc = [${:ip}]:${:port}
url = http://${:netloc}/
# service_auto_approve_count is 4 for the default:
# - server: loki
# - clients: loki promise, grafana, promtail
{{
caucase.caucased(
prefix='loki-caucased',
buildout_bin_directory=buildout_bin_directory,
caucased_path='${directory:service}/loki-caucased',
backup_dir='${directory:backup-caucased-loki}',
data_dir='${directory:srv-caucased-loki}',
netloc='${loki-caucased:netloc}',
tmp='${directory:tmp}',
service_auto_approve_count=4,
user_auto_approve_count='${loki-caucased-user-auto-approve-count:user-auto-approve-count}',
key_len=2048,
)}}
[loki-caucased-user-auto-approve-count]
user-auto-approve-count = {{ slapparameter_dict.get('caucase', {}).get('user-auto-approve-count', 0) }}
{% endif %}
[apache-frontend]
<= slap-connection
recipe = slapos.cookbook:requestoptional
name = Grafana Frontend
# XXX We have hardcoded SR URL here.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
{% if slapparameter_dict.get('frontend', {}).get('custom-domain') %}
config-custom_domain = {{ slapparameter_dict['frontend']['custom-domain'] }}
{% endif %}
return = domain secure_access
[apache-frontend-available-promise]
<= check-url-available-promise
url = ${apache-frontend:connection-secure_access}
[request-agent-config]
recipe = slapos.recipe.build
init =
slap_connection = self.buildout["slap-connection"]
configuration = self.buildout['slap-configuration']['configuration']
applications = configuration.get('agent', {}).get('applications', [])
applications.append(
# Add a default config ingesting grafana's and influxdb's logs
{
"name": "Grafana",
"type": "system",
"partitions": [
{
"name": "grafana",
"static-tags": {
"partition_reference": slap_connection['partition-id'],
},
"log-file-patterns": [
f"{self.buildout['directory']['home']}/.*_influxdb*.log",
]
},
{
"name": "influxdb",
"static-tags": {
"partition_reference": slap_connection['partition-id'],
},
"log-file-patterns": [
f"{self.buildout['directory']['home']}/.*_grafana*.log",
]
},
]
}
)
options['applications'] = applications
options['loki'] = {
'url': self.buildout['loki-server']['url'],
'caucase-url': self.buildout['loki-caucased']['url'],
}
options['influxdb'] = {
"url": self.buildout['influxdb']['url'],
"database": self.buildout['influxdb']['database'],
"username": self.buildout['influxdb']['auth-username'],
"password": self.buildout['influxdb']['auth-password'],
}
[request-slapos-partition-base]
recipe = slapos.cookbook:request.serialised
software-url = ${slap-connection:software-release-url}
server-url = ${slap-connection:server-url}
key-file = ${slap-connection:key-file}
cert-file = ${slap-connection:cert-file}
computer-id = ${slap-connection:computer-id}
partition-id = ${slap-connection:partition-id}
[request-agent]
<= request-slapos-partition-base
software-type = agent
name = agent
return = facl-script promtail-url
config-applications = ${request-agent-config:applications}
config-loki = ${request-agent-config:loki}
config-influxdb = ${request-agent-config:influxdb}
[agent-promtail-url]
recipe = slapos.cookbook:urlparse
url = ${request-agent:connection-promtail-url}
[agent-promtail-listen-promise]
<= check-port-listening-promise
hostname = ${agent-promtail-url:host}
port = ${agent-promtail-url:port}
[promises]
recipe =
instance-promises =
${influxdb-listen-promise:path}
${influxdb-password-promise:wrapper-path}
${influxdb-database-ready-promise:wrapper-path}
${influxdb-create-defaul-data-retention-policy-promise:wrapper-path}
${grafana-listen-promise:path}
${grafana-provisioning-datasources-config-file-promise:wrapper-path}
${loki-server-listen-promise:path}
${apache-frontend-available-promise:path}
${agent-promtail-listen-promise:path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish.serialised
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
influxdb-password = ${influxdb:auth-password}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki-server:url}
loki-caucase-url = ${loki-caucased:url}
url = ${apache-frontend:connection-secure_access}
agent-facl-script = ${request-agent:connection-facl-script}
agent-promtail-url = ${request-agent:connection-promtail-url}
{
"$schema": "http://json-schema.org/draft-04/schema#",
"description": "Parameters to instantiate Grafana",
"type": "object",
"additionalProperties": false,
"properties": {
"smtp-server": {
"description": "SMTP server used by grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
"type": "string"
},
"smtp-username": {
"description": "Username to connect to SMTP server",
"type": "string"
},
"smtp-password": {
"description": "Password to connect to SMTP server",
"type": "string"
},
"smtp-verify-ssl": {
"description": "Verify SSL certificate of SMTP server",
"type": "string",
"enum": [
"true",
"false"
]
},
"email-from-address": {
"description": "Email address used in From: header of emails",
"type": "string"
},
"email-from-name": {
"description": "Name used in From: header of emails",
"default": "Grafana",
"type": "string"
},
"promtail-extra-scrape-config": {
"description": "Raw promtail config (experimental parameter, see https://github.com/grafana/loki/blob/v0.3.0/docs/promtail.md#scrape-configs for detail)",
"default": "",
"type": "string"
}
}
}
[buildout]
parts =
promises
publish-connection-parameter
parts = switch-softwaretype
eggs-directory = {{ buildout['eggs-directory'] }}
develop-eggs-directory = {{ buildout['develop-eggs-directory'] }}
offline = true
[instance-parameter]
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[jinja2-template-base]
recipe = slapos.recipe.template:jinja2
filename = ${:_buildout_section_name_}.cfg
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:filename}
extensions =
jinja2.ext.do
extra-context =
context =
raw buildout_bin_directory {{ buildout['bin-directory'] }}
raw buildout_parts_directory {{ buildout['parts-directory'] }}
raw buildout_eggs_directory {{ buildout['eggs-directory'] }}
raw buildout_develop_eggs_directory {{ buildout['develop-eggs-directory'] }}
key slapparameter_dict slap-configuration:configuration
raw instance_default {{ instance_default }}
raw instance_agent {{ instance_agent }}
raw openssl_bin {{ openssl_bin }}
raw telegraf_bin {{ telegraf_bin }}
raw telegraf_input_slapos_bin {{ telegraf_input_slapos_bin }}
raw influxd_bin {{ influxd_bin }}
raw influx_bin {{ influx_bin }}
raw grafana_bin {{ grafana_bin }}
raw grafana_homepath {{ grafana_homepath }}
raw loki_bin {{ loki_bin }}
raw promtail_bin {{ promtail_bin }}
raw curl_bin {{ curl_bin }}
raw dash_bin {{ dash_bin }}
raw jq_bin {{ jq_bin }}
import-list =
file caucase context:caucase-jinja2-library
[context]
caucase-jinja2-library = {{ caucase_jinja2_library }}
[instance-default]
<= jinja2-template-base
url = {{ instance_default }}
[instance-agent]
<= jinja2-template-base
url = {{ instance_agent }}
[switch-softwaretype]
recipe = slapos.cookbook:switch-softwaretype
default = instance-default:output
RootSoftwareInstance = ${:default}
agent = instance-agent:output
[slap-configuration]
# apache-frontend reads from from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
influxdb-data-dir = ${:srv}/influxdb
grafana-dir = ${:srv}/grafana
grafana-data-dir = ${:grafana-dir}/data
grafana-logs-dir = ${:var}/log
grafana-plugins-dir = ${:grafana-dir}/plugins
grafana-provisioning-config-dir = ${:grafana-dir}/provisioning-config
grafana-provisioning-datasources-dir = ${:grafana-provisioning-config-dir}/datasources
grafana-provisioning-dashboards-dir = ${:grafana-provisioning-config-dir}/dashboards
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
loki-dir = ${:srv}/loki
loki-storage-boltdb-dir = ${:loki-dir}/index/
loki-storage-filesystem-dir = ${:loki-dir}/chunks/
promtail-dir = ${:srv}/promtail
# macros
[generate-certificate]
recipe = plone.recipe.command
command =
if [ ! -e ${:key-file} ]
then
{{ openssl_bin }} req -x509 -nodes -sha256 -days 3650 \
-subj "/C=AA/ST=X/L=X/O=Dis/CN=${:common-name}" \
-newkey rsa -keyout ${:key-file} \
-out ${:cert-file}
fi
update-command = ${:command}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout['parts-directory'] }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb]
ipv6 = ${instance-parameter:ipv6-random}
ipv4 = ${instance-parameter:ipv4-random}
host = ${:ipv6}
local-host = ${:ipv4}
rpc-port = 8088
http-port = 8086
url = https://[${:host}]:${:http-port}
data-dir = ${directory:influxdb-data-dir}
auth-username = ${influxdb-password:username}
auth-password = ${influxdb-password:passwd}
unix-socket = ${directory:var}/influxdb.socket
ssl-cert-file = ${influxdb-certificate:cert-file}
ssl-key-file = ${influxdb-certificate:key-file}
database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
nice -19 chrt --idle 0 ionice -c3 {{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb
[influxdb-config-file]
<= config-file
context =
section influxdb influxdb
[influxdb-password]
recipe = slapos.cookbook:generate.password
username = influxdb
[influxdb-certificate]
<= generate-certificate
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb:ipv6}
port = ${influxdb:http-port}
[influxdb-password-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ influx_bin }} -username ${influxdb:auth-username} -password ${influxdb:auth-password} -socket ${influxdb:unix-socket} -execute "CREATE USER ${influxdb:auth-username} WITH PASSWORD '${influxdb:auth-password}' WITH ALL PRIVILEGES"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-database-ready-promise]
recipe = slapos.cookbook:wrapper
command-line =
bash -c "{{ influx_bin }} \
-username ${influxdb:auth-username} \
-password ${influxdb:auth-password} \
-host [${influxdb:host}] \
-port ${influxdb:http-port} \
-unsafeSsl \
-ssl \
-execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[grafana]
ipv6 = ${instance-parameter:ipv6-random}
port = 8180
url = https://[${:ipv6}]:${:port}
data-dir = ${directory:grafana-data-dir}
logs-dir = ${directory:grafana-logs-dir}
plugins-dir = ${directory:grafana-plugins-dir}
provisioning-config-dir = ${directory:grafana-provisioning-config-dir}
provisioning-datasources-dir = ${directory:grafana-provisioning-datasources-dir}
admin-user = ${grafana-password:username}
admin-password = ${grafana-password:passwd}
secret-key = ${grafana-secret-key:passwd}
ssl-key-file = ${grafana-certificate:key-file}
ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
{{ grafana_bin }} -config ${grafana-config-file:output} -homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
[grafana-certificate]
<= generate-certificate
[grafana-password]
recipe = slapos.cookbook:generate.password
username = admin
[grafana-secret-key]
recipe = slapos.cookbook:generate.password
[grafana-config-file]
<= config-file
context =
section grafana grafana
section apache_frontend apache-frontend
key slapparameter_dict slap-configuration:configuration
depends =
${grafana-provisioning-config-file:output}
[grafana-provisioning-config-file]
<= config-file
output = ${grafana:provisioning-datasources-dir}/datasource.yaml
context =
section influxdb influxdb
section loki loki
[grafana-listen-promise]
<= check-port-listening-promise
hostname= ${grafana:ipv6}
port = ${grafana:port}
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
# telegraf needs influxdb to be already listening before starting
command-line =
bash -c '${influxdb-listen-promise:path} && nice -19 chrt --idle 0 ionice -c3 {{ telegraf_bin }} --config ${telegraf-config-file:output} --config-directory ${:extra-config-dir}'
wrapper-path = ${directory:service}/telegraf
[telegraf-config-file]
<= config-file
context =
section influxdb influxdb
section telegraf telegraf
[loki]
recipe = slapos.cookbook:wrapper
command-line =
bash -c 'nice -19 chrt --idle 0 ionice -c3 {{ loki_bin }} -config.file=${loki-config-file:output}'
wrapper-path = ${directory:service}/loki
storage-boltdb-dir = ${directory:loki-storage-boltdb-dir}
storage-filesystem-dir = ${directory:loki-storage-filesystem-dir}
ip = ${instance-parameter:ipv4-random}
port = 3100
grpc-port = 9095
url = http://${:ip}:${:port}
[loki-config-file]
<= config-file
context =
section loki loki
[loki-listen-promise]
<= check-url-available-promise
url = ${loki:url}/ready
[promtail]
recipe = slapos.cookbook:wrapper
command-line =
bash -c 'nice -19 chrt --idle 0 ionice -c3 {{ promtail_bin }} -config.file=${promtail-config-file:output}'
wrapper-path = ${directory:service}/promtail
dir = ${directory:promtail-dir}
http-port = 19080
grpc-port = 19095
ip = ${instance-parameter:ipv4-random}
url = http://${:ip}:${:http-port}
[promtail-config-file]
<= config-file
context =
section promtail promtail
section loki loki
key slapparameter_dict slap-configuration:configuration
[promtail-listen-promise]
<= check-port-listening-promise
hostname= ${promtail:ip}
port = ${promtail:http-port}
[apache-frontend]
<= slap-connection
recipe = slapos.cookbook:requestoptional
name = Grafana Frontend
# XXX We have hardcoded SR URL here.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
config-https-only = true
return = domain secure_access
[apache-frontend-available-promise]
<= check-url-available-promise
url = ${apache-frontend:connection-secure_access}
[promises]
recipe =
instance-promises =
${influxdb-listen-promise:path}
${influxdb-password-promise:wrapper-path}
${influxdb-database-ready-promise:wrapper-path}
${grafana-listen-promise:path}
${loki-listen-promise:path}
${promtail-listen-promise:path}
${promtail-listen-promise:path}
${apache-frontend-available-promise:path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
influxdb-password = ${influxdb:auth-password}
telegraf-extra-config-dir = ${telegraf:extra-config-dir}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki:url}
promtail-url = ${promtail:url}
url = ${apache-frontend:connection-secure_access}
auth_enabled: false
server:
http_listen_address: {{ loki['ip'] }}
http_listen_port: {{ loki['port'] }}
grpc_listen_address: {{ loki['ip'] }}
grpc_listen_port: {{ loki['grpc-port'] }}
ingester:
lifecycler:
address: {{ loki['ip'] }}
ring:
kvstore:
store: inmemory
replication_factor: 1
chunk_idle_period: 15m
schema_config:
configs:
- from: 2018-04-15
store: boltdb
object_store: filesystem
schema: v9
index:
prefix: index_
period: 168h
storage_config:
boltdb:
directory: {{ loki['storage-boltdb-dir'] }}
filesystem:
directory: {{ loki['storage-filesystem-dir'] }}
limits_config:
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
chunk_store_config:
max_look_back_period: 0
table_manager:
chunk_tables_provisioning:
inactive_read_throughput: 0
inactive_write_throughput: 0
provisioned_read_throughput: 0
provisioned_write_throughput: 0
index_tables_provisioning:
inactive_read_throughput: 0
inactive_write_throughput: 0
provisioned_read_throughput: 0
provisioned_write_throughput: 0
retention_deletes_enabled: false
retention_period: 0
server:
http_listen_address: {{ promtail['ip'] }}
http_listen_port: {{ promtail['http-port'] }}
grpc_listen_address: {{ promtail['ip'] }}
grpc_listen_port: {{ promtail['grpc-port'] }}
external_url: {{ promtail['url'] }}
positions:
filename: {{ promtail['dir'] }}/positions.yaml
clients:
- url: {{ loki['url'] }}/api/prom/push
scrape_configs:
- job_name: test
static_configs:
- targets:
- localhost
labels:
job: grafanalogs
__path__: ./var/log/*log
{{ slapparameter_dict.get('promtail-extra-scrape-config', '') }}
[buildout]
extends =
../../stack/slapos.cfg
../../stack/caucase/buildout.cfg
../../stack/nodejs.cfg
../../component/make/buildout.cfg
../../component/golang/buildout.cfg
../../component/openssl/buildout.cfg
../../component/curl/buildout.cfg
../../component/dash/buildout.cfg
../../component/jq/buildout.cfg
../../component/systemd/buildout.cfg
../../component/fluent-bit/buildout.cfg
buildout.hash.cfg
versions = versions
parts =
slapos-cookbook
instance-profile
gowork
influxdb-config-file
telegraf-config-file
grafana-config-file
grafana-provisioning-config-file
loki-config-file
promtail-config-file
grafana-provisioning-dashboards-config-file
fluent-bit
post-install-cleanup
[nodejs]
<= nodejs-14.16.0
[go_github.com_grafana_grafana]
<= go-git-package
go.importpath = github.com/grafana/grafana
repository = https://github.com/grafana/grafana
revision = v7.5.2-0-gca413c612f
revision = v10.1.2-0-g8e428858dd
[go_github.com_grafana_loki]
<= go-git-package
go.importpath = github.com/grafana/loki
repository = https://github.com/perrinjerome/loki
revision = v2.2.1-1-gda6d45f2
repository = https://github.com/grafana/loki
revision = v3.1.0-0-g935aee77e
[go_github.com_influxdata_influxdb]
<= go-git-package
......@@ -46,49 +46,80 @@ revision = v1.8.4-0-gbc8ec4384e
<= go-git-package
go.importpath = github.com/influxdata/telegraf
repository = https://github.com/influxdata/telegraf
revision = v1.17.3-0-g24a552b90b
revision = v1.28.1-0-g3ea9ffbe2
[go_github.com_perrinjerome_slapos_telegraf_input]
<= go-git-package
go.importpath = github.com/perrinjerome/telegraf-input-slapos
repository = https://github.com/perrinjerome/telegraf-input-slapos
revision = v0.0.2-0-gd4c5221
[go_github.com_prometheus_prometheus]
<= go-git-package
go.importpath = github.com/prometheus/prometheus
repository = https://github.com/prometheus/prometheus
revision = v0.41.0-0-gc0d8a56c6
# [go_github.com_jaegertracking_jaeger]
# <= go-git-package
# go.importpath = github.com/jaegertracking/jaeger
# repository = https://github.com/jaegertracking/jaeger
# revision = v1.20.0-623-gcac21f82
[gowork]
# Fails with current default golang1.18
golang = ${golang1.17:location}
install =
${go_github.com_grafana_loki:location}:./cmd/loki
${go_github.com_grafana_loki:location}:./cmd/promtail
${go_github.com_grafana_loki:location}:./clients/cmd/promtail
${go_github.com_grafana_loki:location}:./cmd/logcli
${go_github.com_influxdata_telegraf:location}:./cmd/...
${go_github.com_influxdata_influxdb:location}:./cmd/...
${go_github.com_perrinjerome_slapos_telegraf_input:location}:./...
${go_github.com_prometheus_prometheus:location}:./cmd/...
# disable cgo, to prevent loki/promtail from using go-systemd
environment =
CGO_ENABLED = 0
CGO_ENABLED=1
CGO_CFLAGS=-I${systemd:location}/include
buildflags =
-tags promtail_journal_enabled
cpkgpath =
${systemd:location}
telegraf-bin = ${:bin}/telegraf
telegraf-input-slapos-bin = ${:bin}/telegraf-input-slapos
influx-bin = ${:bin}/influx
influxd-bin = ${:bin}/influxd
grafana-bin = ${:bin}/grafana-server
grafana-bin = ${grafana:binpath}/grafana
grafana-homepath = ${grafana:homepath}
loki-bin = ${:bin}/loki
promtail-bin = ${:bin}/promtail
[post-install-cleanup]
recipe = plone.recipe.command
stop-on-error = true
# remove caches and binary files confusing software check
command =
chmod +w ${gowork.dir:directory}/pkg/mod/github.com/gabriel-vasile/mimetype@v1.4.2/testdata/ \
&& rm -rf ${gowork.dir:directory}/pkg/mod/github.com/gabriel-vasile/mimetype@v1.4.2/testdata/so.so \
&& chmod -w ${gowork.dir:directory}/pkg/mod/github.com/gabriel-vasile/mimetype@v1.4.2/testdata/ \
&& rm -rf ${buildout:directory}/.cache/
[grafana]
recipe = plone.recipe.command
command = bash -c "
cd ${:homepath} &&
. ${gowork:env.sh} &&
# Unlike the loki, grafana _needs_ CGO_ENABLED, so we override here
export CGO_ENABLED=1 &&
command = bash -ce "
cd ${:homepath} && \
. ${gowork:env.sh} && \
go install github.com/google/wire/cmd/wire@v0.5.0 && \
wire gen -tags oss ./pkg/server ./pkg/cmd/grafana-cli/runner && \
go run build.go setup && \
go run build.go build && \
${yarn:location}/bin/yarn install --pure-lockfile && \
export NODE_OPTIONS=--max_old_space_size=8192 && \
${yarn:location}/bin/yarn install --immutable && \
${yarn:location}/bin/yarn run themes:generate && \
${yarn:location}/bin/yarn run build && \
${yarn:location}/bin/yarn run plugins:build-bundled && \
# Cleanup yarn and Cypress caches
rm -rf ${buildout:directory}/.cache/Cypress/ && \
rm -rf ${buildout:directory}/.cache/yarn/
"
${yarn:location}/bin/yarn run plugins:build-bundled"
homepath = ${go_github.com_grafana_grafana:location}
# XXX "linux-amd64" is not portable here
binpath = ${go_github.com_grafana_grafana:location}/bin/linux-amd64
stop-on-error = true
[download-file-base]
......@@ -98,19 +129,22 @@ url = ${:_profile_base_location_}/${:filename}
[influxdb-config-file]
<= download-file-base
[telegraf-config-file]
<= download-file-base
[grafana-config-file]
<= download-file-base
[grafana-provisioning-config-file]
[grafana-provisioning-dashboards-config-file]
<= download-file-base
[loki-config-file]
[instance-eggs]
recipe = zc.recipe.egg
eggs =
toml
[instance-agent]
<= download-file-base
[promtail-config-file]
[instance-default]
<= download-file-base
[instance-profile]
......@@ -120,8 +154,11 @@ output = ${buildout:directory}/instance.cfg
extensions = jinja2.ext.do
context =
section buildout buildout
key instance_default instance-default:target
key instance_agent instance-agent:target
key openssl_bin openssl-output:openssl
key telegraf_bin gowork:telegraf-bin
key telegraf_input_slapos_bin gowork:telegraf-input-slapos-bin
key influxd_bin gowork:influxd-bin
key influx_bin gowork:influx-bin
key grafana_bin gowork:grafana-bin
......@@ -130,8 +167,13 @@ context =
key promtail_bin gowork:promtail-bin
key curl_bin :curl-bin
key dash_bin :dash-bin
key jq_bin :jq-bin
key caucase_jinja2_library caucase-jinja2-library:target
curl-bin = ${curl:location}/bin/curl
dash-bin = ${dash:location}/bin/dash
jq-bin = ${jq:location}/bin/jq
depends = ${instance-eggs:eggs} ${caucase-eggs:eggs}
[versions]
inotifyx = 0.2.2
toml = 0.10.2
{
"name": "Grafana",
"description": "Grafana, Telegraf and Influxdb",
"serialisation": "xml",
"description": "Grafana, Influxdb, Loki and Telegraf",
"serialisation": "json-in-xml",
"software-type": {
"default": {
"title": "Default",
"description": "Grafana, Telegraf and Influxdb in same partition",
"request": "instance-input-schema.json",
"response": "instance-output-schema.json",
"description": "Grafana, Influxdb and Loki",
"request": "instance-default-input-schema.json",
"response": "instance-default-output-schema.json",
"index": 0
},
"agent": {
"title": "Agent",
"description": "Telegraf agent sending metrics to Influxdb and Promtail agent sending logs to Loki",
"request": "instance-agent-input-schema.json",
"response": "instance-agent-output-schema.json",
"index": 0
}
}
......
# Telegraf configuration
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared plugins.
# Even if a plugin has no configuration, it must be declared in here
# to be active. Declaring a plugin means just specifying the name
# as a section with no variables. To deactivate a plugin, comment
# out the name and any variables.
# Use 'telegraf -config telegraf.toml -test' to see what metrics a config
# file would generate.
# One rule that plugins conform to is wherever a connection string
# can be passed, the values '' and 'localhost' are treated specially.
# They indicate to the plugin to use their own builtin configuration to
# connect to the local system.
# NOTE: The configuration has a few required parameters. They are marked
# with 'required'. Be sure to edit those to make this configuration work.
# Tags can also be specified via a normal map, but only one form at a time:
[tags]
# dc = "us-east-1"
# Configuration for telegraf agent
[agent]
# Default data collection interval for all plugins
interval = "10s"
# Rounds collection interval to 'interval'
# ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
# Default data flushing interval for all outputs. You should not set this below
# interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
# Jitter the flush interval by a random amount. This is primarily to avoid
# large write spikes for users running a large number of telegraf instances.
# ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"
# Run telegraf in debug mode
debug = false
# Override default hostname, if empty use os.Hostname()
hostname = ""
###############################################################################
# OUTPUTS #
###############################################################################
[outputs]
# Configuration for influxdb server to send metrics to
[outputs.influxdb]
# The full HTTP or UDP endpoint URL for your InfluxDB instance
# Multiple urls can be specified for InfluxDB cluster support.
# urls = ["udp://localhost:8089"] # UDP endpoint example
# XXX XXX XXX
#urls = ["http://localhost:8086"] # required
urls = ["{{ influxdb['url'] }}"]
insecure_skip_verify = true # because we are using a self signed certificate
# The target database for metrics (telegraf will create it if not exists)
database = "{{ influxdb['database'] }}" # required
# Precision of writes, valid values are n, u, ms, s, m, and h
# note: using second precision greatly helps InfluxDB compression
precision = "s"
# Connection timeout (for the connection with InfluxDB), formatted as a string.
# If not provided, will default to 0 (no timeout)
# timeout = "5s"
username = "{{ influxdb['auth-username'] }}"
password = "{{ influxdb['auth-password'] }}"
# Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
###############################################################################
# PLUGINS #
###############################################################################
# Read metrics about cpu usage
[cpu]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Comment this line if you want the raw CPU time metrics
drop = ["cpu_time"]
# Read metrics about memory usage
[mem]
# no configuration
[disk]
[io]
[system]
###############################################################################
# ERP5 - PLUGINS #
###############################################################################
#
# Left here as example, don't edit this file directly, but place your config
# files in {{ telegraf['extra-config-dir'] }}
#
#[mysql]
# servers = ["root@unix(/srv/slapgrid/slappart12/srv/runner/instance/slappart1/var/run/mariadb.sock)/erp5"]
#[memcached]
# # XXX kumofs does not support memcached's stat command
# servers = ["10.0.248.233:2013", "10.0.248.233:2003"]
#[haproxy]
# servers = ["http://10.0.121.162:2150/haproxy", "http://10.0.121.162:2152/haproxy"]
#[[inputs.exec]]
# commands = ["/srv/slapgrid/slappart0/bin/slapsensor /srv/slapgrid/slappart0/srv/runner/instance/etc/supervisord.conf"]
# name_suffix = "_slapos"
# interval = "5s"
###############################################################################
# SERVICE PLUGINS #
###############################################################################
......@@ -25,13 +25,15 @@
#
##############################################################################
from __future__ import unicode_literals
import functools
import io
import json
import logging
import os
import pathlib
import re
import tempfile
import textwrap
import time
import urllib.parse
import psutil
import requests
......@@ -39,10 +41,8 @@ from six.moves import configparser
from slapos.testing.testcase import makeModuleSetUpAndTestCaseClass
setUpModule, SlapOSInstanceTestCase = makeModuleSetUpAndTestCaseClass(
os.path.abspath(
os.path.join(os.path.dirname(__file__), '..', 'software.cfg')))
pathlib.Path(__file__).parent.parent / 'software.cfg')
class GrafanaTestCase(SlapOSInstanceTestCase):
......@@ -57,78 +57,97 @@ class GrafanaTestCase(SlapOSInstanceTestCase):
class TestGrafana(GrafanaTestCase):
def setUp(self):
self.grafana_url = self.computer_partition.getConnectionParameterDict(
)['grafana-url']
self.connection_params = json.loads(
self.computer_partition.getConnectionParameterDict()['_']
)
self.grafana_url = self.connection_params['grafana-url']
def test_grafana_available(self):
resp = requests.get(self.grafana_url, verify=False)
self.assertEqual(requests.codes.ok, resp.status_code)
self.assertEqual(resp.status_code, requests.codes.ok)
def test_grafana_api(self):
# check API is usable
api_org_url = '{self.grafana_url}/api/org'.format(**locals())
api_org_url = f'{self.grafana_url}/api/org'
resp = requests.get(api_org_url, verify=False)
self.assertEqual(requests.codes.unauthorized, resp.status_code)
self.assertEqual(resp.status_code, requests.codes.unauthorized)
connection_params = self.computer_partition.getConnectionParameterDict()
resp = requests.get(
api_org_url,
verify=False,
auth=requests.auth.HTTPBasicAuth(
connection_params['grafana-username'],
connection_params['grafana-password'],
self.connection_params['grafana-username'],
self.connection_params['grafana-password'],
))
self.assertEqual(requests.codes.ok, resp.status_code)
self.assertEqual(1, resp.json()['id'])
self.assertEqual(resp.status_code, requests.codes.ok)
self.assertEqual(resp.json()['id'], 1)
def test_grafana_datasource_povisinonned(self):
def test_grafana_datasource_provisioned(self):
# data sources are provisionned
connection_params = self.computer_partition.getConnectionParameterDict()
resp = requests.get(
'{self.grafana_url}/api/datasources'.format(**locals()),
get = functools.partial(
requests.get,
verify=False,
auth=requests.auth.HTTPBasicAuth(
connection_params['grafana-username'],
connection_params['grafana-password'],
))
self.assertEqual(requests.codes.ok, resp.status_code)
self.connection_params['grafana-username'],
self.connection_params['grafana-password'],
)
)
datasources_resp = get(f'{self.grafana_url}/api/datasources')
self.assertEqual(datasources_resp.status_code, requests.codes.ok)
self.assertEqual(
sorted(['influxdb', 'loki']),
sorted([ds['type'] for ds in resp.json()]))
sorted([ds['type'] for ds in datasources_resp.json()]),
sorted(['influxdb', 'loki']))
# data sources are usable
# for this we need to wait a bit, because they are only usable once
# some data has been ingested
influxdb, = [ds for ds in datasources_resp.json() if ds['type'] == 'influxdb']
loki, = [ds for ds in datasources_resp.json() if ds['type'] == 'loki']
for retry in range(16):
influxdb_health = get(f'{self.grafana_url}/api/datasources/uid/{influxdb["uid"]}/health').json()
if influxdb_health.get('status') == "OK":
break
time.sleep(retry)
self.assertEqual(influxdb_health['status'], "OK")
for retry in range(16):
loki_health = get(f'{self.grafana_url}/api/datasources/uid/{loki["uid"]}/resources/labels?start={time.time() - 1000}').json()
if loki_health.get('data'):
break
time.sleep(retry)
else:
self.fail(loki_health)
self.assertEqual(loki_health['status'], "success")
self.assertIn("app", loki_health['data'])
def test_email_disabled(self):
config = configparser.ConfigParser()
# grafana config file is like an ini file with an implicit default section
with open(
os.path.join(self.computer_partition_root_path, 'etc',
'grafana-config-file.cfg')) as f:
config.readfp(io.StringIO('[default]\n' + f.read()))
f = self.computer_partition_root_path / 'etc' / 'grafana-config-file.cfg'
config.read_file(io.StringIO('[default]\n' + f.read_text()))
self.assertEqual(config.get('smtp', 'enabled'), 'false')
class TestGrafanaEmailEnabled(GrafanaTestCase):
__partition_reference__ = 'mail'
smtp_verify_ssl = "true"
smtp_verify_ssl = True
smtp_skip_verify = "false"
@classmethod
def getInstanceParameterDict(cls):
return {
return {"_": json.dumps({
"email": {
"smtp-server": "smtp.example.com:25",
"smtp-username": "smtp_username",
"smtp-password": "smtp_password",
'smtp-verify-ssl': cls.smtp_verify_ssl,
"email-from-address": "grafana@example.com",
"email-from-name": "Grafana From Name",
}
}})}
def test_email_enabled(self):
config = configparser.ConfigParser()
with open(
os.path.join(self.computer_partition_root_path, 'etc',
'grafana-config-file.cfg')) as f:
config.readfp(io.StringIO('[default]\n' + f.read()))
f = self.computer_partition_root_path / 'etc' / 'grafana-config-file.cfg'
config.read_file(io.StringIO('[default]\n' + f.read_text()))
self.assertEqual(config.get('smtp', 'enabled'), 'true')
self.assertEqual(config.get('smtp', 'host'), 'smtp.example.com:25')
self.assertEqual(config.get('smtp', 'user'), 'smtp_username')
......@@ -139,98 +158,169 @@ class TestGrafanaEmailEnabled(GrafanaTestCase):
class TestGrafanaEmailEnabledSkipVerify(TestGrafanaEmailEnabled):
smtp_verify_ssl = "false"
smtp_verify_ssl = False
smtp_skip_verify = "true"
class TestInfluxDb(GrafanaTestCase):
def setUp(self):
self.influxdb_url = self.computer_partition.getConnectionParameterDict(
)['influxdb-url']
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
def test_influxdb_available(self):
ping_url = '{self.influxdb_url}/ping'.format(**locals())
ping_url = f'{self.influxdb_url}/ping'
resp = requests.get(ping_url, verify=False)
self.assertEqual(requests.codes.no_content, resp.status_code)
self.assertEqual(resp.status_code, requests.codes.no_content)
def test_influxdb_api(self):
query_url = '{self.influxdb_url}/query'.format(**locals())
connection_params = self.computer_partition.getConnectionParameterDict()
query_url = f'{self.influxdb_url}/query'
for i in range(10):
for i in range(16):
# retry, as it may take a little delay to create databases
resp = requests.get(
query_url,
verify=False,
params=dict(
q='SHOW DATABASES',
u=connection_params['influxdb-username'],
p=connection_params['influxdb-password']))
self.assertEqual(requests.codes.ok, resp.status_code)
u=self.connection_params['influxdb-username'],
p=self.connection_params['influxdb-password']))
self.assertEqual(resp.status_code, requests.codes.ok)
result, = resp.json()['results']
if result['series'] and 'values' in result['series'][0]:
break
time.sleep(0.5 * i)
self.assertIn(
[connection_params['influxdb-database']], result['series'][0]['values'])
[self.connection_params['influxdb-database']], result['series'][0]['values'])
class TestTelegraf(GrafanaTestCase):
__partition_reference__ = 'G'
@classmethod
def getInstanceParameterDict(cls):
parameter_dict = {
"agent": {
"applications": [
{
"name": "slapos-standalone-from-test",
"type": "SlapOS",
"instance-root": cls.slap._instance_root,
"partitions": [
{
"name": "test grafana - default partition",
"type": "default",
"reference": "G0", # XXX assumes partitions will be allocated in order
},
{
"name": "test grafana - agent partition",
"type": "default",
"reference": "G1"
},
],
},
],
},
}
return {'_': json.dumps(parameter_dict)}
def setUp(self):
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
def test_telegraf_running(self):
with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo()
process_info, = [p for p in all_process_info if 'telegraf' in p['name']]
self.assertEqual('RUNNING', process_info['statename'])
self.assertEqual(process_info['statename'], 'RUNNING')
def test_telegraf_ingest_slapos_metrics(self):
# wait for data to be ingested
time.sleep(16)
query_url = f'{self.influxdb_url}/query'
query = """
SELECT max("state")
FROM "slapos_services"
WHERE time >= now() - 5m and time <= now()
GROUP BY time(5m),
"partition_reference"::tag,
"name"::tag,
"computer_id"::tag,
"process_name"::tag
fill(null)
"""
get = functools.partial(
requests.get,
verify=False,
params=dict(
q=query,
db=self.connection_params['influxdb-database'],
u=self.connection_params['influxdb-username'],
p=self.connection_params['influxdb-password'],
),
)
for i in range(16):
resp = get(query_url)
if resp.ok and resp.json()['results'][0].get('series'):
break
time.sleep(i)
else:
self.fail(resp.text)
series = resp.json()['results'][0].get('series')
# hashes and "-on-watch" is removed from process_name
self.assertIn('grafana', [s['tags']['process_name'] for s in series])
self.assertIn('telegraf', [s['tags']['process_name'] for s in series])
self.assertIn('loki-service', [s['tags']['process_name'] for s in series])
self.assertIn('loki-grafana-client-certificate-updater', [s['tags']['process_name'] for s in series])
tags = [s['tags'] for s in series if s['tags']['partition_reference'] == 'G0'][0]
self.assertEqual(tags['name'], 'test grafana - default partition')
self.assertEqual(tags['computer_id'], self.slap._computer_id)
self.assertEqual(tags['partition_reference'], 'G0')
self.assertEqual(
{s['tags']['partition_reference'] for s in series},
{'G0', 'G1'},
)
class TestLoki(GrafanaTestCase):
@classmethod
def getInstanceParameterDict(cls):
cls._logfile = tempfile.NamedTemporaryFile(suffix='log')
return {
'promtail-extra-scrape-config':
textwrap.dedent(
r'''
- job_name: {cls.__name__}
pipeline_stages:
- match:
selector: '{{job="{cls.__name__}"}}'
stages:
- multiline:
firstline: '^\d{{4}}-\d{{2}}-\d{{2}}\s\d{{1,2}}\:\d{{2}}\:\d{{2}}\,\d{{3}}'
max_wait_time: 3s
- regex:
expression: '^(?P<timestamp>.*) - (?P<name>\S+) - (?P<level>\S+) - (?P<message>.*)'
- timestamp:
format: 2006-01-02T15:04:05Z00:00
source: timestamp
- labels:
level:
name:
static_configs:
- targets:
- localhost
labels:
job: {cls.__name__}
__path__: {cls._logfile.name}
''').format(**locals())
cls.addClassCleanup(cls._logfile.close)
parameter_dict = {
"agent": {
"applications": [
{
"name": "TestLoki",
"type": "system",
"partitions": [
{
"name": "test log file",
"log-file-patterns": [cls._logfile.name],
"static-tags": {
"testtag": "foo",
},
},
],
},
],
},
}
@classmethod
def tearDownClass(cls):
cls._logfile.close()
super(TestLoki, cls).tearDownClass()
return {'_': json.dumps(parameter_dict)}
def setUp(self):
self.loki_url = self.computer_partition.getConnectionParameterDict(
self.loki_url = json.loads(
self.computer_partition.getConnectionParameterDict()['_']
)['loki-url']
def test_loki_available(self):
self.assertEqual(
requests.codes.ok,
requests.get('{self.loki_url}/ready'.format(**locals()),
verify=False).status_code)
def test_loki_certificate_required(self):
with self.assertRaisesRegex(requests.exceptions.SSLError, 'certificate required'):
requests.get(f'{self.loki_url}/ready', verify=False)
def test_log_ingested(self):
# create a logger logging to the file that we have
......@@ -239,68 +329,47 @@ class TestLoki(GrafanaTestCase):
test_logger.propagate = False
test_logger.setLevel(logging.INFO)
test_handler = logging.FileHandler(filename=self._logfile.name)
test_handler.setFormatter(
logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
test_logger.addHandler(test_handler)
test_logger.info("testing message")
test_logger.info("testing another message")
test_logger.warning("testing warn")
# log an exception, which will be multi line in log file.
def nested1():
def nested2():
raise ValueError('boom')
nested2()
try:
nested1()
except ValueError:
test_logger.exception("testing exception")
# Check our messages have been ingested
# we retry a few times, because there's a short delay until messages are
# ingested and returned.
for i in range(60):
resp = requests.get(
'{self.loki_url}/api/prom/query?query={{job="TestLoki"}}'.format(
**locals()),
verify=False).json()
if len(resp.get('streams', [])) < 3:
time.sleep(0.5 * i)
continue
warn_stream_list = [stream for stream in resp['streams'] if 'level="WARNING"' in stream['labels']]
self.assertEqual(1, len(warn_stream_list), resp['streams'])
warn_stream, = warn_stream_list
self.assertIn("testing warn", warn_stream['entries'][0]['line'])
info_stream_list = [stream for stream in resp['streams'] if 'level="INFO"' in stream['labels']]
self.assertEqual(1, len(info_stream_list), resp['streams'])
info_stream, = info_stream_list
self.assertTrue(
[
line for line in info_stream['entries']
if "testing message" in line['line']
])
self.assertTrue(
[
line for line in info_stream['entries']
if "testing another message" in line['line']
])
error_stream_list = [stream for stream in resp['streams'] if 'level="ERROR"' in stream['labels']]
self.assertEqual(1, len(error_stream_list), resp['streams'])
error_stream, = error_stream_list
line, = [line['line'] for line in error_stream['entries']]
# this entry is multi-line
self.assertIn('testing exception\nTraceback (most recent call last):\n', line)
self.assertIn('ValueError: boom', line)
# The labels we have configued are also available
resp = requests.get(
'{self.loki_url}/api/prom/label'.format(**locals()),
verify=False).json()
self.assertIn('level', resp['values'])
self.assertIn('name', resp['values'])
test_logger.info("testing info message")
get = functools.partial(
requests.get,
cert=(
self.computer_partition_root_path / 'etc' / 'loki-promise-client-certificate.crt',
self.computer_partition_root_path / 'etc' / 'loki-promise-client-certificate.key',
),
verify=self.computer_partition_root_path / 'etc' / 'loki-server-certificate.ca.crt',
)
url = urllib.parse.urlparse(
self.loki_url
)._replace(
path="/loki/api/v1/query_range",
query=urllib.parse.urlencode({'query': '{app="TestLoki"} |= ""'}),
).geturl()
for i in range(16):
resp = get(url)
if resp.ok:
if result := resp.json().get('data', {}).get('result', []):
break
time.sleep(i)
else:
self.fail(resp.text)
self.assertEqual(
result[0]['stream'],
{
'app': 'TestLoki',
'computer_id': self.slap._computer_id,
'detected_level': 'info',
'filename': self._logfile.name,
'job': 'TestLoki-test log file',
'name': 'test log file',
'service_name': 'TestLoki',
'testtag': 'foo',
}
)
self.assertEqual(
[v[1] for v in result[0]['values']],
['testing info message'])
self.assertEqual(len(result), 1)
class TestListenInPartition(GrafanaTestCase):
......@@ -308,8 +377,17 @@ class TestListenInPartition(GrafanaTestCase):
with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo()
def canonical_process_name(process):
"""remove hash from hash-files and "on-watch"
"""
return re.sub(
r'-([a-f0-9]{32})$',
'',
process['name'].replace('-on-watch', ''),
)
self.process_dict = {
p['name'].replace('-on-watch', ''): psutil.Process(p['pid'])
canonical_process_name(p): psutil.Process(p['pid'])
for p in all_process_info if p['name'] != 'watchdog'
}
......@@ -328,13 +406,13 @@ class TestListenInPartition(GrafanaTestCase):
c.laddr for c in self.process_dict['influxdb'].connections()
if c.status == 'LISTEN'
]),
[
sorted([
(self._ipv4_address, 8088),
(self.computer_partition_ipv6_address, 8086),
],
]),
)
def test_telegraph_listen(self):
def test_telegraf_listen(self):
self.assertEqual(
[
c.laddr for c in self.process_dict['telegraf'].connections()
......@@ -346,13 +424,13 @@ class TestListenInPartition(GrafanaTestCase):
def test_loki_listen(self):
self.assertEqual(
sorted([
c.laddr for c in self.process_dict['loki'].connections()
c.laddr for c in self.process_dict['loki-service'].connections()
if c.status == 'LISTEN'
]),
[
(self._ipv4_address, 3100),
sorted([
(self._ipv4_address, 9095),
],
(self.computer_partition_ipv6_address, 3100),
]),
)
def test_promtail_listen(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment