Commit 92bdae55 authored by Romain Courteaud

Add various improvements: pack, warning, speed, url suffix, missing data

See merge request !12
parents a25dd45a 8ff446bc
......@@ -49,6 +49,9 @@ setup(
"dev": ["pytest", "black", "pyflakes", "mock", "httpretty"]
},
entry_points={
"console_scripts": ["surykatka=surykatka.cli:runSurykatka "]
"console_scripts": [
"surykatka=surykatka.cli:runSurykatka ",
"surykatkastat=surykatka.cli:runStats ",
]
},
)
......@@ -26,17 +26,29 @@ from .dns import (
expandDomainList,
getDomainIpDict,
reportDnsQuery,
packDns,
)
from .http import getRootUrl, getUrlHostname, checkHttpStatus, reportHttp
from .network import isTcpPortOpen, reportNetwork
from .http import (
getRootUrl,
getUrlHostname,
checkHttpStatus,
reportHttp,
packHttp,
)
from .network import isTcpPortOpen, reportNetwork, packNetwork
import json
import email.utils
from collections import OrderedDict
from .ssl import hasValidSSLCertificate, reportSslCertificate
from .ssl import (
hasValidSSLCertificate,
reportSslCertificate,
packSslCertificate,
)
import datetime
from email.utils import parsedate_to_datetime
__version__ = "0.5.0"
__version__ = "0.6.0"
class BotError(Exception):
......@@ -47,6 +59,94 @@ def rfc822(date):
return email.utils.format_datetime(date)
def _parseUtcDate(date_text):
    """Parse an RFC 2822 date string into a timezone-aware UTC datetime.

    ``parsedate_to_datetime`` returns a naive datetime for the ``-0000``
    offset and an aware one for any other offset.  Naive results are
    interpreted as UTC so that both kinds can be compared with each other.
    """
    parsed = parsedate_to_datetime(date_text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return parsed


def _pruneSection(status_dict, key, is_warning):
    """Keep only the entries of ``status_dict[key]`` accepted by is_warning.

    The list is filtered in place, and the key itself is removed from
    ``status_dict`` when no entry is left.
    """
    entry_list = status_dict[key]
    entry_list[:] = [entry for entry in entry_list if is_warning(entry)]
    if not entry_list:
        del status_dict[key]


def filterWarningStatus(status_dict, interval, not_critical_url_list):
    """Reduce a status report to the entries which deserve a warning.

    ``status_dict`` is modified in place; nothing is returned.  Every
    section which ends up empty is removed from the dictionary.

    Parameters:
      status_dict -- report containing the "bot_status", "dns_server",
        "dns_query", "missing_data", "http_server", "ssl_certificate" and
        "http_query" sections (a missing section raises KeyError)
      interval -- crawl interval in seconds; values below 60 are handled
        as 60
      not_critical_url_list -- urls for which failures are not reported

    What is kept:
      bot_status -- statuses at least 2 intervals old
      dns_server / http_server -- servers whose state is not "open"
        (http servers must also serve at least one critical url)
      dns_query -- queries with an empty response
      ssl_certificate -- certificates without expiration date or expiring
        within 2 weeks, for critical hostnames only
      http_query -- 404 and >= 500 responses of critical urls
    """
    # Use aware datetimes on both sides: subtracting an aware parsed date
    # from a naive "now" raises TypeError
    now = datetime.datetime.now(datetime.timezone.utc)
    interval = max(interval, 60)
    not_critical_url_set = frozenset(not_critical_url_list)
    # Warn 2 weeks before expiration
    ssl_warning_delay = 60 * 60 * 24 * 14

    def isOldBotStatus(entry):
        # Skip the bot status if it was recently triggered
        age = (now - _parseUtcDate(entry["date"])).total_seconds()
        return age >= (2 * interval)

    def isHttpServerWarning(entry):
        if entry["state"] == "open":
            return False
        prefix = {80: "http://", 443: "https://"}.get(entry["port"], "")
        # Skip if all domains lead to not critical urls
        return any(
            (prefix + domain) not in not_critical_url_set
            for domain in entry["domain"].split(", ")
        )

    def isSslWarning(entry):
        not_after = entry["not_after"]
        expires_late = (not_after is not None) and (
            ssl_warning_delay
            < (_parseUtcDate(not_after) - now).total_seconds()
        )
        # Also skip if we check only the http url
        if expires_late or (
            ("https://%s" % entry["hostname"]) in not_critical_url_set
        ):
            return False
        # Drop columns with too much info
        for column in ("not_before", "issuer", "sha1_fingerprint", "subject"):
            del entry[column]
        return True

    def isHttpQueryWarning(entry):
        http_code = entry["status_code"]
        if (http_code != 404) and (http_code < 500):
            return False
        if entry["url"] in not_critical_url_set:
            return False
        # Drop columns with too much info
        del entry["http_header_dict"]
        del entry["total_seconds"]
        return True

    _pruneSection(status_dict, "bot_status", isOldBotStatus)
    _pruneSection(
        status_dict, "dns_server", lambda entry: entry["state"] != "open"
    )
    _pruneSection(
        status_dict, "dns_query", lambda entry: entry["response"] == ""
    )
    if not status_dict["missing_data"]:
        del status_dict["missing_data"]
    _pruneSection(status_dict, "http_server", isHttpServerWarning)
    _pruneSection(status_dict, "ssl_certificate", isSslWarning)
    _pruneSection(status_dict, "http_query", isHttpQueryWarning)
class WebBot:
def __init__(self, **kw):
self.config_kw = kw
......@@ -81,6 +181,22 @@ class WebBot:
public_suffix_list=self.config["PUBLIC_SUFFIX"].split(),
)
def calculateNotCriticalUrlList(self):
    """Return the root urls which are crawled without being requested.

    For every url of the URL configuration whose hostname is not part of
    the DOMAIN configuration, both the ``http`` and ``https`` root urls of
    this hostname are returned, unless the root url is itself listed in
    the URL configuration.
    """
    checked_domain_list = self.config["DOMAIN"].split()
    checked_url_list = self.config["URL"].split()
    result_list = []
    for checked_url in checked_url_list:
        hostname = getUrlHostname(checked_url)
        if (hostname is None) or (hostname in checked_domain_list):
            # No hostname, or domain explicitly checked: nothing to skip
            continue
        # Domain not explicitly checked: skip both root urls
        root_url_list = [
            "%s://%s" % (protocol, hostname) for protocol in ("http", "https")
        ]
        result_list.extend(
            root_url
            for root_url in root_url_list
            if root_url not in checked_url_list
        )
    return result_list
def iterateLoop(self):
status_id = logStatus(self._db, "loop")
......@@ -347,12 +463,20 @@ class WebBot:
self._running = True
try:
while self._running:
previous_time = datetime.datetime.utcnow()
self.iterateLoop()
next_time = datetime.datetime.utcnow()
interval = int(self.config.get("INTERVAL"))
if interval < 0:
self.stop()
else:
time.sleep(interval)
time.sleep(
max(
0,
interval
- (next_time - previous_time).total_seconds(),
)
)
except KeyboardInterrupt:
self.stop()
except:
......@@ -360,22 +484,36 @@ class WebBot:
logStatus(self._db, "error")
raise
def pack(self):
    """Pack every change table, then vacuum the database.

    A "packing" status is logged before the work starts and a "packed"
    status once it is done.
    """
    database = self._db
    logStatus(database, "packing")
    for packTable in (packDns, packHttp, packNetwork, packSslCertificate):
        packTable(database)
    database.vacuum()
    logStatus(database, "packed")
def run(self, mode):
status_dict = None
if mode not in ["crawl", "status"]:
if mode not in ["crawl", "pack", "status", "warning"]:
raise NotImplementedError("Unexpected mode: %s" % mode)
if self.config["SQLITE"] == ":memory:":
# Crawl/report are mandatory when using memory
if mode == "warning":
mode = "wallwarning"
else:
mode = "all"
self.initDB()
try:
if mode in ["crawl", "all"]:
if mode in ["crawl", "wallwarning", "all"]:
self.crawl()
if mode in ["status", "all"]:
if mode in ["status", "all", "wallwarning", "warning"]:
status_dict = self.status()
if mode == "pack":
self.pack()
except:
self.closeDB()
raise
......@@ -383,6 +521,12 @@ class WebBot:
self.closeDB()
if status_dict is not None:
if mode in ("wallwarning", "warning"):
filterWarningStatus(
status_dict,
int(self.config.get("INTERVAL")),
self.calculateNotCriticalUrlList(),
)
if self.config["FORMAT"] == "json":
print(json.dumps(status_dict))
else:
......
......@@ -29,7 +29,7 @@ from .bot import create_bot
help="The bot operation mode to run.",
show_default=True,
default="status",
type=click.Choice(["crawl", "status"]),
type=click.Choice(["crawl", "pack", "status", "warning"]),
)
@click.option(
"--sqlite", "-s", help="The path of the sqlite DB. (default: :memory:)"
......@@ -55,6 +55,9 @@ from .bot import create_bot
default="plain",
show_default=True,
)
@click.option(
"--profile", help="Profiler data path", type=click.Path(exists=False)
)
def runSurykatka(
run,
sqlite,
......@@ -65,6 +68,7 @@ def runSurykatka(
configuration,
reload,
output,
profile,
):
mapping = {}
......@@ -85,7 +89,28 @@ def runSurykatka(
mapping["RELOAD"] = str(reload)
mapping["FORMAT"] = output
bot = create_bot(cfgfile=configuration, mapping=mapping)
if profile is None:
return bot.run(run)
else:
import cProfile
return cProfile.runctx(
"bot.run(run)", globals(), locals(), filename=profile
)
@click.command(short_help="Stats profiler bot data.")
@click.option("--stats", type=click.Choice(["cumul", "time"]))
@click.argument("profile", type=click.Path(exists=True, dir_okay=False))
def runStats(stats, profile):
    # Print the 30 first entries of a profiler data file.
    # The entries are sorted by time then calls when --stats is "time",
    # and by cumulative time in any other case.
    click.echo("Profile bot execution")
    import pstats

    if stats == "time":
        sort_key_list = ("time", "calls")
    else:
        sort_key_list = ("cumulative",)
    pstats.Stats(profile).sort_stats(*sort_key_list).print_stats(30)
if __name__ == "__main__":
......
......@@ -26,7 +26,15 @@ from playhouse.migrate import migrate, SqliteMigrator
class LogDB:
def __init__(self, sqlite_path):
self._db = SqliteExtDatabase(
sqlite_path, pragmas=(("journal_mode", "WAL"), ("foreign_keys", 1))
sqlite_path,
pragmas=(
("journal_mode", "WAL"),
("foreign_keys", 1),
("wal_autocheckpoint", 5),
("temp_store", "MEMORY"),
("synchronous", "NORMAL"),
("mmap_size", 30000000000),
),
)
self._db.connect()
......@@ -228,3 +236,6 @@ class LogDB:
def close(self):
self._db.close()
def vacuum(self):
    """Run the SQL ``VACUUM`` statement on the underlying database."""
    statement = "VACUUM"
    self._db.execute_sql(statement, [])
......@@ -52,6 +52,18 @@ def reportDnsQuery(db, resolver_ip=None, domain=None, rdtype=None):
return query
def packDns(db):
    """Delete the DNS changes superseded by a more recent one.

    For each row returned by ``reportDnsQuery``, every other DnsChange row
    sharing its resolver ip, domain and rdtype but attached to another
    status is deleted.  Everything runs inside a single atomic block.
    """
    table = db.DnsChange
    with db._db.atomic():
        # Read the whole report before starting to delete rows
        for kept in list(reportDnsQuery(db)):
            outdated = table.delete().where(
                table.status_id != kept.status_id,
                table.resolver_ip == kept.resolver_ip,
                table.domain == kept.domain,
                table.rdtype == kept.rdtype,
            )
            outdated.execute()
def logDnsQuery(db, status_id, resolver_ip, domain_text, rdtype, answer_list):
answer_list.sort()
response = ", ".join(answer_list)
......
......@@ -74,7 +74,10 @@ def request(url, timeout=TIMEOUT, headers=None, session=requests, version=0):
except requests.exceptions.ConnectionError:
response = requests.models.Response()
response.status_code = 523
except requests.exceptions.Timeout:
except (
requests.exceptions.Timeout,
requests.exceptions.ChunkedEncodingError,
):
response = requests.models.Response()
response.status_code = 524
except requests.exceptions.TooManyRedirects:
......@@ -107,6 +110,17 @@ def reportHttp(db, ip=None, url=None):
return query
def packHttp(db):
    """Delete the HTTP code changes superseded by a more recent one.

    For each row returned by ``reportHttp``, every other HttpCodeChange row
    sharing its url and ip but attached to another status is deleted.
    Everything runs inside a single atomic block.
    """
    table = db.HttpCodeChange
    with db._db.atomic():
        # Read the whole report before starting to delete rows
        for kept in list(reportHttp(db)):
            outdated = table.delete().where(
                table.status_id != kept.status_id,
                table.url == kept.url,
                table.ip == kept.ip,
            )
            outdated.execute()
def calculateSpeedRange(total_seconds, fast, moderate):
# Prevent updating the DB by defining acceptable speed range
if total_seconds == 0:
......
......@@ -56,6 +56,18 @@ def reportNetwork(db, ip=None, transport=None, port=None):
return query
def packNetwork(db):
    """Delete the network changes superseded by a more recent one.

    For each row returned by ``reportNetwork``, every other NetworkChange
    row sharing its transport, port and ip but attached to another status
    is deleted.  Everything runs inside a single atomic block.
    """
    table = db.NetworkChange
    with db._db.atomic():
        # Read the whole report before starting to delete rows
        for kept in list(reportNetwork(db)):
            outdated = table.delete().where(
                table.status_id != kept.status_id,
                table.transport == kept.transport,
                table.port == kept.port,
                table.ip == kept.ip,
            )
            outdated.execute()
def logNetwork(db, ip, transport, port, state, status_id):
with db._db.atomic():
......
......@@ -53,6 +53,18 @@ def reportSslCertificate(db, ip=None, port=None, hostname=None):
return query
def packSslCertificate(db):
    """Delete the SSL changes superseded by a more recent one.

    For each row returned by ``reportSslCertificate``, every other
    SslChange row sharing its hostname, port and ip but attached to another
    status is deleted.  Everything runs inside a single atomic block.
    """
    table = db.SslChange
    with db._db.atomic():
        # Read the whole report before starting to delete rows
        for kept in list(reportSslCertificate(db)):
            outdated = table.delete().where(
                table.status_id != kept.status_id,
                table.hostname == kept.hostname,
                table.port == kept.port,
                table.ip == kept.ip,
            )
            outdated.execute()
def logSslCertificate(
db,
ip,
......
......@@ -679,6 +679,7 @@ class SurykatkaBotStatusTestCase(unittest.TestCase):
def suite():
    """Return a test suite gathering the bot test cases.

    ``unittest.makeSuite`` is deprecated since Python 3.11 and removed in
    Python 3.13, so the default test loader is used instead.
    """
    loader = unittest.defaultTestLoader
    test_suite = unittest.TestSuite()
    for test_case in (SurykatkaBotTestCase, SurykatkaBotStatusTestCase):
        test_suite.addTest(loader.loadTestsFromTestCase(test_case))
    return test_suite
......
......@@ -23,6 +23,7 @@ import peewee
import surykatka.dns
from surykatka.dns import (
expandDomainList,
packDns,
logDnsQuery,
buildResolver,
queryDNS,
......@@ -586,6 +587,78 @@ class SurykatkaDNSTestCase(unittest.TestCase):
assert self.db.DnsChange.select().count() == 2
assert self.db.NetworkChange.select().count() == 0
################################################
# packDns
################################################
def test_packDns_oldLog(self):
    # Two different answers are logged for the same query.
    # Packing must only keep the change attached to the latest status.
    domain = "http://example.org"
    resolver_ip = "127.0.0.1"
    rdtype = "foo"
    answer_list = ["4.3.2.1", "1.2.3.4"]
    status_id = logStatus(self.db, "foo")
    logDnsQuery(
        self.db, status_id, resolver_ip, domain, rdtype, answer_list
    )
    answer_list_2 = ["4.3.2.1", "1.2.3.4", "0.0.0.0"]
    status_id_2 = logStatus(self.db, "foo")
    logDnsQuery(
        self.db, status_id_2, resolver_ip, domain, rdtype, answer_list_2
    )

    result = packDns(self.db)

    assert self.db.DnsChange.select().count() == 1
    dns_change = self.db.DnsChange.get()
    assert dns_change.resolver_ip == resolver_ip
    assert dns_change.domain == domain
    assert dns_change.rdtype == rdtype
    assert dns_change.response == "0.0.0.0, 1.2.3.4, 4.3.2.1"
    assert dns_change.status_id == status_id_2
    assert result is None
def test_packDns_keepDifferentUrl(self):
    # Every logged change differs by at least one of resolver ip, domain
    # or rdtype: packing must not delete any of them.
    domain = "http://example.org"
    domain_2 = domain + "."
    resolver_ip = "127.0.0.1"
    resolver_ip_2 = resolver_ip + "1"
    rdtype = "foo"
    rdtype_2 = rdtype + "bar"
    answer_list = ["4.3.2.1", "1.2.3.4"]
    key_list = [
        (resolver_ip, domain, rdtype),
        (resolver_ip_2, domain, rdtype),
        (resolver_ip, domain_2, rdtype),
        (resolver_ip, domain, rdtype_2),
        (resolver_ip_2, domain_2, rdtype),
        (resolver_ip_2, domain, rdtype_2),
        (resolver_ip, domain_2, rdtype_2),
        (resolver_ip_2, domain_2, rdtype_2),
    ]
    for key_resolver_ip, key_domain, key_rdtype in key_list:
        status_id = logStatus(self.db, "foo")
        logDnsQuery(
            self.db,
            status_id,
            key_resolver_ip,
            key_domain,
            key_rdtype,
            answer_list,
        )

    result = packDns(self.db)

    assert self.db.DnsChange.select().count() == 8
    assert result is None
def suite():
suite = unittest.TestSuite()
......
......@@ -27,6 +27,7 @@ from surykatka.http import (
request,
logHttpStatus,
checkHttpStatus,
packHttp,
)
from surykatka.status import logStatus
import httpretty
......@@ -151,6 +152,19 @@ class SurykatkaHttpTestCase(unittest.TestCase):
assert mock_request.call_count == 1
assert response.status_code == 524, response.status_code
def test_request_ChunkedEncodingError(self):
    # A ChunkedEncodingError raised by requests is reported as a 524
    url_to_proxy = "http://example.org/"
    httpretty.register_uri(httpretty.GET, url_to_proxy)
    chunk_error = surykatka.http.requests.exceptions.ChunkedEncodingError()

    with mock.patch("surykatka.http.requests.request") as mock_request:
        # An exception instance used as side effect is raised on call
        mock_request.side_effect = chunk_error

        response = request(url_to_proxy)
        assert mock_request.call_count == 1
        assert response.status_code == 524, response.status_code
def test_request_tooManyRedirect(self):
url_to_proxy = "http://example.org/"
httpretty.register_uri(httpretty.GET, url_to_proxy)
......@@ -1037,6 +1051,113 @@ class SurykatkaHttpTestCase(unittest.TestCase):
)
assert self.db.HttpCodeChange.get().status_id == status_id
################################################
# packHttp
################################################
def test_packHttp_dropOldLog(self):
    # Two different status codes are logged for the same ip and url.
    # Packing must only keep the change attached to the latest status.
    ip = "127.0.0.1"
    url = "http://example.org"
    status_code = 200
    http_header_dict = {"a": "b"}
    total_seconds = 0.1
    fast = 0.2
    moderate = 0.5
    status_code_2 = status_code + 1

    status_id = logStatus(self.db, "foo")
    logHttpStatus(
        self.db,
        ip,
        url,
        status_code,
        http_header_dict,
        total_seconds,
        fast,
        moderate,
        status_id,
    )
    status_id_2 = logStatus(self.db, "foo")
    logHttpStatus(
        self.db,
        ip,
        url,
        status_code_2,
        http_header_dict,
        total_seconds,
        fast,
        moderate,
        status_id_2,
    )

    result = packHttp(self.db)

    assert self.db.HttpCodeChange.select().count() == 1
    http_change = self.db.HttpCodeChange.get()
    assert http_change.ip == ip
    assert http_change.url == url
    assert http_change.status_code == status_code_2
    assert http_change.status_id == status_id_2
    assert result is None
def test_packHttp_keepDifferentUrl(self):
    # Every logged change differs by its ip or its url: packing must not
    # delete any of them.
    ip = "127.0.0.1"
    ip_2 = ip + "2"
    url = "http://example.org"
    url_2 = url + "2"
    total_seconds = 0.1
    status_code = 200
    http_header_dict = {"a": "b"}
    fast = 0.2
    moderate = 0.5

    # Same logging order as the (ip, url) combinations listed here
    for logged_url in (url, url_2):
        for logged_ip in (ip, ip_2):
            status_id = logStatus(self.db, "foo")
            logHttpStatus(
                self.db,
                logged_ip,
                logged_url,
                status_code,
                http_header_dict,
                total_seconds,
                fast,
                moderate,
                status_id,
            )

    packHttp(self.db)
    assert self.db.HttpCodeChange.select().count() == 4
def suite():
suite = unittest.TestSuite()
......
......@@ -20,7 +20,7 @@
import unittest
from surykatka.db import LogDB
import surykatka.network
from surykatka.network import logNetwork, isTcpPortOpen
from surykatka.network import logNetwork, isTcpPortOpen, packNetwork
from surykatka.status import logStatus
import mock
import peewee
......@@ -422,6 +422,59 @@ class SurykatkaNetworkTestCase(unittest.TestCase):
assert mock_socket.return_value.close.call_count == 1
################################################
# packNetwork
################################################
def test_packNetwork_oldLog(self):
    # Two different states are logged for the same ip, transport and port.
    # Packing must only keep the change attached to the latest status.
    ip = "127.0.0.1"
    port = 1234
    transport = "foobar"
    state = "bar"
    state_2 = "bar2"
    status_id = logStatus(self.db, "foo")
    status_id_2 = logStatus(self.db, "foo")
    logNetwork(self.db, ip, transport, port, state, status_id)
    logNetwork(self.db, ip, transport, port, state_2, status_id_2)

    result = packNetwork(self.db)

    assert self.db.NetworkChange.select().count() == 1
    network_change = self.db.NetworkChange.get()
    assert network_change.ip == ip
    assert network_change.port == port
    assert network_change.transport == transport
    assert network_change.state == state_2
    assert network_change.status_id == status_id_2
    assert result is None
def test_packNetwork_keepDifferentUrl(self):
    # Every logged change differs by at least one of ip, transport or
    # port: packing must not delete any of them.
    ip = "127.0.0.1"
    ip_2 = ip + "2"
    port = 1234
    port_2 = port + 1
    transport = "foobar"
    transport_2 = transport + "."
    state = "bar"
    key_list = [
        (ip, transport, port),
        (ip_2, transport, port),
        (ip, transport_2, port),
        (ip, transport, port_2),
        (ip_2, transport_2, port),
        (ip_2, transport, port_2),
        (ip, transport_2, port_2),
        (ip_2, transport_2, port_2),
    ]
    for key_ip, key_transport, key_port in key_list:
        status_id = logStatus(self.db, "foo")
        logNetwork(
            self.db, key_ip, key_transport, key_port, state, status_id
        )

    result = packNetwork(self.db)

    assert self.db.NetworkChange.select().count() == 8
    assert result is None
def suite():
suite = unittest.TestSuite()
......
......@@ -20,7 +20,11 @@
import unittest
from surykatka.db import LogDB
import surykatka.ssl
from surykatka.ssl import logSslCertificate, hasValidSSLCertificate
from surykatka.ssl import (
logSslCertificate,
hasValidSSLCertificate,
packSslCertificate,
)
from surykatka.status import logStatus
import mock
import peewee
......@@ -719,6 +723,91 @@ class SurykatkaSslTestCase(unittest.TestCase):
== 0
)
################################################
# packSslCertificate
################################################
def test_packSslCertificate_differentState(self):
    # Two different fingerprints are logged for the same ip, port and
    # hostname.  Packing must only keep the change attached to the latest
    # status.
    ip = "127.0.0.1"
    port = 1234
    hostname = "example.org"
    sha1_fingerprint = "asdfghj"
    not_before = datetime.datetime.utcnow()
    not_after = datetime.datetime.utcnow()
    subject = "foosubject"
    issuer = "barissuer"

    status_id = logStatus(self.db, "foo")
    logSslCertificate(
        self.db,
        ip,
        port,
        hostname,
        sha1_fingerprint,
        not_before,
        not_after,
        subject,
        issuer,
        status_id,
    )
    status_id_2 = logStatus(self.db, "foo")
    sha1_fingerprint_2 = sha1_fingerprint + "."
    logSslCertificate(
        self.db,
        ip,
        port,
        hostname,
        sha1_fingerprint_2,
        not_before,
        not_after,
        subject,
        issuer,
        status_id_2,
    )

    result = packSslCertificate(self.db)

    assert self.db.SslChange.select().count() == 1
    ssl_change = self.db.SslChange.get()
    assert ssl_change.ip == ip
    assert ssl_change.port == port
    assert ssl_change.hostname == hostname
    assert ssl_change.sha1_fingerprint == sha1_fingerprint_2
    assert ssl_change.status_id == status_id_2
    assert result is None
def test_packSslCertificate_keepDifferentKeys(self):
    # Every logged change differs by at least one of ip, port or hostname:
    # packing must not delete any of them.
    ip = "127.0.0.1"
    ip_2 = ip + "2"
    port = 1234
    port_2 = port + 1
    hostname = "example.org"
    hostname_2 = hostname + "."
    status_id = logStatus(self.db, "foo")
    sha1_fingerprint = "asdfghj"
    not_before = datetime.datetime.utcnow()
    not_after = datetime.datetime.utcnow()
    subject = "foosubject"
    issuer = "barissuer"
    args = [
        sha1_fingerprint,
        not_before,
        not_after,
        subject,
        issuer,
        status_id,
    ]
    key_list = [
        (ip, port, hostname),
        (ip_2, port, hostname),
        (ip, port_2, hostname),
        (ip, port, hostname_2),
        (ip_2, port_2, hostname),
        (ip_2, port, hostname_2),
        (ip, port_2, hostname_2),
        (ip_2, port_2, hostname_2),
    ]
    for key_ip, key_port, key_hostname in key_list:
        logSslCertificate(self.db, key_ip, key_port, key_hostname, *args)

    # The packing itself was never triggered by this test before, which
    # made the final count assertion meaningless
    result = packSslCertificate(self.db)

    assert self.db.SslChange.select().count() == 8
    assert result is None
def suite():
suite = unittest.TestSuite()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment