Commit 676f357a authored by Marek Vavruša, committed by 4ast

src/lua: LuaJIT BPF compiler, examples, tests (#652)

This is the initial commit of the LuaJIT bytecode to BPF
compiler project, which enables writing both the kernel
and the user part of the code in Lua
parent 07175d05
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- This example program measures the latency of block device operations and plots it
-- as a histogram. It is similar to the BPF example:
-- https://github.com/torvalds/linux/blob/master/samples/bpf/tracex3_kern.c
local ffi = require('ffi')
local bpf = require('bpf')
local S = require('syscall')
-- Shared part of the program
local bins = 100
local map = bpf.map('hash', 512, ffi.typeof('uint64_t'), ffi.typeof('uint64_t'))
local lat_map = bpf.map('array', bins)
-- Kernel-space part of the program
local trace_start = bpf.kprobe('myprobe:blk_start_request', function (ptregs)
map[ptregs.parm1] = time()
end, false, -1, 0)
local trace_end = bpf.kprobe('myprobe2:blk_account_io_completion', function (ptregs)
-- The lines below compute the histogram index
-- using log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
-- index = 29 ~ 1 usec
-- index = 59 ~ 1 msec
-- index = 89 ~ 1 sec
-- index = 99 ~ 10sec or more
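-- Worked example of the mapping: delta = 1000 ns (~1 usec) gives
-- index = 3*log2(1000) ~ 29, and delta = 1e9 ns (1 sec) gives
-- index = 3*log2(1e9) ~ 89, matching the table above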
local delta = time() - map[ptregs.parm1]
local index = 3 * math.log2(delta)
if index >= bins then
index = bins-1
end
xadd(lat_map[index], 1)
return true
end, false, -1, 0)
-- User-space part of the program
pcall(function()
local counter = 0
local sym = {' ',' ','.','.','*','*','o','o','O','O','#','#'}
while true do
-- Print header once in a while
if counter % 50 == 0 then
print('|1us |10us |100us |1ms |10ms |100ms |1s |10s')
counter = 0
end
counter = counter + 1
-- Collect all events
local hist, events = {}, 0
for i=29,bins-1 do
local v = tonumber(lat_map[i] or 0)
if v > 0 then
hist[i] = (hist[i] or 0) + v
events = events + v
end
end
-- Print histogram symbols based on relative frequency
local s = ''
for i=29,bins-1 do
if hist[i] then
local c = math.ceil((hist[i] / (events + 1)) * #sym)
s = s .. sym[c]
else s = s .. ' ' end
end
print(s .. string.format(' ; %d events', events))
S.sleep(1)
end
end)
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- Simple tracing example that executes a program on
-- return from sys_write() and tracks the number of hits
local ffi = require('ffi')
local bpf = require('bpf')
local S = require('syscall')
-- Shared part of the program
local map = bpf.map('array', 1)
-- Kernel-space part of the program
local probe = bpf.kprobe('myprobe:sys_write', function (ptregs)
xadd(map[0], 1)
end, true)
-- User-space part of the program
pcall(function()
for _ = 1, 10 do
print('hits: ', tonumber(map[0]))
S.sleep(1)
end
end)
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- Simple parsing example of UDP/DNS that counts frequency of QTYPEs.
-- It shows how to parse variable-length packet structures.
local ffi = require("ffi")
local bpf = require("bpf")
local S = require("syscall")
-- Shared part of the program
local map = assert(bpf.map('array', 256))
-- Kernel-space part of the program
local prog = bpf.socket('lo', function (skb)
local ip = pkt.ip -- Accept only UDP messages
if ip.proto ~= c.ip.proto_udp then return false end
local udp = ip.udp -- Only messages >12 octets (DNS header)
if udp.length < 12 then return false end
-- Unroll QNAME (up to 2 labels)
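-- e.g. the QNAME for 'foo.bar' is encoded on the wire as \3foo\3bar\0, so each
-- label is skipped by advancing over its length octet plus the label itself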
udp = udp.data + 12
local label = udp[0]
if label > 0 then
udp = udp + label + 1
label = udp[0]
if label > 0 then
udp = udp + label + 1
end
end
-- Track QTYPE (low types)
if udp[0] == 0 then
local qtype = udp[2] -- Low octet from QTYPE
xadd(map[qtype], 1)
end
end)
-- User-space part of the program
for _ = 1, 10 do
for k,v in map.pairs,map,0 do
v = tonumber(v)
if v > 0 then
print(string.format('TYPE%d: %d', k, v))
end
end
S.sleep(1)
end
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- Simple parsing example of TCP/HTTP that counts frequency of types of requests
-- and shows more complicated pattern matching constructions and slices.
-- Rewrite of a BCC example:
-- https://github.com/iovisor/bcc/blob/master/examples/networking/http_filter/http-parse-simple.c
local ffi = require("ffi")
local bpf = require("bpf")
local S = require("syscall")
-- Shared part of the program
local map = bpf.map('hash', 64)
-- Kernel-space part of the program
local prog = bpf.socket('lo', function (skb)
-- Only ingress so we don't count twice on loopback
if skb.ingress_ifindex == 0 then return end
local data = pkt.ip.tcp.data -- Get TCP protocol dissector
-- Continue only if we have 7 bytes of TCP data
if data + 7 > skb.len then return end
-- Fetch 4 bytes of TCP data and compare
local h = data(0, 4)
if h == 'HTTP' or h == 'GET ' or
h == 'POST' or h == 'PUT ' or
h == 'HEAD' or h == 'DELE' then
-- If hash key doesn't exist, create it
-- otherwise increment counter
local v = map[h]
if not v then map[h] = 1
else xadd(map[h], 1)
end
end
end)
-- User-space part of the program
for _ = 1, 10 do
local strkey = ffi.new('uint32_t [1]')
local s = ''
for k,v in map.pairs,map,0 do
strkey[0] = bpf.ntoh(k)
s = s..string.format('%s %d ', ffi.string(strkey, 4):match '^%s*(.-)%s*$', tonumber(v))
end
if #s > 0 then print(s..'messages') end
S.sleep(1)
end
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- This program looks at IP, UDP and ICMP packets and
-- increments counter for each packet of given type seen
-- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
local ffi = require("ffi")
local bpf = require("bpf")
local S = require("syscall")
-- Shared part of the program
local map = bpf.map('hash', 256)
map[1], map[6], map[17] = 0, 0, 0
-- Kernel-space part of the program
bpf.socket('lo', function (skb)
local proto = pkt.ip.proto -- Get byte (ip.proto) from frame at [23]
xadd(map[proto], 1) -- Atomic `map[proto] += 1`
end)
-- User-space part of the program
for _ = 1, 10 do
local icmp, udp, tcp = map[1], map[17], map[6]
print(string.format('TCP %d UDP %d ICMP %d packets',
tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
S.sleep(1)
end
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- This program counts the total bytes received per protocol in 64-bit counters.
-- The map backend is an array in this case to avoid key allocations.
-- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
local ffi = require("ffi")
local bpf = require("bpf")
local S = require("syscall")
-- Shared part of the program
local map = bpf.map('array', 256, ffi.typeof('uint32_t'), ffi.typeof('uint64_t'))
-- Kernel-space part of the program
bpf.socket('lo', function (skb)
local proto = pkt.ip.proto -- Get byte (ip.proto) from frame at [23]
xadd(map[proto], skb.len) -- Atomic `map[proto] += <payload length>`
end)
-- User-space part of the program
for _ = 1, 10 do
local icmp, udp, tcp = map[1], map[17], map[6]
print(string.format('TCP %d UDP %d ICMP %d bytes',
tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
S.sleep(1)
end
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- Summarize off-CPU time by stack trace
-- Related tool: https://github.com/iovisor/bcc/blob/master/tools/offcputime.py
local ffi = require('ffi')
local bpf = require('bpf')
local S = require('syscall')
-- Create BPF maps
-- TODO: made smaller to fit default memory limits
local key_t = 'struct { char name[16]; int32_t stack_id; }'
local starts = assert(bpf.map('hash', 128, ffi.typeof('uint32_t'), ffi.typeof('uint64_t')))
local counts = assert(bpf.map('hash', 128, ffi.typeof(key_t), ffi.typeof('uint64_t')))
local stack_traces = assert(bpf.map('stack_trace', 16))
-- Open tracepoint and attach BPF program
-- The 'arg' parameter is typed from the tracepoint format automatically
local tp = bpf.tracepoint('sched/sched_switch', function (arg)
-- Update previous thread sleep time
local pid = arg.prev_pid
local now = time()
starts[pid] = now
-- Calculate current thread's delta time
pid = arg.next_pid
local from = starts[pid]
if not from then
return 0
end
local delta = (now - from) / 1000
starts[pid] = nil
-- Check if the delta is below 1us
if delta < 1 then
return
end
-- Create key for this thread
local key = ffi.new(key_t)
comm(key.name)
key.stack_id = stack_id(stack_traces, BPF.F_FAST_STACK_CMP)
-- Update current thread off cpu time with delta
local val = counts[key]
if not val then
counts[key] = 0
end
xadd(counts[key], delta)
end, 0, -1)
-- Helper: load kernel symbols
ffi.cdef 'unsigned long long strtoull(const char *, char **, int);'
local ksyms = {}
for l in io.lines('/proc/kallsyms') do
local addr, sym = l:match '(%w+) %w (%S+)'
if addr then ksyms[ffi.C.strtoull(addr, nil, 16)] = sym end
end
-- User-space part of the program
while true do
for k,v in counts.pairs,counts,nil do
local s = ''
local traces = stack_traces[k.stack_id]
if traces then
for i, ip in ipairs(traces) do
s = s .. string.format(" %-16p %s", ip, ksyms[ip])
end
end
s = s .. string.format(" %-16s %s", "-", ffi.string(k.name))
s = s .. string.format(" %d", tonumber(v))
print(s)
end
S.sleep(1)
end
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- Trace readline() call from all bash instances (print bash commands from all running shells).
-- This is a rough equivalent of `bashreadline` with output through the perf event API.
-- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
local ffi = require('ffi')
local bpf = require('bpf')
local S = require('syscall')
-- Perf event map
local sample_t = 'struct { uint64_t pid; char str[80]; }'
local events = bpf.map('perf_event_array')
-- Kernel-space part of the program
local probe = bpf.uprobe('/bin/bash:readline', function (ptregs)
local sample = ffi.new(sample_t)
sample.pid = pid_tgid()
ffi.copy(sample.str, ffi.cast('char *', ptregs.ax)) -- Cast `ax` to string pointer and copy to buffer
perf_submit(events, sample) -- Write buffer to perf event map
end, true, -1, 0)
-- User-space part of the program
local log = events:reader(nil, 0, sample_t) -- Must specify PID or CPU_ID to observe
print(' TASK-PID TIMESTAMP FUNCTION')
print(' | | | |')
while true do
log:block() -- Wait until event reader is readable
for _,e in log:read() do -- Collect available reader events
print(string.format('%12s%-16s %-10s %s', '', tonumber(e.pid), os.date("%H:%M:%S"), ffi.string(e.str)))
end
end
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- Trace readline() call from all bash instances (print bash commands from all running shells).
-- This is a rough equivalent of `bashreadline`.
-- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
local ffi = require('ffi')
local bpf = require('bpf')
local S = require('syscall')
-- Kernel-space part of the program
local probe = bpf.uprobe('/bin/bash:readline', function (ptregs)
local line = ffi.new('char [40]') -- Create a 40 byte buffer on stack
ffi.copy(line, ffi.cast('char *', ptregs.ax)) -- Cast `ax` to string pointer and copy to buffer
print('%s\n', line) -- Print to trace_pipe
end, true, -1, 0)
-- User-space part of the program
local ok, err = pcall(function()
local log = bpf.tracelog()
print(' TASK-PID CPU# TIMESTAMP FUNCTION')
print(' | | | | |')
while true do
print(log:read())
end
end)
#!/usr/bin/env bcc-lua
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- Trace operations on keys matching a given pattern in the KyotoTycoon daemon.
-- This can show you whether certain keys were modified or read during the daemon's
-- lifetime, even if KT doesn't support this. It also shows how to attach to C++ mangled symbols.
local ffi = require('ffi')
local bpf = require('bpf')
local S = require('syscall')
local function help(err)
print(string.format('%s [get|set] [key]', arg[0]))
if err then print('error: '..err) end
os.exit(1)
end
-- Accept the same format as ktremotemgr for clarity: <get|set> <key>
local writeable, watch_key, klen = 'any', arg[2] or '*', 80
if arg[1] == 'get' then writeable = 0
elseif arg[1] == 'set' then writeable = 1
elseif arg[1] == '-h' or arg[1] == '--help' then help()
elseif arg[1] and arg[1] ~= 'any' then
help(string.format('bad cmd: "%s"', arg[1]))
end
if watch_key ~= '*' then klen = #watch_key end
-- Find a good entrypoint that sees both the key and the read/write distinction in KT;
-- it is going to serve as the attachment point for the BPF program.
-- ABI: bool accept(void *this, const char* kbuf, size_t ksiz, Visitor* visitor, bool writable)
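-- i.e. in the probe below: parm2 = kbuf, parm3 = ksiz and parm5 = writable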
local key_type = string.format('char [%d]', klen)
local probe = bpf.uprobe('/usr/local/bin/ktserver:kyotocabinet::StashDB::accept',
function (ptregs)
-- Watch either get/set or both
if writeable ~= 'any' then
if ptregs.parm5 ~= writeable then return end
end
local line = ffi.new(key_type)
ffi.copy(line, ffi.cast('char *', ptregs.parm2))
-- Check if we're looking for a specific key
if watch_key ~= '*' then
if ptregs.parm3 ~= klen then return false end
if line ~= watch_key then return false end
end
print('%s write:%d\n', line, ptregs.parm5)
end, false, -1, 0)
-- User-space part of the program
local ok, err = pcall(function()
local log = bpf.tracelog()
print(' TASK-PID CPU# TIMESTAMP FUNCTION')
print(' | | | | |')
while true do
print(log:read())
end
end)
@@ -4,7 +4,8 @@ find_program(LUAJIT luajit)
if (LUAJIT_LIBRARIES AND LUAJIT)
FILE(GLOB_RECURSE SRC_LUA
${CMAKE_CURRENT_SOURCE_DIR}/bcc/*.lua
${CMAKE_CURRENT_SOURCE_DIR}/bcc/vendor/*.lua
${CMAKE_CURRENT_SOURCE_DIR}/bpf/*.lua)
ADD_CUSTOM_COMMAND(
OUTPUT bcc.lua
Lua Tools for BCC
-----------------
This directory contains Lua tooling for [BCC][bcc]
(the BPF Compiler Collection).
BCC is a toolkit for creating userspace and kernel tracing programs.
@@ -52,3 +52,104 @@ The following instructions assume Ubuntu 14.04 LTS.
```
sudo ./bcc-probe examples/lua/task_switch.lua
```
## LuaJIT BPF compiler
Now it is also possible to write Lua functions and compile them transparently to BPF bytecode; here is a simple socket filter example:
```lua
local S = require('syscall')
local bpf = require('bpf')
local map = bpf.map('array', 256)
-- Kernel-space part of the program
local prog = assert(bpf(function ()
local proto = pkt.ip.proto -- Get byte (ip.proto) from frame at [23]
xadd(map[proto], 1) -- Increment packet count
end))
-- User-space part of the program
local sock = assert(bpf.socket('lo', prog))
for i=1,10 do
local icmp, udp, tcp = map[1], map[17], map[6]
print('TCP', tcp, 'UDP', udp, 'ICMP', icmp, 'packets')
S.sleep(1)
end
```
The other application of BPF programs is attaching to probes for [perf event tracing][tracing]. That means you can trace events inside the kernel (or user space) and then collect the results - for example a histogram of `sendto()` latency, off-CPU time stack traces, syscall latency, and so on. While kernel probes and perf events have an unstable ABI, with a dynamic language we can create and use the proper types based on the tracepoint ABI at runtime.
The runtime automatically recognizes reads that need a helper to be accessed. Type casts denote the source of the objects; for example, the [bashreadline][bashreadline] example below prints entered bash commands from all running shells:
```lua
local ffi = require('ffi')
local bpf = require('bpf')
-- Perf event map
local sample_t = 'struct { uint64_t pid; char str[80]; }'
local events = bpf.map('perf_event_array')
-- Kernel-space part of the program
bpf.uprobe('/bin/bash:readline', function (ptregs)
local sample = ffi.new(sample_t)
sample.pid = pid_tgid()
ffi.copy(sample.str, ffi.cast('char *', ptregs.ax)) -- Cast `ax` to string pointer and copy to buffer
perf_submit(events, sample) -- Write sample to perf event map
end, true, -1, 0)
-- User-space part of the program
local log = events:reader(nil, 0, sample_t) -- Must specify PID or CPU_ID to observe
while true do
log:block() -- Wait until event reader is readable
for _,e in log:read() do -- Collect available reader events
print(tonumber(e.pid), ffi.string(e.str))
end
end
```
The cast to `struct pt_regs` flags the data source as probe arguments, which means any pointer derived
from this structure points into the kernel and a helper is needed to access it. Casting `ptregs.ax` to a pointer is then required for `ffi.copy` semantics; otherwise it would be treated as a `u64` and only its value would be
copied. Type detection is automatic most of the time (socket filters and `bpf.tracepoint`), but not with uprobes and kprobes.
### Installation
```bash
$ luarocks install bpf
```
### Examples
See `examples/lua` directory.
### Helpers
* `print(...)` is a wrapper for `bpf_trace_printk`; the output can be read from `/sys/kernel/debug/tracing/trace_pipe`
* `bit.*` library **is** supported (`lshift, rshift, arshift, bnot, band, bor, bxor`)
* `math.*` library *partially* supported (`log2, log, log10`)
* `ffi.cast()` is implemented (including structures and arrays)
* `ffi.new(...)` allocates memory on stack, initializers are NYI
* `ffi.copy(...)` copies memory (possibly using helpers) between stack/kernel/registers
* `ntoh(x[, width])` - convert from network to host byte order.
* `hton(x[, width])` - convert from host to network byte order.
* `xadd(dst, inc)` - atomic add, i.e. `*dst += inc` as if Lua had a `+=` operator
Below is a list of BPF-specific helpers (a usage sketch follows the list):
* `time()` - return current monotonic time in nanoseconds (uses `bpf_ktime_get_ns`)
* `cpu()` - return current CPU number (uses `bpf_get_smp_processor_id`)
* `pid_tgid()` - return caller `tgid << 32 | pid` (uses `bpf_get_current_pid_tgid`)
* `uid_gid()` - return caller `gid << 32 | uid` (uses `bpf_get_current_uid_gid`)
* `comm(var)` - write current process name (uses `bpf_get_current_comm`)
* `perf_submit(map, var)` - submit variable to perf event array BPF map
* `stack_id(map, flags)` - return stack trace identifier from stack trace BPF map
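For illustration, here is a minimal usage sketch (not part of this commit) that combines several of these helpers in a kprobe; the `myprobe:sys_write` probe name follows the convention of the examples in `examples/lua`:
```lua
local ffi = require('ffi')
local bpf = require('bpf')
-- Map from (tgid << 32 | pid) to the last entry timestamp
local map = bpf.map('hash', 64, ffi.typeof('uint64_t'), ffi.typeof('uint64_t'))
bpf.kprobe('myprobe:sys_write', function (ptregs)
    local id = pid_tgid()    -- uses bpf_get_current_pid_tgid
    map[id] = time()         -- monotonic nanoseconds, uses bpf_ktime_get_ns
    print('cpu %d\n', cpu()) -- bpf_trace_printk, read from trace_pipe
end, false, -1, 0)
```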
### Current state
* Not all LuaJIT bytecode opcodes are supported *(notable mentions below)*
* Closures `UCLO` will probably never be supported, although you can use upvalues inside a compiled function (see the sketch after this list)
* Type narrowing is opportunistic. Numbers are 64-bit by default, but 64-bit immediate loads are not supported (e.g. `local x = map[ffi.cast('uint64_t', 1000)]`)
* Tail calls `CALLT` and iterators `ITERI` are NYI (as of now)
* Arbitrary ctype **is** supported both for map keys and values
* Basic optimisations like constant propagation, partial DCE, liveness analysis and speculative register allocation are implemented, but there is no control flow analysis yet. This means the compiler can see when values are used and when dead stores occur, but there is no rewriter pass to eliminate them.
* No register sub-allocations, no aggressive use of caller-saved `R1-5`, no aggressive narrowing (this would require variable range assertions and variable relationships)
* Slices with lengths other than 1/2/4/8 bytes are NYI (this requires allocating memory on the stack and using a pointer type)
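As a concrete instance of the upvalue support mentioned in the list above, every example in this document captures its BPF map from the enclosing scope; a minimal sketch:
```lua
local bpf = require('bpf')
local map = bpf.map('array', 1)
-- `map` is an upvalue of the compiled function, not a parameter
local prog = assert(bpf(function ()
    xadd(map[0], 1)
end))
```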
[bcc]: https://github.com/iovisor/bcc
[tracing]: http://www.brendangregg.com/blog/2016-03-05/linux-bpf-superpowers.html
[bashreadline]: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
@@ -15,21 +15,25 @@ limitations under the License.
]]
local ffi = require("ffi")
-- Avoid duplicate declarations if syscall library is present
local has_syscall, _ = pcall(require, "syscall")
if not has_syscall then
ffi.cdef [[
typedef int clockid_t;
typedef long time_t;
struct timespec {
time_t tv_sec;
long tv_nsec;
};
int clock_gettime(clockid_t clk_id, struct timespec *tp);
int clock_nanosleep(clockid_t clock_id, int flags,
const struct timespec *request, struct timespec *remain);
]]
end
ffi.cdef [[
int get_nprocs(void);
uint64_t strtoull(const char *nptr, char **endptr, int base);
]]
package = "bpf"
version = "scm-1"
source = {
url = "git://github.com/iovisor/bcc.git"
}
description = {
summary = "BCC - LuaJIT to BPF compiler.",
detailed = [[
]],
homepage = "https://github.com/iovisor/bcc",
license = "BSD"
}
dependencies = {
"lua >= 5.1",
"ljsyscall >= 0.12",
}
external_dependencies = {
LIBELF = {
library = "elf"
}
}
build = {
type = "builtin",
install = {
bin = {
}
},
modules = {
bpf = "src/lua/bpf/bpf.lua",
["bpf.builtins"] = "src/lua/bpf/builtins.lua",
["bpf.cdef"] = "src/lua/bpf/cdef.lua",
["bpf.elf"] = "src/lua/bpf/elf.lua",
["bpf.init"] = "src/lua/bpf/init.lua",
["bpf.ljbytecode"] = "src/lua/bpf/ljbytecode.lua",
["bpf.proto"] = "src/lua/bpf/proto.lua",
}
}
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- LuaJIT to BPF bytecode compiler.
--
-- The code generation phase is currently one-pass and produces:
-- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt)
-- * Variables with liveness analysis and other meta (spill information, compile-time value)
--
-- The code generator optimises as much as possible in single pass:
-- * Fold compile-time expressions and constant propagation
-- * Basic control flow analysis with dead code elimination (based on compile-time expressions)
-- * Single-pass optimistic register allocation
--
-- The first pass doesn't have variable lifetime visibility yet, so it relies on a rewriter pass for
-- further optimisations such as:
-- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used)
-- * Common sub-expression elimination (relies on DCE and liveness analysis)
-- * Orphan JMP elimination (removing this in first pass would break previous JMP targets)
-- * Better register allocation (needs to be recomputed after optimisations)
local ffi = require('ffi')
local bit = require('bit')
local S = require('syscall')
local bytecode = require('bpf.ljbytecode')
local cdef = require('bpf.cdef')
local proto = require('bpf.proto')
local builtins = require('bpf.builtins')
-- Constants
local ALWAYS, NEVER = -1, -2
local BPF = ffi.typeof('struct bpf')
local HELPER = ffi.typeof('struct bpf_func_id')
-- Symbolic table of constant expressions over numbers
local const_expr = {
ADD = function (a, b) return a + b end,
SUB = function (a, b) return a - b end,
DIV = function (a, b) return a / b end,
MOD = function (a, b) return a % b end,
JEQ = function (a, b) return a == b end,
JNE = function (a, b) return a ~= b end,
JGE = function (a, b) return a >= b end,
JGT = function (a, b) return a > b end,
}
local const_width = {
[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
}
-- Built-ins that are strict only (never compile-time expandable)
local builtins_strict = {
[ffi.new] = true,
[print] = true,
}
-- Return struct member size/type (requires LuaJIT 2.1+)
-- I am ashamed that there's no easier way around it.
local function sizeofattr(ct, name)
if not ffi.typeinfo then error('LuaJIT 2.1+ is required for ffi.typeinfo') end
local cinfo = ffi.typeinfo(ct)
while true do
cinfo = ffi.typeinfo(cinfo.sib)
if not cinfo then return end
if cinfo.name == name then break end
end
local size = math.max(1, ffi.typeinfo(cinfo.sib or ct).size - cinfo.size)
-- Guess type name
return size, builtins.width_type(size)
end
-- Return true if the constant part is a proxy
local function is_proxy(x)
return type(x) == 'table' and (x.__dissector or x.__map or x.__base)
end
-- Create compiler closure
local function create_emitter(env, stackslots, params, param_types)
local V = {} -- Variable tracking / register allocator
local code = { -- Generated code
pc = 0, bc_pc = 0,
insn = ffi.new('struct bpf_insn[4096]'),
fixup = {},
reachable = true,
seen_cmp = nil,
}
local Vstate = {} -- Track variable layout at basic block exits
-- Anything below this stack offset is free to use by caller
-- @note: There is no tracking memory allocator, so the caller may
-- lower it for persistent objects, but such memory will never
-- be reclaimed and the caller is responsible for resetting stack
-- top whenever the memory below is free to be reused
local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t')
local function emit(op, dst, src, off, imm)
local ins = code.insn[code.pc]
ins.code = op
ins.dst_reg = dst
ins.src_reg = src
ins.off = off
ins.imm = imm
code.pc = code.pc + 1
end
local function reg_spill(var)
local vinfo = V[var]
vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width)
emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0)
vinfo.reg = nil
end
local function reg_fill(var, reg)
local vinfo = V[var]
assert(vinfo.spill, 'attempt to fill register with a VAR that isn\'t spilled')
emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0)
vinfo.reg = reg
vinfo.spill = nil
end
-- Allocate a register (lazy simple allocator)
local function reg_alloc(var, reg)
-- Specific register requested, must spill/move existing variable
if reg then
for k,v in pairs(V) do -- Spill any variable that has this register
if v.reg == reg and not v.shadow then
reg_spill(k)
break
end
end
return reg
end
-- Find free or least recently used slot
local last, last_seen, used = nil, 0xffff, 0
for k,v in pairs(V) do
if v.reg then
if not v.live_to or v.live_to < last_seen then
last, last_seen = k, v.live_to or last_seen
end
used = bit.bor(used, bit.lshift(1, v.reg))
end
end
-- Attempt to select a free register from R7-R9 (callee saved)
local free = bit.bnot(used)
if bit.band(free, 0x80) ~= 0 then reg = 7
elseif bit.band(free,0x100) ~= 0 then reg = 8
elseif bit.band(free,0x200) ~= 0 then reg = 9
end
-- Select another variable to be spilled
if not reg then
assert(last)
reg = V[last].reg
reg_spill(last)
end
assert(reg, 'VAR '..var..' fill/spill failed')
return reg
end
-- Set new variable
local function vset(var, reg, const, vtype)
-- Must materialise all variables shadowing this variable slot, as it will be overwritten
if V[var] and V[var].reg then
for _, vinfo in pairs(V) do
-- Shadowing variable MUST share the same type and attributes,
-- but the register assignment may have changed
if vinfo.shadow == var then
vinfo.reg = V[var].reg
vinfo.shadow = nil
end
end
end
-- Get precise type for CDATA or attempt to narrow numeric constant
if not vtype and type(const) == 'cdata' then vtype = ffi.typeof(const) end
V[var] = {reg=reg, const=const, type=vtype}
end
-- Materialize (or register) a variable in a register
-- If the register is nil, then a new register is assigned (if not already assigned)
local function vreg(var, reg, reserve, vtype)
local vinfo = V[var]
assert(vinfo, 'VAR '..var..' not registered')
vinfo.live_to = code.pc-1
if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end
reg = reg_alloc(var, reg)
-- Materialize variable shadow copy
local src = vinfo
while src.shadow do src = V[src.shadow] end
if reserve then
-- No load to register occurs
elseif src.reg then
emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0)
elseif src.spill then
vinfo.spill = src.spill
reg_fill(var, reg)
elseif src.const then
vtype = vtype or src.type
if type(src.const) == 'table' and src.const.__base then
-- Load pointer type
emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0)
emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base)
elseif type(src.const) == 'table' and src.const.__dissector then
-- Load dissector offset (imm32), but keep the constant part (dissector proxy)
emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0)
elseif vtype and ffi.sizeof(vtype) == 8 then
-- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const))
emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16)))
vinfo.const = nil -- The variable is live
else
emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const)
vinfo.const = nil -- The variable is live
end
else assert(false, 'VAR '..var..' has neither register nor constant value') end
vinfo.reg = reg
vinfo.shadow = nil
vinfo.live_from = code.pc-1
vinfo.type = vtype or vinfo.type
return reg
end
-- Copy variable
local function vcopy(dst, src)
if dst == src then return end
V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type}
end
-- Dereference variable of pointer type
local function vderef(dst_reg, src_reg, vtype)
-- Dereference map pointers for primitive types
-- BPF doesn't allow pointer arithmetics, so use the entry value
local w = ffi.sizeof(vtype)
assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes')
if dst_reg ~= src_reg then
emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0) -- dst = src
end
emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0) -- if (src != NULL)
emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) -- dst = *src;
end
-- Allocate a space for variable
local function valloc(size, blank)
local base = stack_top
assert(stack_top + size < 512 * 1024, 'exceeded maximum stack size of 512kB')
stack_top = stack_top + size
-- Align to 8 byte boundary
stack_top = math.ceil(stack_top/8)*8
-- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK
-- so we always need to have memory initialized, remove this when supported
if blank then
if type(blank) == 'string' then
local sp = 0
while sp < size do
-- TODO: no BPF_ST + BPF_DW instruction yet
local as_u32 = ffi.new('uint32_t [1]')
local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32))
ffi.copy(as_u32, sub, #sub)
emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0])
sp = sp + ffi.sizeof(as_u32)
end
elseif type(blank) == 'boolean' then
reg_alloc(stackslots, 0)
emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
for sp = base+8,stack_top,8 do
emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0)
end
else error('NYI: fill with unknown type '..type(blank)) end
end
return stack_top
end
-- Emit compensation code at the end of basic block to unify variable set layout on all block exits
-- 1. we need to free registers by spilling
-- 2. fill registers to match other exits from this BB
local function bb_end(Vcomp)
for i,v in pairs(V) do
if Vcomp[i] and Vcomp[i].spill and not v.spill then
reg_spill(i)
end
end
for i,v in pairs(V) do
if Vcomp[i] and Vcomp[i].reg and not v.reg then
vreg(i, Vcomp[i].reg)
end
end
end
local function LD_ABS(dst, off, w)
local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
-- assert(w < 8, 'NYI: LD_ABS64 is not supported') -- IMM64 has two IMM32 insns fused together
emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off)
end
local function LD_IND(dst, src, w, off)
local src_reg = vreg(src) -- Must materialize first in case dst == src
local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
emit(BPF.LD + BPF.IND + const_width[w], dst_reg, src_reg, 0, off or 0)
end
local function LD_FIELD(a, d, w, imm)
if imm then
LD_ABS(a, imm, w)
else
LD_IND(a, d, w)
end
end
-- @note: This is specialised for now, as it expects the registers to be already reserved
local function LD_IMM_X(dst_reg, src_type, imm, w)
if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm))
-- Must shift in two steps as bit.lshift supports [0..31]
emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.lshift(bit.lshift(imm, 16), 16)))
else
emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm)
end
end
local function LOAD(dst, src, off, vtype)
local base = V[src].const
assert(base.__dissector, 'NYI: load() on variable without a dissector')
-- Cast to different type if requested
vtype = vtype or base.__dissector
local w = ffi.sizeof(vtype)
assert(w <= 4, 'NYI: load() supports 1/2/4 bytes at a time only')
if base.off then -- Absolute address to payload
LD_ABS(dst, off + base.off, w)
else -- Indirect address to payload
LD_IND(dst, src, w, off)
end
V[dst].type = vtype
V[dst].const = nil -- Dissected value is not constant anymore
end
local function CMP_STR(a, b, op)
assert(op == 'JEQ' or op == 'JNE', 'NYI: stack/string comparison supports only == or ~=')
-- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP
-- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ...
-- EQ(a,b) <=> X == 0
-- This could be optimised by placing early exits by rewriter in second phase for long strings
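-- Worked example: on a little-endian host, comparing a buffer against 'GET '
-- loads the 4-byte chunk as imm32 0x20544547, XORs it with the stack word,
-- and ORs the result into the accumulator; acc == 0 iff all chunks matched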
local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type))
local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1)
local sp = 0
emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0)
while sp < size do
-- Load string chunk as imm32
local as_u32 = ffi.new('uint32_t [1]')
local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32))
ffi.copy(as_u32, sub, #sub)
-- TODO: make this faster by interleaved load/compare steps with DW length
emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0)
emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0])
emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0)
sp = sp + ffi.sizeof(as_u32)
end
emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0)
code.seen_cmp = code.pc-1
end
local function CMP_REG(a, b, op)
-- Fold compile-time expressions
if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER
else
-- Comparison against compile-time string or stack memory
if V[b].const and type(V[b].const) == 'string' then
return CMP_STR(a, V[b].const, op)
end
-- The 0xFFFF target here has no significance; it's just a placeholder for
-- the compiler to replace an absolute offset to a LJ bytecode insn with a relative
-- offset in the BPF program code. The verifier will accept only programs with valid JMP targets.
local a_reg, b_reg = vreg(a), vreg(b)
-- Migrate operands from R0-5 as it will be spilled in compensation code when JMP out of BB
if a_reg == 0 then a_reg = vreg(a, 7) end
emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0)
code.seen_cmp = code.pc-1
end
end
local function CMP_IMM(a, b, op)
if V[a].const and not is_proxy(V[a].const) then -- Fold compile-time expressions
code.seen_cmp = const_expr[op](V[a].const, b) and ALWAYS or NEVER
else
-- Convert imm32 to number
if type(b) == 'string' then
if #b == 1 then b = b:byte()
elseif cdef.isptr(V[a].type) then
-- String comparison between stack/constant string
return CMP_STR(a, b, op)
elseif #b <= 4 then
-- Convert to u32 with network byte order
local imm = ffi.new('uint32_t[1]')
ffi.copy(imm, b, #b)
b = builtins.hton(imm[0])
else error('NYI: compare register with string, where #string > sizeof(u32)') end
end
-- The 0xFFFF target here has no significance; it's just a placeholder for
-- the compiler to replace an absolute offset to a LJ bytecode insn with a relative
-- offset in the BPF program code. The verifier will accept only programs with valid JMP targets.
local reg = vreg(a)
-- Migrate operands from R0-5 as it will be spilled in compensation code when JMP out of BB
if reg == 0 then reg = vreg(a, 7) end
emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b)
code.seen_cmp = code.pc-1
end
end
local function ALU_IMM(dst, a, b, op)
-- Fold compile-time expressions
if V[a].const and not is_proxy(V[a].const) then
assert(type(V[a].const) == 'number', 'VAR '..a..' must be numeric')
vset(dst, nil, const_expr[op](V[a].const, b))
-- Now we need to materialize dissected value at DST, and add it
else
vcopy(dst, a)
local dst_reg = vreg(dst)
if cdef.isptr(V[a].type) then
vderef(dst_reg, dst_reg, V[a].const.__dissector)
V[dst].type = V[a].const.__dissector
else
V[dst].type = V[a].type
end
emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b)
end
end
local function ALU_REG(dst, a, b, op)
-- Fold compile-time expressions
if V[a].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
assert(type(V[a].const) == 'number', 'VAR '..a..' must be numeric')
assert(type(V[b].const) == 'number', 'VAR '..b..' must be numeric')
if type(op) == 'string' then op = const_expr[op] end
vcopy(dst, a)
V[dst].const = op(V[a].const, V[b].const)
else
local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations
if b and cdef.isptr(V[b].type) then
-- We have to allocate a temporary register for dereferencing to preserve
-- pointer in source variable that MUST NOT be altered
reg_alloc(stackslots, 2)
vderef(2, src_reg, V[b].const.__dissector)
src_reg = 2
end
vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B
local dst_reg = vreg(dst)
if cdef.isptr(V[a].type) then
vderef(dst_reg, dst_reg, V[a].const.__dissector)
V[dst].type = V[a].const.__dissector
end
emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0)
V[stackslots].reg = nil -- Free temporary registers
end
end
local function ALU_IMM_NV(dst, a, b, op)
-- Do DST = IMM(a) op VAR(b), where the operands can't be swapped, because
-- the registers are u64 but immediates are u32, so complement
-- arithmetic wouldn't work
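-- e.g. DST = 10 - VAR materializes the constant 10 in a temporary variable
-- slot and emits the SUB as a register-register operation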
vset(stackslots+1, nil, a)
ALU_REG(dst, stackslots+1, b, op)
end
local function BUILTIN(func, ...)
local builtin_export = {
-- Compiler primitives (work with variable slots, emit instructions)
V=V, vreg=vreg, vset=vset, vcopy=vcopy, vderef=vderef, valloc=valloc, emit=emit,
reg_alloc=reg_alloc, reg_spill=reg_spill, tmpvar=stackslots, const_width=const_width,
-- Extensions and helpers (use with care)
LD_IMM_X = LD_IMM_X,
}
func(builtin_export, ...)
end
local function CALL(a, b, d)
assert(b-1 <= 1, 'NYI: CALL with >1 return values')
-- Perform either compile-time, helper, or builtin
local func = V[a].const
-- Gather all arguments and check if they're constant
local args, const, nargs = {}, true, d - 1
for i = a+1, a+d-1 do
table.insert(args, V[i].const)
if not V[i].const or is_proxy(V[i].const) then const = false end
end
local builtin = builtins[func]
if not const or nargs == 0 then
if builtin and type(builtin) == 'function' then
args = {a}
for i = a+1, a+nargs do table.insert(args, i) end
BUILTIN(builtin, unpack(args))
elseif V[a+2] and V[a+2].const then -- var OP imm
ALU_IMM(a, a+1, V[a+2].const, builtin)
elseif nargs <= 2 then -- var OP var
ALU_REG(a, a+1, V[a+2] and a+2, builtin)
else
error('NYI: CALL non-builtin with 3 or more arguments')
end
-- Call on dissector implies slice retrieval
elseif type(func) == 'table' and func.__dissector then
assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments')
assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant')
local off = V[a+1].const
local vtype = builtins.width_type(V[a+2].const - off)
LOAD(a, a, off, vtype)
-- Strict builtins cannot be expanded on compile-time
elseif builtins_strict[func] and builtin then
args = {a}
for i = a+1, a+nargs do table.insert(args, i) end
BUILTIN(builtin, unpack(args))
-- Attempt compile-time call expansion (expects all argument compile-time known)
else
V[a].const = func(unpack(args))
end
end
local function MAP_INIT(map_var, key, imm)
local map = V[map_var].const
vreg(map_var, 1, true, ffi.typeof('uint64_t'))
-- Reserve R1 and load ptr for process-local map fd
LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type))
V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation
-- Reserve R2 and load R2 = key pointer
local key_size = ffi.sizeof(map.key_type)
local w = const_width[key_size] or BPF.DW
local pod_type = const_width[key_size]
local sp = stack_top + key_size -- Must use stack below spill slots
-- Store immediate value on stack
reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable)
local key_base = key and V[key].const
imm = imm or key_base
if imm and (not key or not is_proxy(key_base)) then
assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8')
emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm)
-- Key is in register, spill it
elseif V[key].reg and pod_type then
if cdef.isptr(V[key].type) then
-- There is already pointer in register, dereference before spilling
emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0)
emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0)
else -- Variable in register is POD, spill it on the stack
emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0)
end
-- Key is spilled from register to stack
elseif V[key].spill then
sp = V[key].spill
-- Key is already on stack, write to base-relative address
elseif key_base.__base then
assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type')
sp = key_base.__base
else
error('VAR '..key..' is neither const-expr/register/stack/spilled')
end
-- If [FP+K] addressing, emit it
if sp then
emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0)
emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp)
end
end
local function MAP_GET(dst, map_var, key, imm)
local map = V[map_var].const
MAP_INIT(map_var, key, imm)
-- Flag as pointer type and associate dissector for map value type
vreg(dst, 0, true, ffi.typeof('uint8_t *'))
V[dst].const = {__dissector=map.val_type}
emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem)
V[stackslots].reg = nil -- Free temporary registers
end
local function MAP_DEL(map_var, key, key_imm)
-- Set R0, R1 (map fd, preempt R0)
reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
MAP_INIT(map_var, key, key_imm)
emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem)
V[stackslots].reg = nil -- Free temporary registers
end
local function MAP_SET(map_var, key, key_imm, src)
local map = V[map_var].const
-- Delete when setting nil
if V[src].type == ffi.typeof('void') then
return MAP_DEL(map_var, key, key_imm)
end
-- Set R0, R1 (map fd, preempt R0)
reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
MAP_INIT(map_var, key, key_imm)
reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable)
emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing
-- Reserve R3 for value pointer
local val_size = ffi.sizeof(map.val_type)
local w = const_width[val_size] or BPF.DW
local pod_type = const_width[val_size]
-- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value)
local sp = stack_top + ffi.sizeof(map.key_type) + val_size
sp = sp + (sp % val_size)
local base = V[src].const
if base and not is_proxy(base) then
assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8')
emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base)
-- Value is in register, spill it
elseif V[src].reg and pod_type then
emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0)
-- We get a pointer to spilled register on stack
elseif V[src].spill then
-- If variable is a pointer, we can load it to R3 directly (save "LEA")
if cdef.isptr(V[src].type) then
reg_fill(src, 3)
emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
return
else
sp = V[src].spill
end
-- Value is already on stack, write to base-relative address
elseif base.__base then
assert(val_size == ffi.sizeof(V[src].type), 'VAR '..src..' type incompatible with BPF map value type')
sp = base.__base
else
error('VAR '..src..' is neither const-expr/register/stack/spilled')
end
reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable)
emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0)
emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp)
emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
V[stackslots].reg = nil -- Free temporary registers
end
-- Finally - this table translates LuaJIT bytecode into code emitter actions.
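-- e.g. `x + 1` decodes to ADDVN (variable op number) and `x + y` to ADDVV
-- (variable op variable); each opcode is dispatched to one handler below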
local BC = {
-- Constants
KNUM = function(a, _, c, _) -- KNUM
vset(a, nil, c, ffi.typeof('int32_t')) -- TODO: only 32bit immediates are supported now
end,
KSHORT = function(a, _, _, d) -- KSHORT
vset(a, nil, d, ffi.typeof('int16_t'))
end,
KPRI = function(a, _, _, d) -- KPRI
-- KNIL is 0, must create a special type to identify it
local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t')
vset(a, nil, (d < 2) and 0 or 1, vtype)
end,
KSTR = function(a, _, c, _) -- KSTR
vset(a, nil, c, ffi.typeof('const char[?]'))
end,
MOV = function(a, _, _, d) -- MOV var, var
vcopy(a, d)
end,
-- Comparison ops
-- Note: comparisons are always followed by JMP opcode, that
-- will fuse following JMP to JMP+CMP instruction in BPF
-- Note: we're narrowed to integers, so operand/operator inversion is legit
ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted)
ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d)
ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d)
ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d)
ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d)
ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c))
ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c))
ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c)
ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c)
IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d)
ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d)
ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c)
-- Binary operations with RHS constants
ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end,
SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end,
MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end,
DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end,
MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end,
-- Binary operations with LHS constants
-- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative
ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV
MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV
SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV
DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV
-- Binary operations between registers
ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end,
SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end,
MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end,
DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end,
MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end,
-- Strings
CAT = function(a, b, _, d) -- CAT A = B ~ D
assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions')
assert(type(V[b].const) == 'string' and type(V[d].const) == 'string',
'NYI: CAT only works on compile-time strings')
vset(a, nil, V[b].const .. V[d].const)
end,
-- Tables
GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c])
if env[c] ~= nil then
vset(a, nil, env[c])
else error(string.format("undefined global '%s'", c)) end
end,
UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c])
if env[c] ~= nil then
vset(a, nil, env[c])
else error(string.format("undefined upvalue '%s'", c)) end
end,
TGETB = function (a, b, _, d) -- TGETB (A = B[D])
if a ~= b then vset(a) end
local base = V[b].const
if base.__map then -- BPF map read (constant)
MAP_GET(a, b, nil, d)
else
LOAD(a, b, d, ffi.typeof('uint8_t'))
end
end,
TSETB = function (a, b, _, d) -- TSETB (B[D] = A)
if V[b].const.__map then -- BPF map write (constant key)
return MAP_SET(b, nil, d, a) -- D is literal
elseif V[b].const and V[a].const then
V[b].const[d] = V[a].const
else error('NYI: B[D] = A, where B is not Lua table or BPF map')
end
end,
TSETV = function (a, b, _, d) -- TSETV (B[D] = A)
if V[b].const.__map then -- BPF map write (variable key)
return MAP_SET(b, d, nil, a) -- D is variable
elseif V[b].const and V[d].const and V[a].const then
V[b].const[V[d].const] = V[a].const
else error('NYI: B[D] = A, where B is not Lua table or BPF map')
end
end,
TSETS = function (a, b, c, _) -- TSETS (B[C] = A)
assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table or BPF map')
local base = V[b].const
if base.__dissector then
local ofs,bpos = ffi.offsetof(base.__dissector, c)
assert(not bpos, 'NYI: B[C] = A, where C is a bitfield')
local w = sizeofattr(base.__dissector, c)
-- TODO: support vectorized moves larger than register width
assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
local src_reg = vreg(a)
-- If source is a pointer, we must dereference it first
if cdef.isptr(V[a].type) then
local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register
emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0)
vderef(tmp_reg, tmp_reg, V[a].const.__dissector)
src_reg = tmp_reg -- Materialize and dereference it
-- Source is a value on stack, we must load it first
elseif V[a].const and V[a].const.__base > 0 then
emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0)
V[a].type = V[a].const.__dissector
V[a].const = nil -- Value is dereferenced
end
-- If the table is not on stack, it must be checked for NULL
if not base.__base then
emit(BPF.JMP + BPF.JEQ + BPF.K, V[b].reg, 0, 1, 0) -- if (map[x] != NULL)
emit(BPF.MEM + BPF.STX + const_width[w], V[b].reg, src_reg, ofs, 0)
else -- Table is already on stack, write to base-relative address
emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0)
end
elseif V[a].const then
base[c] = V[a].const
else error('NYI: B[C] = A, where B is not Lua table or BPF map')
end
end,
TGETV = function (a, b, _, d) -- TGETV (A = B[D])
assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table or BPF map')
if a ~= b then vset(a) end
if V[b].const.__map then -- BPF map read
MAP_GET(a, b, d)
elseif V[b].const == env.pkt then -- Raw packet, no offset
LD_FIELD(a, d, 1, V[d].const)
else V[a].const = V[b].const[V[d].const] end
end,
TGETS = function (a, b, c, _) -- TGETS (A = B[C])
assert(V[b] and V[b].const, 'NYI: B[C] where C is string and B not Lua table or BPF map')
local base = V[b].const
if type(base) == 'table' and base.__dissector then
local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
-- Resolve table key using metatable
if not ofs and type(base.__dissector[c]) == 'string' then
c = base.__dissector[c]
ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
end
if not ofs and proto[c] then -- Load new dissector on given offset
BUILTIN(proto[c], a, b, c)
else
assert(ofs, tostring(base.__dissector)..'.'..c..' attribute does not exist')
if a ~= b then vset(a) end
-- Dissected value is probably not constant anymore
local new_const = nil
-- Simple register load, get absolute offset or R-relative
local w, atype = sizeofattr(base.__dissector, c)
if base.__base == true then -- R-relative addressing
local dst_reg = vreg(a, nil, true)
assert(const_width[w], 'NYI: sizeof('..tostring(base.__dissector)..'.'..c..') not 1/2/4/8 bytes')
emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, V[b].reg, ofs, 0)
elseif not base.source and base.__base and base.__base > 0 then -- [FP+K] addressing
if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset
new_const = {__base = base.__base-ofs}
else
local dst_reg = vreg(a, nil, true)
emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0)
end
elseif base.off then -- Absolute address to payload
LD_ABS(a, ofs + base.off, w)
elseif base.source == 'probe' then -- Indirect read using probe
BUILTIN(builtins[builtins.probe_read], nil, a, b, atype, ofs)
V[a].source = V[b].source -- Builtin handles everything
return
else -- Indirect address to payload
LD_IND(a, b, w, ofs)
end
-- Bitfield, must be further narrowed with a bitmask/shift
if bpos then
local mask = 0
for i=bpos+1,bpos+bsize do
mask = bit.bor(mask, bit.lshift(1, w*8-i))
end
emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask)
-- Free optimization: single-bit values need just boolean result
if bsize > 1 then
local shift = w*8-bsize-bpos
if shift > 0 then
emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift)
end
end
end
V[a].type = atype
V[a].const = new_const
V[a].source = V[b].source
end
else V[a].const = base[c] end
end,
-- Loops and branches
CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES)
-- NYI: multiple results; only a single result is supported
CALL(a, b, d+2)
end,
CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1)
CALL(a, b, d)
end,
JMP = function (a, _, c, d) -- JMP
-- Discard unused slots after jump
for i, _ in pairs(V) do
if i >= a then V[i] = {} end
end
local val = code.fixup[c] or {}
if code.seen_cmp and code.seen_cmp ~= ALWAYS then
if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup
-- Store previous CMP insn for reemitting after compensation code
local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1])
code.pc = code.pc - 1
-- First branch point, emit compensation code
local Vcomp = Vstate[c]
if not Vcomp then
for i,v in pairs(V) do
if not v.reg and v.const and not is_proxy(v.const) then
vreg(i, 0) -- Load to TMP register (not saved)
end
if v.reg and v.reg <= 5 then
reg_spill(i) -- Spill caller-saved registers
end
end
-- Record variable state
Vstate[c] = V
V = {}
for i,v in pairs(Vstate[c]) do
V[i] = {}
for k,e in pairs(v) do
V[i][k] = e
end
end
-- Variable state already set, emit specific compensation code
else bb_end(Vcomp) end
-- Reemit CMP insn
emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm)
-- Fuse JMP into previous CMP opcode, mark JMP target for fixup
-- as we don't know the relative offset in generated code yet
table.insert(val, code.pc-1)
code.fixup[c] = val
end
code.seen_cmp = nil
else
emit(BPF.JMP + BPF.JEQ + BPF.X, 6, 6, 0xffff, 0) -- Always true
table.insert(val, code.pc-1) -- Fixup JMP target
code.reachable = false -- Code following the JMP is not reachable
code.fixup[c] = val
end
end,
RET1 = function (a, _, _, _) -- RET1
if V[a].reg ~= 0 then vreg(a, 0) end
emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
-- Free optimisation: spilled variable will not be filled again
for _,v in pairs(V) do if v.reg == 0 then v.reg = nil end end
code.reachable = false
end,
RET0 = function (_, _, _, _) -- RET0
emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
code.reachable = false
end,
compile = function ()
return code
end
}
-- Always initialize R6 with R1 context
emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0)
-- Register R6 as context variable (first argument)
if params and params > 0 then
vset(0, 6, param_types[1] or proto.skb)
V[0].source = V[0].const.source -- Propagate source annotation from typeinfo
end
-- Register tmpvars
vset(stackslots)
vset(stackslots+1)
return setmetatable(BC, {
__index = function (t, k)
if type(k) == 'number' then
local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6)
error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str))
end
end,
__call = function (t, op, a, b, c, d)
code.bc_pc = code.bc_pc + 1
-- Exiting BB straight through, emit compensation code
if Vstate[code.bc_pc] and code.reachable then
bb_end(Vstate[code.bc_pc])
end
-- Perform fixup of jump targets
-- We need to do this because the number of consumed and emitted
-- bytecode instructions is different
local fixup = code.fixup[code.bc_pc]
if fixup ~= nil then
-- Patch JMP source insn with relative offset
for _,pc in ipairs(fixup) do
code.insn[pc].off = code.pc - 1 - pc
end
code.fixup[code.bc_pc] = nil
code.reachable = true
end
-- Execute
if code.reachable then
assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d))
return t[op](a, b, c, d)
end
end,
})
end
-- Emitted code dump
local function dump_mem(cls, ins)
local mode = bit.band(ins.code, 0xe0)
if mode == BPF.XADD then cls = 5 end -- The only mode
local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'}
local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'}
local name = op_1[cls+1] .. op_2[bit.band(ins.code, 0x18)]
local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed
local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off)
local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg
if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end
if mode == BPF.ABS then src = string.format('[%d]', ins.imm) end
if mode == BPF.IND then src = string.format('[R%d%+d]', ins.src_reg, ins.imm) end
return string.format('%s\t%s\t%s', name, dst, src)
end
local function dump_alu(cls, ins, pc)
local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' }
local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'}
local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns',
'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes',
'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid',
'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop',
'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm',
'perf_event_output', 'skb_load_bytes'}
local op = 0
for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end
local name = (cls == 5) and jmp[op] or alu[op]
local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm
local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or ''
if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end
return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target)
end
local function dump(code)
if not code then return end
print(string.format('-- BPF %s:0-%u', code.insn, code.pc))
local cls_map = {
[0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem,
[4] = dump_alu, [5] = dump_alu, [7] = dump_alu,
}
for i = 0, code.pc - 1 do
local ins = code.insn[i]
local cls = bit.band(ins.code, 0x07)
print(string.format('%04u\t%s', i, cls_map[cls](cls, ins, i)))
end
end
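-- Illustrative only: for a trivial socket filter the disassembly printed by
-- dump() looks roughly like this (registers and offsets depend on the program):
--   0000  MOV   R6  R1
--   0001  MOV   R0  #1
--   0002  EXIT  R0  #0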
local function compile(prog, params)
-- Create code emitter sandbox, include caller locals
local env = { pkt=proto.pkt, BPF=BPF }
-- Include upvalues up to 4 nested scopes back
-- the narrower scope overrides broader scope
for k = 5, 2, -1 do
local i = 1
while true do
local ok, n, v = pcall(debug.getlocal, k, i)
if not ok or not n then break end
env[n] = v
i = i + 1
end
end
setmetatable(env, {
__index = function (_, k)
return proto[k] or builtins[k] or _G[k]
end
})
-- Create code emitter and compile LuaJIT bytecode
if type(prog) == 'string' then prog = loadstring(prog) end
-- Create error handler to print traceback
local funci, pc = bytecode.funcinfo(prog), 0
local E = create_emitter(env, funci.stackslots, funci.params, params or {})
local on_err = function (e)
funci = bytecode.funcinfo(prog, pc)
local from, to = 0, 0
for _ = 1, funci.currentline do
from = to
to = string.find(funci.source, '\n', from+1, true) or 0
end
print(funci.loc..':'..string.sub(funci.source, from+1, to-1))
print('error: '..e)
print(debug.traceback())
end
for _,op,a,b,c,d in bytecode.decoder(prog) do
local ok, res, err = xpcall(E,on_err,op,a,b,c,d)
if not ok then
return nil, res, err
end
end
return E:compile()
end
-- BPF map interface
local bpf_map_mt = {
__gc = function (map) S.close(map.fd) end,
__len = function(map) return map.max_entries end,
__index = function (map, k)
if type(k) == 'string' then
-- Return iterator
if k == 'pairs' then
return function(t, key)
-- Get next key
local next_key = ffi.new(ffi.typeof(t.key))
local cur_key
if key then
cur_key = t.key
t.key[0] = key
else
cur_key = ffi.new(ffi.typeof(t.key))
end
local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key)
if not ok then return nil end
-- Get next value
assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val))
return next_key[0], map.val[0]
end, map, nil
-- Reader for perf event map
elseif k == 'reader' then
return function (pmap, pid, cpu, event_type)
-- Caller must either specify PID or CPU
if not pid or pid < 0 then
assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs')
pid = -1
end
-- Create BPF output reader
local pe = S.t.perf_event_attr1()
pe[0].type = 'software'
pe[0].config = 'sw_bpf_output'
pe[0].sample_type = 'raw'
pe[0].sample_period = 1
pe[0].wakeup_events = 1
local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1))
if not reader then return nil, tostring(err) end
-- Register event reader fd in BPF map
assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu))
pmap[cpu] = reader.fd
-- Open memory map and start reading
local ok, err = reader:start()
assert(ok, tostring(err))
ok, err = reader:mmap()
assert(ok, tostring(err))
return cdef.event_reader(reader, event_type)
end
end
-- Signalise that this is a map type
return k == '__map'
end
-- Retrieve key
map.key[0] = k
local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val)
if not ok then return nil, err end
return ffi.new(map.val_type, map.val[0])
end,
__newindex = function (map, k, v)
map.key[0] = k
if v == nil then
return S.bpf_map_op(S.c.BPF_CMD.MAP_DELETE_ELEM, map.fd, map.key, nil)
end
map.val[0] = v
return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val)
end,
}
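-- A minimal usage sketch of the map interface above (illustrative only,
-- assumes this module is loaded as 'bpf' and the kernel supports BPF maps):
--   local counts = bpf.map('hash', 64) -- key/value default to uint32_t
--   counts[1] = 42 -- MAP_UPDATE_ELEM
--   print(tonumber(counts[1])) -- MAP_LOOKUP_ELEM, prints 42
--   for k, v in counts.pairs, counts do print(tonumber(k), tonumber(v)) end
--   counts[1] = nil -- MAP_DELETE_ELEM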
-- Linux tracing interface
local function trace_check_enabled(path)
path = path or '/sys/kernel/debug/tracing'
if S.statfs(path) then return true end
return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?'
end
-- Tracepoint interface
local tracepoint_mt = {
__index = {
bpf = function (t, prog)
if type(prog) ~= 'table' then
-- Create protocol parser with source=probe
prog = compile(prog, {proto.type(t.type, {source='probe'})})
end
-- Load the BPF program
local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc)
assert(prog_fd, tostring(err)..': '..tostring(log))
-- Open tracepoint and attach
t.reader:setbpf(prog_fd:getfd())
table.insert(t.progs, prog_fd)
return prog_fd
end,
}
}
-- Open tracepoint
local function tracepoint_open(path, pid, cpu, group_fd)
-- Open tracepoint and compile tracepoint type
local tp = assert(S.perf_tracepoint('/sys/kernel/debug/tracing/events/'..path))
local tp_type = assert(cdef.tracepoint_type(path))
-- Open tracepoint reader and create interface
local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd))
return setmetatable({tp=tp,type=tp_type,reader=reader,progs={}}, tracepoint_mt)
end
local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd)
-- Load BPF program
if type(prog) ~= 'table' then
prog = compile(prog, {proto.pt_regs})
end
local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc)
assert(prog_fd, tostring(err)..': '..tostring(log))
-- Open tracepoint and attach
local tp, err = S.perf_probe(ptype, pname, pdef, retprobe)
if not tp then
prog_fd:close()
return nil, tostring(err)
end
local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'})
if not reader then
prog_fd:close()
S.perf_probe(ptype, pname, false)
return nil, tostring(err)
end
local ok, err = reader:setbpf(prog_fd:getfd())
if not ok then
prog_fd:close()
reader:close()
S.perf_probe(ptype, pname, false)
return nil, tostring(err)..' (kernel version should be at least 4.1)'
end
-- Create GC closure for reader to close BPF program
-- and detach probe in correct order
ffi.gc(reader, function ()
prog_fd:close()
reader:close()
S.perf_probe(ptype, pname, false)
end)
return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype}
end
-- Module interface
return setmetatable({
new = create_emitter,
dump = dump,
maps = {},
map = function (type, max_entries, key_ctype, val_ctype)
if not key_ctype then key_ctype = ffi.typeof('uint32_t') end
if not val_ctype then val_ctype = ffi.typeof('uint32_t') end
if not max_entries then max_entries = 4096 end
-- Special case for BPF_MAP_STACK_TRACE
if S.c.BPF_MAP[type] == S.c.BPF_MAP.STACK_TRACE then
key_ctype = ffi.typeof('int32_t')
val_ctype = ffi.typeof('struct bpf_stacktrace')
end
local fd, err = S.bpf_map_create(S.c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries)
if not fd then return nil, tostring(err) end
local map = setmetatable({
max_entries = max_entries,
key = ffi.new(ffi.typeof('$ [1]', key_ctype)),
val = ffi.new(ffi.typeof('$ [1]', val_ctype)),
map_type = S.c.BPF_MAP[type],
key_type = key_ctype,
val_type = val_ctype,
fd = fd:nogc():getfd(),
}, bpf_map_mt)
return map
end,
socket = function (sock, prog)
-- Expect socket type, if sock is string then assume it's
-- an interface name (e.g. 'lo'), if it's a number then typecast it as a socket
local ok, err
if type(sock) == 'string' then
local iface = assert(S.nl.getlink())[sock]
assert(iface, sock..' is not an interface name')
sock, err = S.socket('packet', 'raw')
assert(sock, tostring(err))
ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index}))
assert(ok, tostring(err))
elseif type(sock) == 'number' then
sock = assert(S.t.socket(sock))
end
-- Load program and attach it to socket
if type(prog) ~= 'table' then
prog = compile(prog, {proto.skb})
end
local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc)
assert(prog_fd, tostring(err)..': '..tostring(log))
assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd()))
return prog_fd, err
end,
tracepoint = function(tp, prog, pid, cpu, group_fd)
assert(trace_check_enabled())
-- Return tracepoint instance if no program specified
-- this allows free specialisation of arg0 to tracepoint type
local probe = tracepoint_open(tp, pid, cpu, group_fd)
-- Load the BPF program
if prog then
probe:bpf(prog)
end
return probe
end,
kprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
assert(trace_check_enabled())
-- Open tracepoint and attach
local pname, pdef = tp:match('([^:]+):(.+)')
return trace_bpf('kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
end,
uprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
assert(trace_check_enabled())
-- Translate symbol to address
local obj, sym_want = tp:match('([^:]+):(.+)')
if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end
-- Resolve Elf object (no support for anything else)
local elf = require('bpf.elf').open(obj)
local sym = elf:resolve(sym_want)
if not sym then return nil, 'no such symbol' end
sym = sym.st_value - elf:loadaddr()
local sym_addr = string.format('%x%04x', tonumber(bit.rshift(sym, 32)),
tonumber(ffi.cast('uint32_t', sym)))
-- Convert it to expected uprobe format
local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr)
local pdef = obj..':0x'..sym_addr
return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
end,
tracelog = function(path)
assert(trace_check_enabled())
path = path or '/sys/kernel/debug/tracing/trace_pipe'
return io.open(path, 'r')
end,
ntoh = builtins.ntoh, hton = builtins.hton,
}, {
__call = function (t, prog) return compile(prog) end,
})
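-- An end-to-end sketch of the module interface above (illustrative only,
-- assumes a 4.1+ kernel with kprobes and the probed symbol available):
--   local bpf = require('bpf')
--   local probe = bpf.kprobe('myprobe:sys_sync', function (ptregs)
--     print('sync\n') -- becomes bpf_trace_printk()
--   end, false, -1, 0)
--   local log = bpf.tracelog()
--   print(log:read('*l')) -- read one line from trace_pipe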
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local ffi = require('ffi')
local bit = require('bit')
local cdef = require('bpf.cdef')
local BPF, HELPER = ffi.typeof('struct bpf'), ffi.typeof('struct bpf_func_id')
local const_width = {
[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
}
local const_width_type = {
[1] = ffi.typeof('uint8_t'), [2] = ffi.typeof('uint16_t'), [4] = ffi.typeof('uint32_t'), [8] = ffi.typeof('uint64_t'),
}
-- Built-ins that will be translated into BPF instructions
-- i.e. bit.bor(0xf0, 0x0f) becomes {'alu64, or, k', reg(0xf0), reg(0x0f), 0, 0}
local builtins = {
[bit.lshift] = 'LSH',
[bit.rshift] = 'RSH',
[bit.band] = 'AND',
[bit.bnot] = 'NEG',
[bit.bor] = 'OR',
[bit.bxor] = 'XOR',
[bit.arshift] = 'ARSH',
-- Extensions and intrinsics
}
local function width_type(w)
-- Note: ffi.typeof doesn't accept '?' as template
return const_width_type[w] or ffi.typeof(string.format('uint8_t [%d]', w))
end
builtins.width_type = width_type
-- Byte-order conversions for little endian
local function ntoh(x, w)
if w then x = ffi.cast(const_width_type[w/8], x) end
return bit.bswap(x)
end
local function hton(x, w) return ntoh(x, w) end
builtins.ntoh = ntoh
builtins.hton = hton
builtins[ntoh] = function (e, dst, a, w)
-- This is trickery, but TO_LE means cpu_to_le(),
-- and we want exactly the opposite as network is always 'be'
w = w or ffi.sizeof(e.V[a].type)*8
if w == 8 then return end -- NOOP
assert(w <= 64, 'NYI: hton(a[, width]) - operand larger than register width')
-- Allocate registers and execute
e.vcopy(dst, a)
e.emit(BPF.ALU + BPF.END + BPF.TO_BE, e.vreg(dst), 0, 0, w)
end
builtins[hton] = function (e, dst, a, w)
w = w or ffi.sizeof(e.V[a].type)*8
if w == 8 then return end -- NOOP
assert(w <= 64, 'NYI: hton(a[, width]) - operand larger than register width')
-- Allocate registers and execute
e.vcopy(dst, a)
e.emit(BPF.ALU + BPF.END + BPF.TO_LE, e.vreg(dst), 0, 0, w)
end
-- Byte-order conversions for big endian are no-ops
if ffi.abi('be') then
ntoh = function (x, w)
return w and ffi.cast(const_width_type[w/8], x) or x
end
hton = ntoh
builtins[ntoh] = function(a, b, w) return end
builtins[hton] = function(a, b, w) return end
end
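-- Illustrative only: in a filter, a 16-bit field read from the packet must be
-- swapped before comparing against a host-order constant, e.g. (hypothetical
-- field access): if ntoh(dst_port, 16) == 53 then ... end
-- On big-endian hosts the conversions above compile away entirely.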
-- Other built-ins
local function xadd(a, b) error('NYI') end
builtins.xadd = xadd
builtins[xadd] = function (e, dst, a, b, off)
assert(e.V[a].const.__dissector, 'xadd(a, b) called on non-pointer')
local w = ffi.sizeof(e.V[a].const.__dissector)
assert(w == 4 or w == 8, 'NYI: xadd() - 1 and 2 byte atomic increments are not supported')
-- Allocate registers and execute
e.vcopy(dst, a)
local src_reg = e.vreg(b)
local dst_reg = e.vreg(dst)
e.emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 1, 0) -- if (dst != NULL)
e.emit(BPF.XADD + BPF.STX + const_width[w], dst_reg, src_reg, off or 0, 0)
end
local function probe_read() error('NYI') end
builtins.probe_read = probe_read
builtins[probe_read] = function (e, ret, dst, src, vtype, ofs)
e.reg_alloc(e.tmpvar, 1)
-- Load stack pointer to dst, since only load to stack memory is supported
-- we have to use allocated stack memory or create a new allocation and convert
-- to pointer type
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
if not e.V[dst].const or not e.V[dst].const.__base > 0 then
builtins[ffi.new](e, dst, vtype) -- Allocate stack memory
end
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
-- Set stack memory maximum size bound
e.reg_alloc(e.tmpvar, 2)
if not vtype then
vtype = cdef.typename(e.V[dst].type)
-- Dereference pointer type to pointed type for size calculation
if vtype:sub(-1) == '*' then vtype = vtype:sub(0, -2) end
end
local w = ffi.sizeof(vtype)
e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, w)
-- Set source pointer
if e.V[src].reg then
e.reg_alloc(e.tmpvar, 3) -- Copy from original register
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0)
else
local src_reg = e.vreg(src, 3)
e.reg_spill(src) -- Spill to avoid overwriting
end
if ofs and ofs > 0 then
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, ofs)
end
-- Call probe read helper
ret = ret or e.tmpvar
e.vset(ret)
e.vreg(ret, 0, true, ffi.typeof('int32_t'))
e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read)
e.V[e.tmpvar].reg = nil -- Free temporary registers
end
builtins[ffi.cast] = function (e, dst, ct, x)
assert(e.V[ct].const, 'ffi.cast(ctype, x) called with bad ctype')
e.vcopy(dst, x)
if not e.V[x].const then
e.V[dst].type = ffi.typeof(e.V[ct].const)
else
e.V[dst].const.__dissector = ffi.typeof(e.V[ct].const)
end
-- Specific types also encode source of the data
-- This is because BPF has different helpers for reading
-- different data sources, so variables must track origins.
-- struct pt_regs - source of the data is probe
-- struct skb - source of the data is socket buffer
-- struct X - source of the data is probe/tracepoint
if ffi.typeof(e.V[ct].const) == ffi.typeof('struct pt_regs') then
e.V[dst].source = 'probe'
end
end
builtins[ffi.new] = function (e, dst, ct, x)
if type(ct) == 'number' then
ct = ffi.typeof(e.V[ct].const) -- Get ctype from variable
end
assert(not x, 'NYI: ffi.new(ctype, ...) - initializer is not supported')
assert(not cdef.isptr(ct, true), 'NYI: ffi.new(ctype, ...) - ctype MUST NOT be a pointer')
e.vset(dst, nil, ct)
e.V[dst].const = {__base = e.valloc(ffi.sizeof(ct), true), __dissector = ct}
end
builtins[ffi.copy] = function (e, ret, dst, src)
assert(cdef.isptr(e.V[dst].type), 'ffi.copy(dst, src) - dst MUST be a pointer type')
assert(cdef.isptr(e.V[src].type), 'ffi.copy(dst, src) - src MUST be a pointer type')
-- Specific types also encode source of the data
-- struct pt_regs - source of the data is probe
-- struct skb - source of the data is socket buffer
if e.V[src].source == 'probe' then
e.reg_alloc(e.tmpvar, 1)
-- Load stack pointer to dst, since only load to stack memory is supported
-- we have to either use spilled variable or allocated stack memory offset
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
if e.V[dst].spill then
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].spill)
elseif e.V[dst].const.__base then
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
else error('ffi.copy(dst, src) - can\'t get stack offset of dst') end
-- Set stack memory maximum size bound
local dst_tname = cdef.typename(e.V[dst].type)
if dst_tname:sub(-1) == '*' then dst_tname = dst_tname:sub(0, -2) end
e.reg_alloc(e.tmpvar, 2)
e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(dst_tname))
-- Set source pointer
if e.V[src].reg then
e.reg_alloc(e.tmpvar, 3) -- Copy from original register
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0)
else
local src_reg = e.vreg(src, 3)
e.reg_spill(src) -- Spill to avoid overwriting
end
-- Call probe read helper
e.vset(ret)
e.vreg(ret, 0, true, ffi.typeof('int32_t'))
e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read)
e.V[e.tmpvar].reg = nil -- Free temporary registers
elseif e.V[src].const and e.V[src].const.__map then
error('NYI: ffi.copy(dst, src) - src is backed by BPF map')
elseif e.V[src].const and e.V[src].const.__dissector then
error('NYI: ffi.copy(dst, src) - src is backed by socket buffer')
else
-- TODO: identify cheap register move
-- TODO: identify copy to/from stack
error('NYI: ffi.copy(dst, src) - src is neither BPF map/socket buffer or probe')
end
end
-- print(format, ...) builtin changes semantics from Lua print(...)
-- the first parameter has to be a format string and only a reduced set of conversion specifiers
-- is allowed: %d %u %x %ld %lu %lx %lld %llu %llx %p %s
builtins[print] = function (e, ret, fmt, a1, a2, a3)
-- Load format string and length
e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (fmt pointer)
e.reg_alloc(e.tmpvar+1, 2) -- Spill anything in R2 (fmt length)
if type(e.V[fmt].const) == 'string' then
local src = e.V[fmt].const
local len = #src + 1
local dst = e.valloc(len, src)
-- TODO: this is materialize step
e.V[fmt].const = {__base=dst}
e.V[fmt].type = ffi.typeof('char ['..len..']')
elseif e.V[fmt].const.__base then -- NOP
else error('NYI: print(fmt, ...) - format variable is not literal/stack memory') end
-- Prepare helper call
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[fmt].const.__base)
e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[fmt].type))
if a1 then
local args = {a1, a2, a3}
assert(#args <= 3, 'print(fmt, ...) - maximum of 3 arguments supported')
for i, arg in ipairs(args) do
e.vcopy(e.tmpvar, arg) -- Copy variable
e.vreg(e.tmpvar, 3+i-1) -- Materialize it in arg register
end
end
-- Call helper
e.vset(ret)
e.vreg(ret, 0, true, ffi.typeof('int32_t')) -- Return is integer
e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.trace_printk)
e.V[e.tmpvar].reg = nil -- Free temporary registers
end
-- Implements bpf_perf_event_output(ctx, map, flags, var, vlen) on perf event map
local function perf_submit(e, dst, map_var, src)
-- Set R2 = map fd (indirect load)
local map = e.V[map_var].const
e.vcopy(e.tmpvar, map_var)
e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
-- Set R1 = ctx
e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
-- Set R3 = flags
e.vset(e.tmpvar, nil, 0) -- BPF_F_CURRENT_CPU
e.vreg(e.tmpvar, 3, false, ffi.typeof('uint64_t'))
-- Set R4 = pointer to src on stack
assert(e.V[src].const.__base, 'NYI: submit(map, var) - variable is not on stack')
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 4, 10, 0, 0)
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 4, 0, 0, -e.V[src].const.__base)
-- Set R5 = src length
e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 5, 0, 0, ffi.sizeof(e.V[src].type))
-- Set R0 = ret and call
e.vset(dst)
e.vreg(dst, 0, true, ffi.typeof('int32_t')) -- Return is integer
e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.perf_event_output)
e.V[e.tmpvar].reg = nil -- Free temporary registers
end
-- Implements bpf_get_stackid(ctx, map, flags)
local function stack_id(e, ret, map_var, key)
-- Set R2 = map fd (indirect load)
local map = e.V[map_var].const
e.vcopy(e.tmpvar, map_var)
e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
-- Set R1 = ctx
e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
-- Load flags in R3 (immediate value or key)
local imm = e.V[key].const
assert(tonumber(imm), 'NYI: stack_id(map, var), var must be constant number')
e.reg_alloc(e.tmpvar, 3) -- Spill anything in R3 (unnamed tmp variable)
e.LD_IMM_X(3, 0, imm, 8)
-- Return R0 as signed integer
e.vset(ret)
e.vreg(ret, 0, true, ffi.typeof('int32_t'))
e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_stackid)
e.V[e.tmpvar].reg = nil -- Free temporary registers
end
-- table.insert(table, value) keeps semantics with the exception of BPF maps
-- map `perf_event` -> submit inserted value
builtins[table.insert] = function (e, dst, map_var, value)
assert(e.V[map_var].const.__map, 'NYI: table.insert() supported only on BPF maps')
return perf_submit(e, dst, map_var, value)
end
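-- Illustrative only: with a perf event map the insert becomes a submission,
-- e.g. (hypothetical names):
--   local events = bpf.map('perf_event_array')
--   table.insert(events, sample) -- compiles to bpf_perf_event_output()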
-- bpf_get_current_comm(buffer) - write current process name to byte buffer
local function comm() error('NYI') end
builtins[comm] = function (e, ret, dst)
-- Set R1 = buffer
assert(e.V[dst].const.__base, 'NYI: comm(buffer) - buffer variable is not on stack')
e.reg_alloc(e.tmpvar, 1) -- Spill
e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
-- Set R2 = length
e.reg_alloc(e.tmpvar, 2) -- Spill
e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[dst].type))
-- Return is integer
e.vset(ret)
e.vreg(ret, 0, true, ffi.typeof('int32_t'))
e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_current_comm)
e.V[e.tmpvar].reg = nil -- Free temporary registers
end
-- Math library built-ins
math.log2 = function (x) error('NYI') end
builtins[math.log2] = function (e, dst, x)
-- Classic integer bit-subdivision algorithm to find the position
-- of the highest bit set, adapted for BPF bytecode-friendly operations.
-- https://graphics.stanford.edu/~seander/bithacks.html
-- r = 0
local r = e.vreg(dst, nil, true)
e.emit(BPF.ALU64 + BPF.MOV + BPF.K, r, 0, 0, 0)
-- v = x
e.vcopy(e.tmpvar, x)
local v = e.vreg(e.tmpvar, 2)
if cdef.isptr(e.V[x].const) then -- No pointer arithmetics, dereference
e.vderef(v, v, ffi.typeof('uint64_t'))
end
-- Invert value to invert all tests, otherwise we would need and+jnz
e.emit(BPF.ALU64 + BPF.NEG + BPF.K, v, 0, 0, 0) -- v = ~v
-- Unrolled test cases, converted masking to arithmetic as we don't have "if !(a & b)"
-- As we're testing inverted value, we have to use arithmetic shift to copy MSB
for i=4,0,-1 do
local k = bit.lshift(1, i)
e.emit(BPF.JMP + BPF.JGT + BPF.K, v, 0, 2, bit.bnot(bit.lshift(1, k))) -- if !upper_half(x)
e.emit(BPF.ALU64 + BPF.ARSH + BPF.K, v, 0, 0, k) -- v >>= k
e.emit(BPF.ALU64 + BPF.OR + BPF.K, r, 0, 0, k) -- r |= k
end
-- No longer constant, cleanup tmpvars
e.V[dst].const = nil
e.V[e.tmpvar].reg = nil
end
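-- For reference, the same bit-subdivision in plain Lua (a sketch, not used by
-- the compiler, which works on the inverted value as noted above):
--   local function ilog2(v)
--     local r = 0
--     for _, k in ipairs({16, 8, 4, 2, 1}) do
--       if v >= 2^k then v = math.floor(v / 2^k); r = r + k end
--     end
--     return r -- e.g. ilog2(1000) == 9
--   end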
builtins[math.log10] = function (e, dst, x)
-- Compute log2(x) and transform
builtins[math.log2](e, dst, x)
-- Relationship: log10(v) = log2(v) / log2(10)
local r = e.V[dst].reg
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1) -- Compensate round-down
e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 1233) -- 1/log2(10) ~ 1233>>12
e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)
end
builtins[math.log] = function (e, dst, x)
-- Compute log2(x) and transform
builtins[math.log2](e, dst, x)
-- Relationship: ln(v) = log2(v) / log2(e)
local r = e.V[dst].reg
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1) -- Compensate round-down
e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 2839) -- 1/log2(e) ~ 2839>>12
e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)
end
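-- Worked example of the fixed-point transform above (illustrative arithmetic):
--   log10(1000) ~ (log2(1000)+1) * 1233 >> 12 = (9+1)*1233/4096 ~ 3
--   ln(8)       ~ (log2(8)+1)    * 2839 >> 12 = (3+1)*2839/4096 ~ 2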
-- Call-type helpers
local function call_helper(e, dst, h)
e.vset(dst)
local dst_reg = e.vreg(dst, 0, true)
e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, h)
e.V[dst].const = nil -- Target is not a function anymore
end
local function cpu() error('NYI') end
local function rand() error('NYI') end
local function time() error('NYI') end
local function pid_tgid() error('NYI') end
local function uid_gid() error('NYI') end
-- Export helpers and builtin variants
builtins.cpu = cpu
builtins.rand = rand
builtins.time = time
builtins.pid_tgid = pid_tgid
builtins.uid_gid = uid_gid
builtins.comm = comm
builtins.perf_submit = perf_submit
builtins.stack_id = stack_id
builtins[cpu] = function (e, dst) return call_helper(e, dst, HELPER.get_smp_processor_id) end
builtins[rand] = function (e, dst) return call_helper(e, dst, HELPER.get_prandom_u32) end
builtins[time] = function (e, dst) return call_helper(e, dst, HELPER.ktime_get_ns) end
builtins[pid_tgid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_pid_tgid) end
builtins[uid_gid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_uid_gid) end
builtins[perf_submit] = function (e, dst, map, value) return perf_submit(e, dst, map, value) end
builtins[stack_id] = function (e, dst, map, key) return stack_id(e, dst, map, key) end
return builtins
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local ffi = require('ffi')
local bit = require('bit')
local S = require('syscall')
local M = {}
ffi.cdef [[
struct bpf {
/* Instruction classes */
static const int LD = 0x00;
static const int LDX = 0x01;
static const int ST = 0x02;
static const int STX = 0x03;
static const int ALU = 0x04;
static const int JMP = 0x05;
static const int ALU64 = 0x07;
/* ld/ldx fields */
static const int W = 0x00;
static const int H = 0x08;
static const int B = 0x10;
static const int ABS = 0x20;
static const int IND = 0x40;
static const int MEM = 0x60;
static const int LEN = 0x80;
static const int MSH = 0xa0;
/* alu/jmp fields */
static const int ADD = 0x00;
static const int SUB = 0x10;
static const int MUL = 0x20;
static const int DIV = 0x30;
static const int OR = 0x40;
static const int AND = 0x50;
static const int LSH = 0x60;
static const int RSH = 0x70;
static const int NEG = 0x80;
static const int MOD = 0x90;
static const int XOR = 0xa0;
static const int JA = 0x00;
static const int JEQ = 0x10;
static const int JGT = 0x20;
static const int JGE = 0x30;
static const int JSET = 0x40;
static const int K = 0x00;
static const int X = 0x08;
static const int JNE = 0x50; /* jump != */
static const int JSGT = 0x60; /* SGT is signed '>', GT in x86 */
static const int JSGE = 0x70; /* SGE is signed '>=', GE in x86 */
static const int CALL = 0x80; /* function call */
static const int EXIT = 0x90; /* function return */
/* ld/ldx fields */
static const int DW = 0x18; /* double word */
static const int XADD = 0xc0; /* exclusive add */
/* alu/jmp fields */
static const int MOV = 0xb0; /* mov reg to reg */
static const int ARSH = 0xc0; /* sign extending arithmetic shift right */
/* change endianness of a register */
static const int END = 0xd0; /* flags for endianness conversion: */
static const int TO_LE = 0x00; /* convert to little-endian */
static const int TO_BE = 0x08; /* convert to big-endian */
/* misc */
static const int PSEUDO_MAP_FD = 0x01;
/* helper functions */
static const int F_CURRENT_CPU = 0xffffffff;
static const int F_USER_STACK = 1 << 8;
static const int F_FAST_STACK_CMP = 1 << 9;
static const int F_REUSE_STACKID = 1 << 10;
};
/* eBPF commands */
struct bpf_cmd {
static const int MAP_CREATE = 0;
static const int MAP_LOOKUP_ELEM = 1;
static const int MAP_UPDATE_ELEM = 2;
static const int MAP_DELETE_ELEM = 3;
static const int MAP_GET_NEXT_KEY = 4;
static const int PROG_LOAD = 5;
static const int OBJ_PIN = 6;
static const int OBJ_GET = 7;
};
/* eBPF helpers */
struct bpf_func_id {
static const int unspec = 0;
static const int map_lookup_elem = 1;
static const int map_update_elem = 2;
static const int map_delete_elem = 3;
static const int probe_read = 4;
static const int ktime_get_ns = 5;
static const int trace_printk = 6;
static const int get_prandom_u32 = 7;
static const int get_smp_processor_id = 8;
static const int skb_store_bytes = 9;
static const int l3_csum_replace = 10;
static const int l4_csum_replace = 11;
static const int tail_call = 12;
static const int clone_redirect = 13;
static const int get_current_pid_tgid = 14;
static const int get_current_uid_gid = 15;
static const int get_current_comm = 16;
static const int get_cgroup_classid = 17;
static const int skb_vlan_push = 18;
static const int skb_vlan_pop = 19;
static const int skb_get_tunnel_key = 20;
static const int skb_set_tunnel_key = 21;
static const int perf_event_read = 22;
static const int redirect = 23;
static const int get_route_realm = 24;
static const int perf_event_output = 25;
static const int skb_load_bytes = 26;
static const int get_stackid = 27;
};
/* BPF_MAP_STACK_TRACE structures and constants */
static const int BPF_MAX_STACK_DEPTH = 127;
struct bpf_stacktrace {
uint64_t ip[BPF_MAX_STACK_DEPTH];
};
]]
-- Compatibility: ljsyscall doesn't have support for BPF syscall
if not S.bpf then
error("ljsyscall doesn't support bpf(), must be updated")
else
-- Compatibility: ljsyscall<=0.12
if not S.c.BPF_MAP.PERCPU_HASH then
S.c.BPF_MAP.PERCPU_HASH = 5
S.c.BPF_MAP.PERCPU_ARRAY = 6
S.c.BPF_MAP.STACK_TRACE = 7
S.c.BPF_MAP.CGROUP_ARRAY = 8
end
if not S.c.BPF_PROG.TRACEPOINT then
S.c.BPF_PROG.TRACEPOINT = 5
end
end
-- Compatibility: metatype for stacktrace
local function stacktrace_iter(t, i)
i = i + 1
if i < #t and t.ip[i] > 0 then
return i, t.ip[i]
end
end
ffi.metatype('struct bpf_stacktrace', {
__len = function (t) return ffi.sizeof(t.ip) / ffi.sizeof(t.ip[0]) end,
__ipairs = function (t) return stacktrace_iter, t, -1 end,
})
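-- Illustrative only: a stacktrace value looked up from a 'stack_trace' map
-- can be walked with ipairs(), which stops at the first zero frame:
--   for i, ip in ipairs(trace) do print(i, string.format('%x', tonumber(ip))) end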
-- Reflect cdata type
function M.typename(v)
if not v or type(v) ~= 'cdata' then return nil end
return string.match(tostring(ffi.typeof(v)), '<([^>]+)')
end
-- Reflect if cdata type can be pointer (accepts array or pointer)
function M.isptr(v, noarray)
local ctname = M.typename(v)
if ctname then
ctname = string.sub(ctname, -1)
ctname = ctname == '*' or (not noarray and ctname == ']')
end
return ctname
end
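-- Illustrative examples for the reflection helpers above:
--   M.typename(ffi.new('uint32_t [1]')) --> 'uint32_t [1]'
--   M.isptr(ffi.new('uint32_t [1]')) --> true (arrays accepted)
--   M.isptr(ffi.new('uint32_t [1]'), true) --> false (noarray set)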
function M.osversion()
-- We have no better way to extract the current kernel version hex-string other
-- than by parsing headers, compiling a helper function, or reading /proc
local ver_str, count = S.sysctl('kernel.version'):match('%d+%.%d+%.%d+'), 2
if not ver_str then -- kernel.version is freeform, fallback to kernel.osrelease
ver_str = S.sysctl('kernel.osrelease'):match('%d+%.%d+%.%d+')
end
local version = 0
for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ
version = bit.bor(version, bit.lshift(tonumber(i), 8*count))
count = count - 1
end
return version
end
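-- Illustrative only: an osrelease of '4.4.0-57-generic' matches '4.4.0' and
-- encodes to 0x040400 (X<<16 | Y<<8 | Z), so kernel checks reduce to integer
-- comparisons such as M.osversion() >= 0x040100.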
function M.event_reader(reader, event_type)
-- Caller can specify event message binary format
if event_type then
assert(type(event_type) == 'string' and ffi.typeof(event_type), 'not a valid type for event reader')
event_type = ffi.typeof(event_type .. '*') -- Convert type to pointer-to-type
end
-- Wrap reader in interface that can interpret read event messages
return setmetatable({reader=reader,type=event_type}, {__index = {
block = function(self)
return S.select { readfds = {reader.fd} }
end,
next = function(self, k)
local len, ev = reader:next(k)
-- Filter out only sample frames
while ev and ev.type ~= S.c.PERF_RECORD.SAMPLE do
len, ev = reader:next(len)
end
if ev and event_type then
-- The perf event reader returns framed data with header and variable length
-- This skips the frame header and casts the data to the given type
ev = ffi.cast(event_type, ffi.cast('char *', ev) + ffi.sizeof('struct perf_event_header') + ffi.sizeof('uint32_t'))
end
return len, ev
end,
read = function(self)
return self.next, self, nil
end,
}})
end
function M.tracepoint_type(tp)
-- Read tracepoint format string
local fp = assert(io.open('/sys/kernel/debug/tracing/events/'..tp..'/format', 'r'))
local fmt = fp:read '*a'
fp:close()
-- Parse struct fields
local fields = {}
for f in fmt:gmatch 'field:([^;]+;)' do
table.insert(fields, f)
end
return string.format('struct { %s }', table.concat(fields))
end
return M
\ No newline at end of file
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- This is a tiny wrapper over libelf to extract load address
-- and offsets of dynamic symbols
local S = require('syscall')
local ffi = require('ffi')
ffi.cdef [[
/* Type for a 16-bit quantity. */
typedef uint16_t Elf32_Half;
typedef uint16_t Elf64_Half;
/* Types for signed and unsigned 32-bit quantities. */
typedef uint32_t Elf32_Word;
typedef int32_t Elf32_Sword;
typedef uint32_t Elf64_Word;
typedef int32_t Elf64_Sword;
/* Types for signed and unsigned 64-bit quantities. */
typedef uint64_t Elf32_Xword;
typedef int64_t Elf32_Sxword;
typedef uint64_t Elf64_Xword;
typedef int64_t Elf64_Sxword;
/* Type of addresses. */
typedef uint32_t Elf32_Addr;
typedef uint64_t Elf64_Addr;
/* Type of file offsets. */
typedef uint32_t Elf32_Off;
typedef uint64_t Elf64_Off;
/* Type for section indices, which are 16-bit quantities. */
typedef uint16_t Elf32_Section;
typedef uint16_t Elf64_Section;
/* Constants */
struct Elf_Cmd
{
static const int READ = 1;
static const int RDWR = 2;
static const int WRITE = 3;
static const int CLR = 4;
static const int SET = 5;
static const int FDDONE = 6;
static const int FDREAD = 7;
static const int READ_MMAP = 8;
static const int RDWR_MMAP = 9;
static const int WRITE_MMAP =10;
static const int READ_MMAP_PRIVATE =11;
static const int EMPTY =12;
static const int NUM =13;
};
/* Descriptor for the ELF file. */
typedef struct Elf Elf;
/* Descriptor for ELF file section. */
typedef struct Elf_Scn Elf_Scn;
/* Container type for metatable */
struct Elf_object { int fd; Elf *elf; };
/* Program segment header. */
typedef struct
{
Elf64_Word p_type; /* Segment type */
Elf64_Word p_flags; /* Segment flags */
Elf64_Off p_offset; /* Segment file offset */
Elf64_Addr p_vaddr; /* Segment virtual address */
Elf64_Addr p_paddr; /* Segment physical address */
Elf64_Xword p_filesz; /* Segment size in file */
Elf64_Xword p_memsz; /* Segment size in memory */
Elf64_Xword p_align; /* Segment alignment */
} Elf64_Phdr;
typedef Elf64_Phdr GElf_Phdr;
/* Section header. */
typedef struct
{
Elf64_Word sh_name; /* Section name (string tbl index) */
Elf64_Word sh_type; /* Section type */
Elf64_Xword sh_flags; /* Section flags */
Elf64_Addr sh_addr; /* Section virtual addr at execution */
Elf64_Off sh_offset; /* Section file offset */
Elf64_Xword sh_size; /* Section size in bytes */
Elf64_Word sh_link; /* Link to another section */
Elf64_Word sh_info; /* Additional section information */
Elf64_Xword sh_addralign; /* Section alignment */
Elf64_Xword sh_entsize; /* Entry size if section holds table */
} Elf64_Shdr;
typedef Elf64_Shdr GElf_Shdr;
/* Descriptor for data to be converted to or from memory format. */
typedef struct
{
void *d_buf; /* Pointer to the actual data. */
int d_type; /* Type of this piece of data. */
unsigned int d_version; /* ELF version. */
size_t d_size; /* Size in bytes. */
uint64_t d_off; /* Offset into section. */
size_t d_align; /* Alignment in section. */
} Elf_Data;
/* Symbol table entry. */
typedef struct
{
Elf64_Word st_name; /* Symbol name (string tbl index) */
unsigned char st_info; /* Symbol type and binding */
unsigned char st_other; /* Symbol visibility */
Elf64_Section st_shndx; /* Section index */
Elf64_Addr st_value; /* Symbol value */
Elf64_Xword st_size; /* Symbol size */
} Elf64_Sym;
typedef Elf64_Sym GElf_Sym;
/* Coordinate ELF library and application versions. */
unsigned int elf_version (unsigned int __version);
/* Return descriptor for ELF file to work according to CMD. */
Elf *elf_begin (int __fildes, int __cmd, Elf *__ref);
/* Free resources allocated for ELF. */
int elf_end (Elf *__elf);
/* Get the number of program headers in the ELF file. If the file uses
more headers than can be represented in the e_phnum field of the ELF
header the information from the sh_info field in the zeroth section
header is used. */
int elf_getphdrnum (Elf *__elf, size_t *__dst);
/* Retrieve program header table entry. */
GElf_Phdr *gelf_getphdr (Elf *__elf, int __ndx, GElf_Phdr *__dst);
/* Retrieve section header. */
GElf_Shdr *gelf_getshdr (Elf_Scn *__scn, GElf_Shdr *__dst);
/* Retrieve symbol information from the symbol table at the given index. */
GElf_Sym *gelf_getsym (Elf_Data *__data, int __ndx, GElf_Sym *__dst);
/* Get section with next section index. */
Elf_Scn *elf_nextscn (Elf *__elf, Elf_Scn *__scn);
/* Get data from section while translating from file representation
to memory representation. */
Elf_Data *elf_getdata (Elf_Scn *__scn, Elf_Data *__data);
/* Return pointer to string at OFFSET in section INDEX. */
char *elf_strptr (Elf *__elf, size_t __index, size_t __offset);
]]
local elf = ffi.load('elf')
local EV = { NONE=0, CURRENT=1, NUM=2 }
local PT = { NULL=0, LOAD=1, DYNAMIC=2, INTERP=3, NOTE=4, SHLIB=5, PHDR=6, TLS=7, NUM=8 }
local SHT = { NULL=0, PROGBITS=1, SYMTAB=2, STRTAB=3, RELA=4, HASH=5, DYNAMIC=6, NOTE=7,
NOBITS=8, REL=9, SHLIB=10, DYNSYM=11, INIT_ARRAY=14, FINI_ARRAY=15, PREINIT_ARRAY=16,
GROUP=17, SYMTAB_SHNDX=18, NUM=19 }
local ELF_C = ffi.new('struct Elf_Cmd')
local M = {}
-- Optional poor man's C++ demangler
local cpp_demangler = os.getenv('CPP_DEMANGLER')
if not cpp_demangler then
for prefix in string.gmatch(os.getenv('PATH'), '[^;:]+') do
if S.statfs(prefix..'/c++filt') then
cpp_demangler = prefix..'/c++filt'
break
end
end
end
local cpp_demangle = function (name) return name end
if cpp_demangler then
cpp_demangle = function (name)
local cmd = string.format('%s -p %s', cpp_demangler, name)
local fp = assert(io.popen(cmd, 'r'))
local output = fp:read('*all')
fp:close()
return output:match '^(.-)%s*$'
end
end
-- Metatable for ELF object
ffi.metatype('struct Elf_object', {
__gc = function (t) t:close() end,
__index = {
close = function (t)
if t.elf ~= nil then
elf.elf_end(t.elf)
S.close(t.fd)
t.elf = nil
end
end,
-- Load library load address
loadaddr = function(t)
local phnum = ffi.new('size_t [1]')
if elf.elf_getphdrnum(t.elf, phnum) == nil then
return nil, 'cannot get phdrnum'
end
local header = ffi.new('GElf_Phdr [1]')
for i = 0, tonumber(phnum[0])-1 do
if elf.gelf_getphdr(t.elf, i, header) ~= nil
and header[0].p_type == PT.LOAD then
return header[0].p_vaddr
end
end
end,
-- Resolve symbol address
resolve = function (t, k, pattern)
local section = elf.elf_nextscn(t.elf, nil)
while section ~= nil do
local header = ffi.new('GElf_Shdr [1]')
if elf.gelf_getshdr(section, header) ~= nil then
if header[0].sh_type == SHT.SYMTAB or header[0].sh_type == SHT.DYNSYM then
local data = elf.elf_getdata(section, nil)
while data ~= nil do
if data.d_size % header[0].sh_entsize > 0 then
return nil, 'bad section header entity size'
end
local symcount = tonumber(data.d_size / header[0].sh_entsize)
local sym = ffi.new('GElf_Sym [1]')
for i = 0, symcount - 1 do
if elf.gelf_getsym(data, i, sym) ~= nil then
local name = elf.elf_strptr(t.elf, header[0].sh_link, sym[0].st_name)
if name ~= nil then
-- Demangle C++ symbols if necessary
name = ffi.string(name)
if name:sub(1,2) == '_Z' then
name = cpp_demangle(name)
end
-- Match symbol name against pattern
if pattern and string.match(name, k) or k == name then
return sym[0]
end
end
end
end
data = elf.elf_getdata(section, data)
end
end
end
section = elf.elf_nextscn(t.elf, section)
end
end,
}
})
-- Open an ELF object
function M.open(path)
if elf.elf_version(EV.CURRENT) == EV.NONE then
return nil, 'bad version'
end
local fd, err = S.open(path, 'rdonly')
if not fd then return nil, err end
local pt = ffi.new('Elf *')
pt = elf.elf_begin(fd:getfd(), ELF_C.READ, pt)
if not pt then
fd:close()
return nil, 'cannot open elf object'
end
return ffi.new('struct Elf_object', fd:nogc():getfd(), pt)
end
return M
\ No newline at end of file
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
return require('bpf.bpf')
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local jutil = require("jit.util")
local vmdef = require("jit.vmdef")
local bit = require('bit')
local shr, band = bit.rshift, bit.band
-- Decode LuaJIT 2.0 bytecode format
-- Reference: http://wiki.luajit.org/Bytecode-2.0
-- Thanks to LJ, we get code in portable bytecode with constants folded, basic
-- virtual registers allocated etc.
-- No SSA IR, type inference or advanced optimizations because the code wasn't traced yet.
local function decode_ins(func, pc)
local ins, m = jutil.funcbc(func, pc)
if not ins then return nil end
local op, ma, mb, mc = band(ins, 0xff), band(m, 7), band(m, 15*8), band(m, 15*128)
local a, b, c, d = band(shr(ins, 8), 0xff), nil, nil, shr(ins, 16)
if mb ~= 0 then
d = band(d, 0xff)
b = shr(ins, 24)
end
if ma == 5 then -- BCMuv
a = jutil.funcuvname(func, a)
end
if mc == 13*128 then -- BCMjump
c = pc+d-0x7fff
elseif mc == 9*128 then -- BCMint
c = jutil.funck(func, d)
elseif mc == 10*128 then -- BCMstr
c = jutil.funck(func, -d-1)
elseif mc == 5*128 then -- BCMuv
c = jutil.funcuvname(func, d)
end
-- Convert version-specific opcode to string
op = 6*op
op = string.sub(vmdef.bcnames, op+1, op+6):match('[^%s]+')
return pc, op, a, b, c, d
end
-- Decoder closure
local function decoder(func)
local pc = 0
return function ()
pc = pc + 1
return decode_ins(func, pc)
end
end
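-- Illustrative only: decoding a trivial chunk yields (pc, op, a, b, c, d)
-- tuples, e.g. for function (x) return x + 1 end roughly:
--   1  ADDVN  1  0  ...  -- x + 1 (numeric constant)
--   2  RET1   1  2
-- Exact opcodes and operands depend on the LuaJIT version (see vmdef.bcnames).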
-- Dump generated bytecode
local function dump(func)
return require('jit.bc').dump(func)
end
return {
decode = decode_ins,
decoder = decoder,
dump = dump,
funcinfo = function (...) return jutil.funcinfo(...) end,
}
\ No newline at end of file
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local ffi = require('ffi')
local BPF = ffi.typeof('struct bpf')
ffi.cdef [[
struct sk_buff {
uint32_t len;
uint32_t pkt_type;
uint32_t mark;
uint32_t queue_mapping;
uint32_t protocol;
uint32_t vlan_present;
uint32_t vlan_tci;
uint32_t vlan_proto;
uint32_t priority;
uint32_t ingress_ifindex;
uint32_t ifindex;
uint32_t tc_index;
uint32_t cb[5];
uint32_t hash;
uint32_t tc_classid;
};
struct eth_t {
uint8_t dst[6];
uint8_t src[6];
uint16_t type;
} __attribute__((packed));
struct dot1q_t {
uint16_t pri:3;
uint16_t cfi:1;
uint16_t vlanid:12;
uint16_t type;
} __attribute__((packed));
struct arp_t {
uint16_t htype;
uint16_t ptype;
uint8_t hlen;
uint8_t plen;
uint16_t oper;
uint8_t sha[6];
uint32_t spa;
uint8_t tha[6];
uint32_t tpa;
} __attribute__((packed));
struct ip_t {
uint8_t ver:4;
uint8_t hlen:4;
uint8_t tos;
uint16_t tlen;
uint16_t identification;
uint16_t ffo_unused:1;
uint16_t df:1;
uint16_t mf:1;
uint16_t foffset:13;
uint8_t ttl;
uint8_t proto;
uint16_t hchecksum;
uint32_t src;
uint32_t dst;
} __attribute__((packed));
struct icmp_t {
uint8_t type;
uint8_t code;
uint16_t checksum;
} __attribute__((packed));
struct ip6_t {
uint32_t ver:4;
uint32_t priority:8;
uint32_t flow_label:20;
uint16_t payload_len;
uint8_t next_header;
uint8_t hop_limit;
uint64_t src_hi;
uint64_t src_lo;
uint64_t dst_hi;
uint64_t dst_lo;
} __attribute__((packed));
struct ip6_opt_t {
uint8_t next_header;
uint8_t ext_len;
uint8_t pad[6];
} __attribute__((packed));
struct icmp6_t {
uint8_t type;
uint8_t code;
uint16_t checksum;
} __attribute__((packed));
struct udp_t {
uint16_t src_port;
uint16_t dst_port;
uint16_t length;
uint16_t crc;
} __attribute__((packed));
struct tcp_t {
uint16_t src_port;
uint16_t dst_port;
uint32_t seq_num;
uint32_t ack_num;
uint8_t offset:4;
uint8_t reserved:4;
uint8_t flag_cwr:1;
uint8_t flag_ece:1;
uint8_t flag_urg:1;
uint8_t flag_ack:1;
uint8_t flag_psh:1;
uint8_t flag_rst:1;
uint8_t flag_syn:1;
uint8_t flag_fin:1;
uint16_t rcv_wnd;
uint16_t cksum;
uint16_t urg_ptr;
} __attribute__((packed));
struct vxlan_t {
uint32_t rsv1:4;
uint32_t iflag:1;
uint32_t rsv2:3;
uint32_t rsv3:24;
uint32_t key:24;
uint32_t rsv4:8;
} __attribute__((packed));
]]
-- Architecture-specific ptrace register layout
local S = require('syscall')
local arch = S.abi.arch
local parm_to_reg = {}
if arch == 'x64' then
ffi.cdef [[
struct pt_regs {
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
};]]
parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'}
else
ffi.cdef 'struct pt_regs {};'
end
-- Map symbolic registers to architecture ABI
ffi.metatype('struct pt_regs', {
__index = function (t,k)
return assert(parm_to_reg[k], 'no such register: '..k)
end,
})
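-- Illustrative only: on x64 the symbolic names map to the SysV ABI fields, so
-- ffi.new('struct pt_regs').parm1 resolves to 'di' (first integer argument)
-- and an unknown name raises 'no such register: ...'.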
local M = {}
-- Dissector interface
local function dissector(type, e, dst, src, field)
local parent = e.V[src].const
-- Create new dissector variable
e.vcopy(dst, src)
-- Compute and materialize new dissector offset from parent
e.V[dst].const = {off=e.V[src].const.off, __dissector=e.V[src].const.__dissector}
parent.__dissector[field](e, dst)
e.V[dst].const.__dissector = type
end
M.dissector = dissector
-- Get current effective offset, load field value at an offset relative to it and
-- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen)
local function next_offset(e, var, type, off, mask, shift)
local d = e.V[var].const
-- Materialize relative offset value in R0
local dst_reg, tmp_reg
if d.off then
dst_reg = e.vreg(var, 0, true)
tmp_reg = dst_reg -- Use target register to avoid copy
e.emit(BPF.LD + BPF.ABS + e.const_width[ffi.sizeof(type)], tmp_reg, 0, 0, d.off + (off or 0))
else
tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset
dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var)
e.emit(BPF.LD + BPF.IND + e.const_width[ffi.sizeof(type)], tmp_reg, dst_reg, 0, off or 0)
end
-- Finalize relative offset
if mask then
e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask)
end
if shift then
local op = BPF.LSH
if shift < 0 then
op = BPF.RSH
shift = -shift
end
e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift)
end
-- Add to base offset to turn it into effective address
if dst_reg ~= tmp_reg then
e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0)
else
e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off)
end
-- Discard temporary allocations
d.off = nil
e.V[e.tmpvar].reg = nil
end
local function next_skip(e, var, off)
local d = e.V[var].const
if not d.off then
local dst_reg = e.vreg(var)
e.emit(BPF.ALU64 + BPF.ADD + BPF.K, dst_reg, 0, 0, off)
else
d.off = d.off + off
end
end
local function skip_eth(e, dst)
-- IP starts right after ETH header (fixed size)
local d = e.V[dst].const
d.off = d.off + ffi.sizeof('struct eth_t')
end
-- Export types
M.type = function(typestr, t)
t = t or {}
t.__dissector=ffi.typeof(typestr)
return t
end
M.skb = M.type('struct sk_buff', {__base=true})
M.pt_regs = M.type('struct pt_regs', {__base=true, source='probe'})
M.pkt = {off=0, __dissector=ffi.typeof('struct eth_t')} -- skb needs special accessors
-- M.eth = function (...) return dissector(ffi.typeof('struct eth_t'), ...) end
M.dot1q = function (...) return dissector(ffi.typeof('struct dot1q_t'), ...) end
M.arp = function (...) return dissector(ffi.typeof('struct arp_t'), ...) end
M.icmp = function (...) return dissector(ffi.typeof('struct icmp_t'), ...) end
M.ip = function (...) return dissector(ffi.typeof('struct ip_t'), ...) end
M.icmp6 = function (...) return dissector(ffi.typeof('struct icmp6_t'), ...) end
M.ip6 = function (...) return dissector(ffi.typeof('struct ip6_t'), ...) end
M.ip6_opt = function (...) return dissector(ffi.typeof('struct ip6_opt_t'), ...) end
M.udp = function (...) return dissector(ffi.typeof('struct udp_t'), ...) end
M.tcp = function (...) return dissector(ffi.typeof('struct tcp_t'), ...) end
M.vxlan = function (...) return dissector(ffi.typeof('struct vxlan_t'), ...) end
M.data = function (...) return dissector(ffi.typeof('uint8_t'), ...) end
-- Metatables
ffi.metatype(ffi.typeof('struct eth_t'), {
__index = {
ip = skip_eth,
ip6 = skip_eth,
}
})
ffi.metatype(ffi.typeof('struct ip_t'), {
__index = {
-- Skip past the IP header; its length (ihl) is stored in the low nibble
-- as a number of 32-bit words, e.g. hlen = 5 means 5 x sizeof(u32) = 20 octets
-- Mask the low nibble and shift left by 2 (multiply by 4)
icmp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
udp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
tcp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
}
})
ffi.metatype(ffi.typeof('struct tcp_t'), {
__index = {
-- Skip past the TCP header; the data offset is stored in the high nibble
-- as a number of 32-bit words, so mask 0xf0 and shift right by 2
-- (net effect: offset * 4), e.g. offset = 5 means 20 octets
data = function(e, dst)
next_offset(e, dst, ffi.typeof('uint8_t'), ffi.offsetof('struct tcp_t', 'offset'), 0xf0, -2)
end,
}
})
ffi.metatype(ffi.typeof('struct udp_t'), {
__index = {
-- Skip the fixed-size UDP header (8 octets)
data = function(e, dst)
next_skip(e, dst, ffi.sizeof('struct udp_t'))
end,
}
})
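-- Payload walk sketch (hypothetical filter): pkt.ip.udp.data chains the
-- metatypes above: skip_eth() for the fixed Ethernet header, the ihl-based
-- next_offset() for IP, then a fixed 8-octet next_skip() for UDP, leaving
-- the effective offset at the first payload byte.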
-- Constants
M.c = {
eth = { -- Constants http://standards.ieee.org/regauth/ethertype
ip = 0x0800, -- IP (v4) protocol
ip6 = 0x86dd, -- IP (v6) protocol
arp = 0x0806, -- Address resolution protocol
revarp = 0x8035, -- Reverse addr resolution protocol
vlan = 0x8100, -- IEEE 802.1Q VLAN tagging
},
ip = {
-- Reserved Addresses
addr_any = 0x00000000, -- 0.0.0.0
addr_broadcast = 0xffffffff, -- 255.255.255.255
addr_loopback = 0x7f000001, -- 127.0.0.1
addr_mcast_all = 0xe0000001, -- 224.0.0.1
addr_mcast_local = 0xe00000ff, -- 224.0.0.255
-- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474")
tos_default = 0x00, -- default
tos_lowdelay = 0x10, -- low delay
tos_throughput = 0x08, -- high throughput
tos_reliability = 0x04, -- high reliability
tos_lowcost = 0x02, -- low monetary cost - XXX
tos_ect = 0x02, -- ECN-capable transport
tos_ce = 0x01, -- congestion experienced
-- Fragmentation flags (ip_off)
rf = 0x8000, -- reserved
df = 0x4000, -- don't fragment
mf = 0x2000, -- more fragments (not last frag)
offmask = 0x1fff, -- mask for fragment offset
-- Time-to-live (ip_ttl), seconds
ttl_default = 64, -- default ttl, RFC 1122, RFC 1340
ttl_max = 255, -- maximum ttl
-- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers
proto_ip = 0, -- dummy for IP
proto_hopopts = 0, -- IPv6 hop-by-hop options
proto_icmp = 1, -- ICMP
proto_igmp = 2, -- IGMP
proto_ggp = 3, -- gateway-gateway protocol
proto_ipip = 4, -- IP in IP
proto_st = 5, -- ST datagram mode
proto_tcp = 6, -- TCP
proto_cbt = 7, -- CBT
proto_egp = 8, -- exterior gateway protocol
proto_igp = 9, -- interior gateway protocol
proto_bbnrcc = 10, -- BBN RCC monitoring
proto_nvp = 11, -- Network Voice Protocol
proto_pup = 12, -- PARC universal packet
proto_argus = 13, -- ARGUS
proto_emcon = 14, -- EMCON
proto_xnet = 15, -- Cross Net Debugger
proto_chaos = 16, -- Chaos
proto_udp = 17, -- UDP
proto_mux = 18, -- multiplexing
proto_dcnmeas = 19, -- DCN measurement
proto_hmp = 20, -- Host Monitoring Protocol
proto_prm = 21, -- Packet Radio Measurement
proto_idp = 22, -- Xerox NS IDP
proto_trunk1 = 23, -- Trunk-1
proto_trunk2 = 24, -- Trunk-2
proto_leaf1 = 25, -- Leaf-1
proto_leaf2 = 26, -- Leaf-2
proto_rdp = 27, -- "Reliable Datagram" proto
proto_irtp = 28, -- Inet Reliable Transaction
proto_tp = 29, -- ISO TP class 4
proto_netblt = 30, -- Bulk Data Transfer
proto_mfpnsp = 31, -- MFE Network Services
proto_meritinp= 32, -- Merit Internodal Protocol
proto_sep = 33, -- Sequential Exchange proto
proto_3pc = 34, -- Third Party Connect proto
proto_idpr = 35, -- Interdomain Policy Route
proto_xtp = 36, -- Xpress Transfer Protocol
proto_ddp = 37, -- Datagram Delivery Proto
proto_cmtp = 38, -- IDPR Ctrl Message Trans
proto_tppp = 39, -- TP++ Transport Protocol
proto_il = 40, -- IL Transport Protocol
proto_ip6 = 41, -- IPv6
proto_sdrp = 42, -- Source Demand Routing
proto_routing = 43, -- IPv6 routing header
proto_fragment= 44, -- IPv6 fragmentation header
proto_rsvp = 46, -- Reservation protocol
proto_gre = 47, -- General Routing Encap
proto_mhrp = 48, -- Mobile Host Routing
proto_ena = 49, -- ENA
proto_esp = 50, -- Encap Security Payload
proto_ah = 51, -- Authentication Header
proto_inlsp = 52, -- Integrated Net Layer Sec
proto_swipe = 53, -- SWIPE
proto_narp = 54, -- NBMA Address Resolution
proto_mobile = 55, -- Mobile IP, RFC 2004
proto_tlsp = 56, -- Transport Layer Security
proto_skip = 57, -- SKIP
proto_icmp6 = 58, -- ICMP for IPv6
proto_none = 59, -- IPv6 no next header
proto_dstopts = 60, -- IPv6 destination options
proto_anyhost = 61, -- any host internal proto
proto_cftp = 62, -- CFTP
proto_anynet = 63, -- any local network
proto_expak = 64, -- SATNET and Backroom EXPAK
proto_kryptolan = 65, -- Kryptolan
proto_rvd = 66, -- MIT Remote Virtual Disk
proto_ippc = 67, -- Inet Pluribus Packet Core
proto_distfs = 68, -- any distributed fs
proto_satmon = 69, -- SATNET Monitoring
proto_visa = 70, -- VISA Protocol
proto_ipcv = 71, -- Inet Packet Core Utility
proto_cpnx = 72, -- Comp Proto Net Executive
proto_cphb = 73, -- Comp Protocol Heart Beat
proto_wsn = 74, -- Wang Span Network
proto_pvp = 75, -- Packet Video Protocol
proto_brsatmon= 76, -- Backroom SATNET Monitor
proto_sunnd = 77, -- SUN ND Protocol
proto_wbmon = 78, -- WIDEBAND Monitoring
proto_wbexpak = 79, -- WIDEBAND EXPAK
proto_eon = 80, -- ISO CNLP
proto_vmtp = 81, -- Versatile Msg Transport
proto_svmtp = 82, -- Secure VMTP
proto_vines = 83, -- VINES
proto_ttp = 84, -- TTP
proto_nsfigp = 85, -- NSFNET-IGP
proto_dgp = 86, -- Dissimilar Gateway Proto
proto_tcf = 87, -- TCF
proto_eigrp = 88, -- EIGRP
proto_ospf = 89, -- Open Shortest Path First
proto_spriterpc= 90, -- Sprite RPC Protocol
proto_larp = 91, -- Locus Address Resolution
proto_mtp = 92, -- Multicast Transport Proto
proto_ax25 = 93, -- AX.25 Frames
proto_ipipencap= 94, -- yet-another IP encap
proto_micp = 95, -- Mobile Internet Ctrl
proto_sccsp = 96, -- Semaphore Comm Sec Proto
proto_etherip = 97, -- Ethernet in IPv4
proto_encap = 98, -- encapsulation header
proto_anyenc = 99, -- private encryption scheme
proto_gmtp = 100, -- GMTP
proto_ifmp = 101, -- Ipsilon Flow Mgmt Proto
proto_pnni = 102, -- PNNI over IP
proto_pim = 103, -- Protocol Indep Multicast
proto_aris = 104, -- ARIS
proto_scps = 105, -- SCPS
proto_qnx = 106, -- QNX
proto_an = 107, -- Active Networks
proto_ipcomp = 108, -- IP Payload Compression
proto_snp = 109, -- Sitara Networks Protocol
proto_compaqpeer= 110, -- Compaq Peer Protocol
proto_ipxip = 111, -- IPX in IP
proto_vrrp = 112, -- Virtual Router Redundancy
proto_pgm = 113, -- PGM Reliable Transport
proto_any0hop = 114, -- 0-hop protocol
proto_l2tp = 115, -- Layer 2 Tunneling Proto
proto_ddx = 116, -- D-II Data Exchange (DDX)
proto_iatp = 117, -- Interactive Agent Xfer
proto_stp = 118, -- Schedule Transfer Proto
proto_srp = 119, -- SpectraLink Radio Proto
proto_uti = 120, -- UTI
proto_smp = 121, -- Simple Message Protocol
proto_sm = 122, -- SM
proto_ptp = 123, -- Performance Transparency
proto_isis = 124, -- ISIS over IPv4
proto_fire = 125, -- FIRE
proto_crtp = 126, -- Combat Radio Transport
proto_crudp = 127, -- Combat Radio UDP
proto_sscopmce= 128, -- SSCOPMCE
proto_iplt = 129, -- IPLT
proto_sps = 130, -- Secure Packet Shield
proto_pipe = 131, -- Private IP Encap in IP
proto_sctp = 132, -- Stream Ctrl Transmission
proto_fc = 133, -- Fibre Channel
proto_rsvpign = 134, -- RSVP-E2E-IGNORE
proto_raw = 255, -- Raw IP packets
proto_reserved= 255, -- Reserved
},
}
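-- Usage sketch: this table is exposed to filters as the 'c' global (see
-- .luacheckrc), so a hypothetical filter matching TCP packets reads:
--   bpf(function () return pkt.ip.proto == c.ip.proto_tcp end)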
return M
Module "bcc.tracerpipe" "bcc/tracerpipe.lua"
Module "bcc.table" "bcc/table.lua"
Module "bcc.usdt" "bcc/usdt.lua"
Module "bpf" "bpf/init.lua"
Module "bpf.bpf" "bpf/bpf.lua"
Module "bpf.builtins" "bpf/builtins.lua"
Module "bpf.cdef" "bpf/cdef.lua"
Module "bpf.elf" "bpf/elf.lua"
Module "bpf.ljbytecode" "bpf/ljbytecode.lua"
Module "bpf.proto" "bpf/proto.lua"
Main "bcc/run.lua"
Output "bcc.lua"
-- Configuration for unit tests
-- See: http://olivinelabs.com/busted/
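-- Busted picks this file up automatically when run from this directory;
-- lpath adds the source tree to the module search path so specs can
-- require the in-tree modules directly.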
return {
default = {
lpath = "./?.lua",
["auto-insulate"] = false,
}
}
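-- Luacheck configuration (read as .luacheckrc): "std" selects the LuaJIT
-- standard globals, and the ignored codes silence warnings about unused
-- or shadowed locals/arguments and empty branches.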
std = "luajit"
ignore = { "211", "212", "411", "412", "421", "431", "542" }
files["examples"] = {
new_globals = { "pkt", "time", "xadd", "c" }
}
files["bpf/builtins.lua"] = {
ignore = { "122" }
}
files["spec"] = {
std = "+busted",
new_globals = { "pkt", "time", "xadd", "c" }
}
find_program(LUAJIT luajit)
find_program(BUSTED busted)
if(LUAJIT)
add_test(NAME lua_test_clang WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
add_test(NAME lua_test_standalone WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/test_standalone.sh)
if(BUSTED)
add_test(NAME lua_test_busted WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND busted --lua=${LUAJIT} -m "${CMAKE_CURRENT_SOURCE_DIR}/../../src/lua/?.lua" -m "${CMAKE_CURRENT_SOURCE_DIR}/../../src/lua/?/init.lua;")
endif()
endif()
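# Assuming a configured build tree, the Lua cases registered above can be
# run on their own with, e.g., `ctest -R lua_test`.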
# Unit test specs
This directory contains spec files for Lua BPF in [Busted] unit test format.
[Busted]: http://olivinelabs.com/busted/
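
For orientation, each spec is a plain Lua file built from `describe` and `it`
blocks (a minimal, hypothetical example; the real specs live alongside this
file):

```lua
describe('example', function()
  it('adds numbers', function()
    assert.same(2, 1 + 1)
  end)
end)
```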
describe('compile', function()
local ffi = require('ffi')
local bpf = require('bpf')
it('can compile socket filter', function()
-- Create mock BPF map
local mock_map = {
max_entries = 16,
key_type = ffi.typeof('uint64_t [1]'),
val_type = ffi.typeof('uint64_t [1]'),
fd = 1,
__map = true,
}
-- Compile small code example
local code = bpf(function ()
local proto = pkt.ip.proto
xadd(mock_map[proto], 1)
end)
assert.truthy(code)
assert.same(type(code), 'table')
assert.same(code.pc, 15)
end)
end)
describe('decoder', function()
-- Decode simple function
local bytecode = require('bpf.ljbytecode')
local f = function (x) return x + 1 end
it('should decode functions', function()
-- Make sure it calls LJ decoder
local bc = bytecode.decoder(f)
assert.truthy(bc)
-- Decode bytecode to instructions
local jutil = require("jit.util")
spy.on(jutil, 'funcbc')
local pc, op = bc()
-- Check bytecode for sanity (starts with ADDVN(x, 1))
assert.equal(pc, 1)
assert.equal(op, 'ADDVN')
for pc, op in bc do
assert.truthy(pc and op)
end
assert.spy(jutil.funcbc).was.called()
end)
it('should fail on bad input', function()
assert.has_error(function() bytecode.decoder(nil)() end)
assert.has_error(function() bytecode.decoder(5)() end)
assert.has_error(function() bytecode.decoder('test')() end)
end)
it('should dump bytecode', function()
bytecode.dump(f)
end)
end)
describe('elf reader', function()
local ok, elf = pcall(require, 'bpf.elf')
if not ok then return end
it('should handle C library', function()
-- Open a well-known ELF binary (the shell, which links against libc)
local sh = elf.open('/bin/sh')
assert.truthy(sh)
-- Find load address
local base = sh:loadaddr()
assert.truthy(base)
-- Find something from ISO C
local malloc_addr = sh:resolve('malloc')
assert.truthy(malloc_addr)
-- Find something that doesn't exist
local bad_addr = sh:resolve('thisnotexists')
assert.falsy(bad_addr)
end)
it('should fail on bad input', function()
assert.falsy(elf.open(nil))
assert.falsy(elf.open('/tmp'):loadaddr())
end)
end)