Commit 8ca47e00 authored by Rusty Russell, committed by Linus Torvalds

lguest: the documentation, example launcher

A brief document describing how to use lguest.  Because lguest doesn't have an
ABI we also include an example launcher in the Documentation directory.

[jmorris@namei.org: Fix up nat example in documentation]
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: James Morris <jmorris@namei.org>
Cc: Matias Zabaljauregui <matias.zabaljauregui@cern.ch>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent b754416b
# This creates the demonstration utility "lguest" which runs a Linux guest.

# For those people that have a separate object dir, look there for .config
KBUILD_OUTPUT := ../..
ifdef O
ifeq ("$(origin O)", "command line")
KBUILD_OUTPUT := $(O)
endif
endif

# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
include $(KBUILD_OUTPUT)/.config
LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)

CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
LDLIBS:=-lz

# "all" and "clean" name actions, not files: keep a stray file of the same
# name from silently disabling them.
.PHONY: all clean

all: lguest.lds lguest

# The linker script on x86 is so complex the only way of creating one
# which will link our binary in the right place is to mangle the
# default one.
lguest.lds:
	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@

clean:
	rm -f lguest.lds lguest
/* Simple program to layout "physical" memory for new lguest guest.
* Linked high to avoid likely physical memory. */
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <time.h>
#include <netinet/in.h>
#include <net/if.h>
#include <linux/sockios.h>
#include <linux/if_tun.h>
#include <sys/uio.h>
#include <termios.h>
#include <getopt.h>
#include <zlib.h>
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
#include "../../include/linux/lguest_launcher.h"
#include "../../include/asm-i386/e820.h"
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define NET_PEERNUM 1
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
#endif
/* Set when the --verbose / -v option is given; gates all verbose() output. */
static bool verbose;
/* printf()-style tracing which only prints when the verbose flag is set. */
#define verbose(args...) \
do { if (verbose) printf(args); } while(0)
/* Descriptor returned by setup_waker(): the write end of the pipe to the
 * waker child.  Closing it makes the waker exit. */
static int waker_fd;
/* All the devices we emulate, plus the descriptors we watch on their behalf. */
struct device_list
{
/* Descriptors registered via set_fd(), i.e. those with an input handler. */
fd_set infds;
/* Highest descriptor in infds (main() starts it at -1); select() is called
 * with max_infd+1. */
int max_infd;
/* Head of the singly-linked list of devices. */
struct device *dev;
/* Address of the list's terminating NULL link, so new_device() can append
 * without walking the list. */
struct device **lastdev;
};
/* One emulated device as seen by the launcher. */
struct device
{
/* Next device in the device_list, or NULL for the last one. */
struct device *next;
/* Descriptor for this device; map_device_descriptors() later repoints it
 * at the copy inside the shared descriptor page. */
struct lguest_device_desc *desc;
/* Start of the device's own page(s), computed from desc->pfn. */
void *mem;
/* Watch this fd if handle_input non-NULL. */
int fd;
/* Called by handle_input() when fd is readable; returning false makes the
 * caller stop watching the fd. */
bool (*handle_input)(int fd, struct device *me);
/* Watch DMA to this key if handle_output non-NULL. */
unsigned long watch_key;
/* Called by handle_output() for DMA sent to watch_key; the return value is
 * stored in the sender's used_len. */
u32 (*handle_output)(int fd, const struct iovec *iov,
unsigned int num, struct device *me);
/* Device-specific data. */
void *priv;
};
/* Open @name with @flags and return the descriptor.  Any failure is fatal:
 * the error is reported and the program exits with status 1. */
static int open_or_die(const char *name, int flags)
{
	const int fd = open(name, flags);

	if (fd >= 0)
		return fd;

	err(1, "Failed to open %s", name);
}
/* Map @num private, zero-filled pages at exactly @addr (MAP_FIXED) with
 * read/write/execute permission and return @addr.  /dev/zero is opened on
 * the first call and the descriptor is reused afterwards.  Failure to map at
 * the requested address is fatal. */
static void *map_zeroed_pages(unsigned long addr, unsigned int num)
{
	static int zero_fd = -1;
	void *const want = (void *)addr;
	void *got;

	if (zero_fd == -1)
		zero_fd = open_or_die("/dev/zero", O_RDONLY);

	got = mmap(want, getpagesize() * num,
		   PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE,
		   zero_fd, 0);
	if (got != want)
		err(1, "Mmaping %u pages of /dev/zero @%p", num, want);

	return want;
}
/* Find magic string marking entry point, return entry point.
 *
 * Scans [@start, @end) for the marker "GenuineLguest".  The returned value is
 * the address of the byte just after the marker, plus @page_offset.
 *
 * The scan stops once fewer bytes than the marker remain, so it never reads
 * beyond @end.  A missing marker is fatal; errx() is used because no system
 * call has failed and errno would be meaningless. */
static unsigned long entry_point(void *start, void *end,
				 unsigned long page_offset)
{
	static const char magic[] = "GenuineLguest";
	const size_t magic_len = sizeof(magic) - 1;
	const char *limit = end;
	const char *p;

	for (p = start; p < limit && (size_t)(limit - p) >= magic_len; p++)
		if (memcmp(p, magic, magic_len) == 0)
			return (long)p + magic_len + page_offset;

	errx(1, "Is this image a genuine lguest?");
}
/* Map the PT_LOAD segments of an ELF kernel image and return its entry point.
 *
 * @elf_fd:      descriptor of the image; @ehdr has already been read from it.
 * @ehdr:        the ELF header.
 * @page_offset: set to (p_vaddr - p_paddr) of the loadable segments, which
 *               must be the same for all of them.
 *
 * Each segment is mapped privately at the address given by its p_paddr.
 * Any malformed header or failed system call is fatal. */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
			     unsigned long *page_offset)
{
	unsigned long start = -1UL, end = 0;
	unsigned int i;
	void *addr;

	/* Sanity checks. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/* Only now is e_phnum known to be non-zero and small enough for an
	 * on-stack table; sizing the array before the check would defeat it. */
	Elf32_Phdr phdr[ehdr->e_phnum];

	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	*page_offset = 0;
	/* We map the loadable segments at virtual addresses corresponding
	 * to their physical addresses (our virtual == guest physical). */
	for (i = 0; i < ehdr->e_phnum; i++) {
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* We expect linear address space. */
		if (!*page_offset)
			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
			errx(1, "Page offset of section %i different", i);

		/* Track the extent of file-backed data: that is the range
		 * later searched for the entry marker. */
		if (phdr[i].p_paddr < start)
			start = phdr[i].p_paddr;
		if (phdr[i].p_paddr + phdr[i].p_filesz > end)
			end = phdr[i].p_paddr + phdr[i].p_filesz;

		/* We map everything private, writable. */
		addr = mmap((void *)phdr[i].p_paddr,
			    phdr[i].p_filesz,
			    PROT_READ|PROT_WRITE|PROT_EXEC,
			    MAP_FIXED|MAP_PRIVATE,
			    elf_fd, phdr[i].p_offset);
		if (addr != (void *)phdr[i].p_paddr)
			err(1, "Mmaping vmlinux seg %i gave %p not %p",
			    i, addr, (void *)phdr[i].p_paddr);
	}

	return entry_point((void *)start, (void *)end, *page_offset);
}
/* Guess the kernel's page offset from its code.  This is amazingly reliable.
 *
 * Every 0xA1 byte ("mov 0xXXXXXXXX,%eax") is treated as an opcode whose
 * fourth following byte is the top byte of an absolute address.  The first
 * top byte seen more than three times wins and is returned shifted into
 * bits 24-31.  Running out of image first is fatal. */
static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
{
	unsigned int votes[256] = { 0 };
	unsigned int pos = 0;

	while (pos + 4 < len) {
		const unsigned char *insn = &img[pos++];

		if (insn[0] != 0xA1)
			continue;
		if (++votes[insn[4]] > 3)
			return (unsigned long)insn[4] << 24;
	}

	errx(1, "could not determine page offset");
}
/* Decompress the gzip stream at the current position of @fd to address
 * 0x100000, guess the page offset from the unpacked code, store it in
 * @page_offset and return the kernel entry point.  A decompression error
 * is fatal. */
static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
{
	void *const img = (void *)0x100000;
	gzFile gz = gzdopen(fd, "rb");
	int total = 0;
	int got;

	/* Inflate in 64k chunks until the stream ends (0) or fails (<0). */
	for (;;) {
		got = gzread(gz, img + total, 65536);
		if (got <= 0)
			break;
		total += got;
	}
	if (got < 0)
		err(1, "reading image from bzImage");

	verbose("Unpacked size %i addr %p\n", total, img);
	*page_offset = intuit_page_offset(img, total);

	return entry_point(img, img + total, *page_offset);
}
/* Locate the gzip-compressed kernel inside a bzImage and unpack it.
 *
 * @fd is read a byte at a time looking for a ten-byte gzip header: the magic
 * 0x1F 0x8B, seven bytes we do not inspect, then the OS byte 0x03 (UNIX).
 * On a match the file is rewound to the start of the header and handed to
 * unpack_bzimage(), whose result (the entry point) is returned with
 * @page_offset filled in.
 *
 * A candidate whose OS byte is wrong is a false positive: the scan resumes
 * at the byte after that candidate's 0x1F, so a real header that begins
 * inside the rejected bytes is still found.  Reaching end of file without a
 * match is fatal. */
static unsigned long load_bzimage(int fd, unsigned long *page_offset)
{
	unsigned char c;
	int state = 0;	/* Number of header bytes matched so far. */

	/* Ugly brute force search for gzip header. */
	while (read(fd, &c, 1) == 1) {
		switch (state) {
		case 0:
			if (c == 0x1F)
				state = 1;
			break;
		case 1:
			if (c == 0x8B)
				state = 2;
			else if (c != 0x1F)
				state = 0;
			/* A repeated 0x1F may itself start the magic. */
			break;
		case 2 ... 8:
			state++;
			break;
		case 9:
			if (c == 0x03) { /* Compressed under UNIX. */
				if (lseek(fd, -10, SEEK_CUR) < 0)
					err(1, "Seeking back to gzip header");
				return unpack_bzimage(fd, page_offset);
			}
			/* Ten bytes consumed: step back nine to rescan from
			 * just after the 0x1F which misled us. */
			if (lseek(fd, -9, SEEK_CUR) < 0)
				err(1, "Seeking back in bzImage");
			state = 0;
			break;
		default:
			state = 0;
			break;
		}
	}
	errx(1, "Could not find kernel in bzImage");
}
/* Load the kernel image open on @fd, set @page_offset and return the entry
 * point.  The first sizeof(Elf32_Ehdr) bytes decide the format: an ELF magic
 * selects map_elf(), anything else is treated as a bzImage. */
static unsigned long load_kernel(int fd, unsigned long *page_offset)
{
	Elf32_Ehdr hdr;

	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	return memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0
		? map_elf(fd, &hdr, page_offset)
		: load_bzimage(fd, page_offset);
}
/* Round @addr up to the next multiple of the system page size. */
static inline unsigned long page_align(unsigned long addr)
{
	const unsigned long mask = getpagesize() - 1;

	return (addr + mask) & ~mask;
}
/* initrd gets loaded at top of memory: return length.
 *
 * The file @name is mapped privately so that its page-aligned length ends
 * exactly at @mem.  The value returned is that page-aligned length.  Any
 * failure is fatal. */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	struct stat st;
	unsigned long len;
	void *want, *iaddr;
	int ifd = open_or_die(name, O_RDONLY);

	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	len = page_align(st.st_size);
	want = (void *)mem - len;

	iaddr = mmap(want, st.st_size,
		     PROT_READ|PROT_EXEC|PROT_WRITE,
		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
	if (iaddr != want)
		err(1, "Mmaping initrd '%s' returned %p not %p",
		    name, iaddr, want);

	/* The mapping stays valid after the descriptor is closed. */
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);

	return len;
}
static unsigned long setup_pagetables(unsigned long mem,
unsigned long initrd_size,
unsigned long page_offset)
{
u32 *pgdir, *linear;
unsigned int mapped_pages, i, linear_pages;
unsigned int ptes_per_page = getpagesize()/sizeof(u32);
/* If we can map all of memory above page_offset, we do so. */
if (mem <= -page_offset)
mapped_pages = mem/getpagesize();
else
mapped_pages = -page_offset/getpagesize();
/* Each linear PTE page can map ptes_per_page pages. */
linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
/* We lay out top-level then linear mapping immediately below initrd */
pgdir = (void *)mem - initrd_size - getpagesize();
linear = (void *)pgdir - linear_pages*getpagesize();
for (i = 0; i < mapped_pages; i++)
linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
/* Now set up pgd so that this memory is at page_offset */
for (i = 0; i < mapped_pages; i += ptes_per_page) {
pgdir[(i + page_offset/getpagesize())/ptes_per_page]
= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
}
verbose("Linear mapping of %u pages in %u pte pages at %p\n",
mapped_pages, linear_pages, linear);
return (unsigned long)pgdir;
}
/* Join the NULL-terminated string array @args into @dst, writing a single
 * space after every element (including the last) and a final NUL.  An empty
 * array produces the empty string.  @dst must be large enough; no bound is
 * checked. */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		const size_t n = strlen(*arg);

		memcpy(out, *arg, n);
		out[n] = ' ';
		out[n + 1] = '\0';
		out += n + 1;
	}
	/* In case it's empty. */
	*out = '\0';
}
/* Open /dev/lguest and write the LHREQ_INITIALIZE request describing the new
 * guest: the page limit (LGUEST_GUEST_TOP in pages), the page directory, the
 * entry point and the page offset.  Returns the open descriptor; failure is
 * fatal. */
static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
{
	u32 request[] = {
		LHREQ_INITIALIZE,
		LGUEST_GUEST_TOP/getpagesize(),	/* Just below us */
		pgdir,
		start,
		page_offset,
	};
	const int lguest_fd = open_or_die("/dev/lguest", O_RDWR);

	if (write(lguest_fd, request, sizeof(request)) < 0)
		err(1, "Writing to /dev/lguest");

	return lguest_fd;
}
/* Add @fd to the set of descriptors watched for input, raising the recorded
 * maximum descriptor if necessary. */
static void set_fd(int fd, struct device_list *devices)
{
	FD_SET(fd, &devices->infds);
	devices->max_infd = (fd > devices->max_infd) ? fd : devices->max_infd;
}
/* When input arrives, we tell the kernel to kick lguest out with -EAGAIN.
 *
 * Body of the waker child; it never returns.  It waits on every input
 * descriptor plus @pipefd:
 *  - a device descriptor becoming readable sends LHREQ_BREAK (1) to
 *    @lguest_fd;
 *  - an int arriving on @pipefd names a descriptor to stop watching;
 *  - end-of-file on @pipefd means the parent has gone, so we exit.
 *
 * A failed or short read from the pipe is ignored rather than acted on, so
 * FD_CLR never sees an uninitialised or out-of-range descriptor. */
static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
{
	set_fd(pipefd, devices);

	for (;;) {
		fd_set rfds = devices->infds;
		u32 args[] = { LHREQ_BREAK, 1 };

		select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
		if (FD_ISSET(pipefd, &rfds)) {
			int ignorefd;
			ssize_t got = read(pipefd, &ignorefd, sizeof(ignorefd));

			if (got == 0)
				exit(0);
			if (got != (ssize_t)sizeof(ignorefd))
				continue;
			if (ignorefd >= 0 && ignorefd < FD_SETSIZE)
				FD_CLR(ignorefd, &devices->infds);
		} else
			/* Best effort: there is nobody to report failure to. */
			write(lguest_fd, args, sizeof(args));
	}
}
/* Fork the waker child and return the write end of the pipe connected to it.
 *
 * The child runs wake_parent() on the read end and never returns.  The parent
 * keeps the write end: writing a descriptor number tells the waker to stop
 * watching it, and closing the pipe makes the waker exit.
 *
 * Failure to create the pipe or to fork is fatal. */
static int setup_waker(int lguest_fd, struct device_list *device_list)
{
	int pipefd[2], child;

	if (pipe(pipefd) != 0)
		err(1, "creating waker pipe");

	child = fork();
	if (child == -1)
		err(1, "forking");

	if (child == 0) {
		/* Child: keep only the read end. */
		close(pipefd[1]);
		wake_parent(pipefd[0], lguest_fd, device_list);
	}

	/* Parent: keep only the write end. */
	close(pipefd[0]);

	return pipefd[1];
}
/* Validate a guest-supplied address range and return it as a pointer.
 *
 * [@addr, @addr + @size] must lie strictly below LGUEST_GUEST_TOP, otherwise
 * the launcher exits, reporting the source line @line of the caller.
 *
 * The range test is written as a subtraction so that a huge @size cannot
 * wrap "addr + size" around to a small value and slip past the check. */
static void *_check_pointer(unsigned long addr, unsigned int size,
			    unsigned int line)
{
	const unsigned long top = LGUEST_GUEST_TOP;

	if (addr >= top || size >= top - addr)
		errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
	return (void *)addr;
}
/* Convenience wrapper which records the calling line for the error message. */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
/* Convert the guest "struct lguest_dma" at address @dma into an iovec array.
 *
 * Sections are copied into @iov until a zero length or
 * LGUEST_MAX_DMA_SECTIONS is reached; every address is validated with
 * check_pointer().  @num receives the number of sections.
 *
 * Returns pointer to dma->used_len */
static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
{
	struct lguest_dma *udma = check_pointer(dma, sizeof(*udma));
	unsigned int used = 0;

	while (used < LGUEST_MAX_DMA_SECTIONS && udma->len[used]) {
		iov[used].iov_base = check_pointer(udma->addr[used],
						   udma->len[used]);
		iov[used].iov_len = udma->len[used];
		used++;
	}
	*num = used;

	return &udma->used_len;
}
/* Ask the kernel (LHREQ_GETDMA) for a DMA buffer bound to @key.
 *
 * On success @iov/@num describe the buffer, @irq receives the interrupt to
 * raise once it has been filled, and the address of the buffer's used_len is
 * returned.  Returns NULL if the request fails. */
static u32 *get_dma_buffer(int fd, void *key,
			   struct iovec iov[], unsigned int *num, u32 *irq)
{
	u32 request[] = { LHREQ_GETDMA, (u32)key };
	unsigned long udma = write(fd, request, sizeof(request));
	u32 *used_len;

	if (udma == (unsigned long)-1)
		return NULL;

	/* Kernel stashes irq in ->used_len. */
	used_len = dma2iov(udma, iov, num);
	*irq = *used_len;

	return used_len;
}
/* Send an LHREQ_IRQ request for interrupt @irq to the lguest descriptor
 * @fd.  Any non-zero result from the write is fatal. */
static void trigger_irq(int fd, u32 irq)
{
	u32 request[] = { LHREQ_IRQ, irq };
	const ssize_t rc = write(fd, request, sizeof(request));

	if (rc != 0)
		err(1, "Triggering irq %i", irq);
}
/* Point @iov at a single 1024-byte static scratch buffer and set @num to 1,
 * so that pending input can be read and thrown away. */
static void discard_iovec(struct iovec *iov, unsigned int *num)
{
	static char discard_buf[1024];

	*iov = (struct iovec) {
		.iov_base = discard_buf,
		.iov_len = sizeof(discard_buf),
	};
	*num = 1;
}
/* Terminal settings of stdin as they were before setup_console() changed
 * them. */
static struct termios orig_term;

/* Put stdin's terminal settings back to the saved ones, immediately. */
static void restore_term(void)
{
	const struct termios *saved = &orig_term;

	tcsetattr(STDIN_FILENO, TCSANOW, saved);
}
/* Per-console state for the "three ^C within one second" exit sequence
 * implemented in handle_console_input(). */
struct console_abort
{
/* Consecutive ^C characters seen; reset by any other input. */
int count;
/* Time at which the first ^C of the current run arrived. */
struct timeval start;
};
/* We DMA input to buffer bound at start of console page.
 *
 * Input handler for the console.  Reads from the console descriptor into the
 * guest's buffer (or into a scratch buffer if the guest has none) and raises
 * the buffer's interrupt.  Three ^C characters, each arriving as a lone byte,
 * within one second make the launcher exit with status 2.
 *
 * Returns false, after restoring the terminal, when nothing could be read so
 * that the caller stops watching the console; true otherwise. */
static bool handle_console_input(int fd, struct device *dev)
{
	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
	struct console_abort *ctrl_c = dev->priv;
	unsigned int num;
	u32 irq = 0;
	u32 *lenp;
	int len;
	bool is_ctrl_c;

	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
	if (lenp == NULL) {
		warn("console: no dma buffer!");
		discard_iovec(iov, &num);
	}

	len = readv(dev->fd, iov, num);
	if (len <= 0) {
		warnx("Failed to get console input, ignoring console.");
		len = 0;
	}

	if (lenp != NULL) {
		*lenp = len;
		trigger_irq(fd, irq);
	}

	/* Three ^C within one second? Exit. */
	is_ctrl_c = (len == 1 && ((char *)iov[0].iov_base)[0] == 3);
	if (!is_ctrl_c) {
		ctrl_c->count = 0;
	} else if (!ctrl_c->count++) {
		/* First of a run: start the clock. */
		gettimeofday(&ctrl_c->start, NULL);
	} else if (ctrl_c->count == 3) {
		struct timeval now;

		gettimeofday(&now, NULL);
		if (now.tv_sec <= ctrl_c->start.tv_sec+1) {
			/* Make sure waker is not blocked in BREAK */
			u32 args[] = { LHREQ_BREAK, 0 };

			close(waker_fd);
			write(fd, args, sizeof(args));
			exit(2);
		}
		ctrl_c->count = 0;
	}

	if (len == 0) {
		restore_term();
		return false;
	}
	return true;
}
/* Output handler for the console: write the guest's buffers to stdout and
 * return writev()'s result as the used length. */
static u32 handle_console_output(int fd, const struct iovec *iov,
				 unsigned num, struct device*dev)
{
	const ssize_t written = writev(STDOUT_FILENO, iov, num);

	return written;
}
/* Output handler for the tun device: write the guest's packet to the tap
 * descriptor and return writev()'s result as the used length. */
static u32 handle_tun_output(int fd, const struct iovec *iov,
			     unsigned num, struct device *dev)
{
	bool *seen_output = dev->priv;

	/* Now we've seen output, we should warn if we can't get buffers. */
	*seen_output = true;

	return writev(dev->fd, iov, num);
}
/* Byte offset associated with peer number @peernum: four bytes per peer. */
static unsigned long peer_offset(unsigned int peernum)
{
	const unsigned int bytes_per_peer = 4;

	return bytes_per_peer * peernum;
}
static bool handle_tun_input(int fd, struct device *dev)
{
u32 irq = 0, *lenp;
int len;
unsigned num;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
&irq);
if (!lenp) {
if (*(bool *)dev->priv)
warn("network: no dma buffer!");
discard_iovec(iov, &num);
}
len = readv(dev->fd, iov, num);
if (len <= 0)
err(1, "reading network");
if (lenp) {
*lenp = len;
trigger_irq(fd, irq);
}
verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
lenp ? "sent" : "discarded");
return true;
}
/* Output handler for the block device: service one request.
 *
 * The request (type, sector, bytes) is read from the device page.  For a
 * write (non-zero type) the guest's outgoing buffers @iov are written to the
 * backing file; for a read, data is read from the file into the buffer the
 * guest has bound to the device page.  The result code is stored in the
 * device page and the buffer's interrupt is raised.
 *
 * Always returns 0.  Bad offsets, a missing reply buffer, and writes past the
 * end of the device are fatal. */
static u32 handle_block_output(int fd, const struct iovec *iov,
unsigned num, struct device *dev)
{
struct lguest_block_page *p = dev->mem;
u32 irq, *lenp;
unsigned int len, reply_num;
struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
/* Sectors are 512 bytes; widen before multiplying. */
off64_t device_len, off = (off64_t)p->sector * 512;
/* priv holds the device length recorded by setup_block_file(). */
device_len = *(off64_t *)dev->priv;
if (off >= device_len)
err(1, "Bad offset %llu vs %llu", off, device_len);
if (lseek64(dev->fd, off, SEEK_SET) != off)
err(1, "Bad seek to sector %i", p->sector);
verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
/* Both reads and writes are answered through the reply buffer. */
lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
if (!lenp)
err(1, "Block request didn't give us a dma buffer");
if (p->type) {
/* NOTE(review): len is unsigned, so a -1 from writev() becomes a huge
 * value and is reported below as a write past the end - confirm whether
 * I/O errors should be handled separately. */
len = writev(dev->fd, iov, num);
if (off + len > device_len) {
/* Undo any growth of the backing file before bailing out. */
ftruncate(dev->fd, device_len);
errx(1, "Write past end %llu+%u", off, len);
}
*lenp = 0;
} else {
len = readv(dev->fd, reply, reply_num);
*lenp = len;
}
/* Result is 1 when exactly p->bytes were transferred, 2 otherwise. */
p->result = 1 + (p->bytes != len);
trigger_irq(fd, irq);
return 0;
}
/* Dispatch a DMA the guest has sent.
 *
 * @dma is the address of the guest's struct lguest_dma and @key the key it
 * was sent to.  The first device with an output handler watching @key gets
 * the buffers, and the handler's return value is stored in the DMA's
 * used_len.  If no device claims the key a warning is printed. */
static void handle_output(int fd, unsigned long dma, unsigned long key,
			  struct device_list *devices)
{
	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
	unsigned num = 0;
	u32 *lenp = dma2iov(dma, iov, &num);
	struct device *dev;

	for (dev = devices->dev; dev; dev = dev->next) {
		if (!dev->handle_output || key != dev->watch_key)
			continue;
		*lenp = dev->handle_output(fd, iov, num, dev);
		return;
	}

	warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
}
/* Service every device whose descriptor currently has input.
 *
 * Polls the watched descriptors with a zero timeout and calls the input
 * handler of each readable device, repeating until nothing is readable.  A
 * handler returning false has its descriptor removed from the watch set, and
 * the waker is told (through waker_fd) to drop it as well. */
static void handle_input(int fd, struct device_list *devices)
{
	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };

	for (;;) {
		fd_set fds = devices->infds;
		struct device *dev;

		if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
			break;

		for (dev = devices->dev; dev; dev = dev->next) {
			if (!dev->handle_input || !FD_ISSET(dev->fd, &fds))
				continue;
			if (dev->handle_input(fd, dev))
				continue;

			FD_CLR(dev->fd, &devices->infds);
			/* Tell waker to ignore it too... */
			write(waker_fd, &dev->fd, sizeof(dev->fd));
		}
	}
}
/* Allocate and fill in a device descriptor.
 *
 * @type, @features and @num_pages are stored as given; status starts at 0.
 * If @num_pages is non-zero, that many zeroed pages are mapped immediately
 * below the previous device's pages (starting from LGUEST_GUEST_TOP) and the
 * descriptor's pfn refers to the first of them; otherwise pfn is 0.
 *
 * The descriptor is heap-allocated and owned by the caller.  Running out of
 * memory is fatal. */
static struct lguest_device_desc *new_dev_desc(u16 type, u16 features,
					       u16 num_pages)
{
	/* Next free address for device pages; grows downwards. */
	static unsigned long top = LGUEST_GUEST_TOP;
	struct lguest_device_desc *desc;

	desc = malloc(sizeof(*desc));
	if (!desc)
		err(1, "allocating device descriptor");

	desc->type = type;
	desc->num_pages = num_pages;
	desc->features = features;
	desc->status = 0;
	if (num_pages) {
		top -= num_pages*getpagesize();
		map_zeroed_pages(top, num_pages);
		desc->pfn = top / getpagesize();
	} else
		desc->pfn = 0;
	return desc;
}
/* Create a device, append it to @devices and return it.
 *
 * @type, @num_pages, @features: passed to new_dev_desc().
 * @fd:            descriptor associated with the device.
 * @handle_input:  if non-NULL, @fd is added to the watched set and this is
 *                 called when it becomes readable.
 * @watch_off:     offset from the start of the device's memory of the DMA key
 *                 that @handle_output services.
 * @handle_output: if non-NULL, called for DMA sent to that key.
 *
 * priv starts out NULL; callers needing private data set it afterwards.
 * Running out of memory is fatal. */
static struct device *new_device(struct device_list *devices,
				 u16 type, u16 num_pages, u16 features,
				 int fd,
				 bool (*handle_input)(int, struct device *),
				 unsigned long watch_off,
				 u32 (*handle_output)(int,
						      const struct iovec *,
						      unsigned,
						      struct device *))
{
	struct device *dev = malloc(sizeof(*dev));

	if (!dev)
		err(1, "allocating device");

	/* Append to device list. */
	*devices->lastdev = dev;
	dev->next = NULL;
	devices->lastdev = &dev->next;

	dev->fd = fd;
	if (handle_input)
		set_fd(dev->fd, devices);
	dev->desc = new_dev_desc(type, features, num_pages);
	dev->mem = (void *)(dev->desc->pfn * getpagesize());
	dev->handle_input = handle_input;
	dev->watch_key = (unsigned long)dev->mem + watch_off;
	dev->handle_output = handle_output;
	dev->priv = NULL;
	return dev;
}
static void setup_console(struct device_list *devices)
{
struct device *dev;
if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
struct termios term = orig_term;
term.c_lflag &= ~(ISIG|ICANON|ECHO);
tcsetattr(STDIN_FILENO, TCSANOW, &term);
atexit(restore_term);
}
/* We don't currently require a page for the console. */
dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
STDIN_FILENO, handle_console_input,
LGUEST_CONSOLE_DMA_KEY, handle_console_output);
dev->priv = malloc(sizeof(struct console_abort));
((struct console_abort *)dev->priv)->count = 0;
verbose("device %p: console\n",
(void *)(dev->desc->pfn * getpagesize()));
}
/* Create a block device backed by @filename.
 *
 * The file is opened read-write with O_DIRECT.  Its length, found by seeking
 * to the end, is kept in the device's private data, and the sector count
 * (length / 512) is published in the device page.  Failure to open the file,
 * to determine its size, or to allocate memory is fatal. */
static void setup_block_file(const char *filename, struct device_list *devices)
{
	int fd;
	struct device *dev;
	off64_t *device_len;
	struct lguest_block_page *p;

	fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
	dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
			 LGUEST_DEVICE_F_RANDOMNESS,
			 fd, NULL, 0, handle_block_output);

	device_len = malloc(sizeof(*device_len));
	if (!device_len)
		err(1, "allocating block device state");
	*device_len = lseek64(fd, 0, SEEK_END);
	if (*device_len < 0)
		err(1, "finding size of '%s'", filename);
	dev->priv = device_len;

	p = dev->mem;
	p->num_sectors = *device_len/512;
	verbose("device %p: block %i sectors\n",
		(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
}
/* We use fcntl locks to reserve network slots (autocleanup!)
 *
 * Tries to take a one-byte write lock at each offset 0, 1, 2, ... of @netfd
 * in turn, up to the number of "struct lguest_net" that fit in a page, and
 * returns the first offset that could be locked.  Finding none is fatal;
 * @filename is used only for that message. */
static unsigned int find_slot(int netfd, const char *filename)
{
	struct flock fl;

	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_len = 1;
	fl.l_start = 0;

	while (fl.l_start < getpagesize()/sizeof(struct lguest_net)) {
		if (fcntl(netfd, F_SETLK, &fl) == 0)
			return fl.l_start;
		fl.l_start++;
	}

	errx(1, "No free slots in network file %s", filename);
}
/* Create a network device shared with other guests through @filename.
 *
 * The file is created, and filled with one page of zeroes, if it does not
 * exist.  A free slot is reserved in it with find_slot() and the slot number
 * is stored in the descriptor's features.  The file's first page is then
 * mapped shared over the device page.
 *
 * The descriptor is deliberately left open: the slot reservation is an fcntl
 * lock taken on it.  Any failure is fatal, including an incomplete write when
 * initialising a new file, since mapping a short file would fault later. */
static void setup_net_file(const char *filename,
			   struct device_list *devices)
{
	int netfd;
	struct device *dev;

	netfd = open(filename, O_RDWR, 0);
	if (netfd < 0) {
		if (errno == ENOENT) {
			netfd = open(filename, O_RDWR|O_CREAT, 0600);
			if (netfd >= 0) {
				char page[getpagesize()];

				memset(page, 0, sizeof(page));
				if (write(netfd, page, sizeof(page))
				    != (ssize_t)sizeof(page))
					err(1, "initializing net file '%s'",
					    filename);
			}
		}
		if (netfd < 0)
			err(1, "cannot open net file '%s'", filename);
	}

	dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
			 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
			 -1, NULL, 0, NULL);

	/* We overwrite the /dev/zero mapping with the actual file. */
	if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
		 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
		err(1, "could not mmap '%s'", filename);
	verbose("device %p: shared net %s, peer %i\n",
		(void *)(dev->desc->pfn * getpagesize()), filename,
		dev->desc->features & ~LGUEST_NET_F_NOCSUM);
}
/* Parse a dotted-quad IPv4 address and return it in host byte order, first
 * octet in the most significant byte.
 *
 * The string must begin with four decimal numbers separated by dots, each no
 * greater than 255; anything else is fatal.  Text after the fourth number is
 * ignored. */
static u32 str2ip(const char *ipaddr)
{
	unsigned int byte[4];
	unsigned int i;

	if (sscanf(ipaddr, "%u.%u.%u.%u",
		   &byte[0], &byte[1], &byte[2], &byte[3]) != 4)
		errx(1, "Invalid IP address '%s'", ipaddr);

	for (i = 0; i < 4; i++)
		if (byte[i] > 255)
			errx(1, "Invalid IP address '%s'", ipaddr);

	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
}
/* adapted from libbridge
 *
 * Add the interface @if_name to the existing bridge @br_name, using the
 * SIOCBRADDIF ioctl on socket @fd.
 *
 * The request is zeroed before use and the bridge name is rejected if it
 * would not fit, NUL included, in an interface-name field.  An empty bridge
 * name, an unknown interface, or a failed ioctl is fatal. */
static void add_to_bridge(int fd, const char *if_name, const char *br_name)
{
	int ifidx;
	struct ifreq ifr;

	if (!*br_name)
		errx(1, "must specify bridge name");
	if (strlen(br_name) >= IFNAMSIZ)
		errx(1, "bridge name '%s' is too long", br_name);

	ifidx = if_nametoindex(if_name);
	if (!ifidx)
		errx(1, "interface %s does not exist!", if_name);

	memset(&ifr, 0, sizeof(ifr));
	/* Length checked above; the memset supplies the terminator. */
	memcpy(ifr.ifr_name, br_name, strlen(br_name));
	ifr.ifr_ifindex = ifidx;
	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
		err(1, "can't add %s to bridge %s", if_name, br_name);
}
/* Configure the network interface @devname through socket @fd: assign it the
 * IPv4 address @ipaddr (host byte order), bring it up, and copy its six-byte
 * hardware address into @hwaddr.  Any failing ioctl is fatal. */
static void configure_device(int fd, const char *devname, u32 ipaddr,
			     unsigned char hwaddr[6])
{
	struct ifreq ifr;
	struct sockaddr_in *addr;

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, devname);

	/* The same request structure is reused for each of the three calls. */
	addr = (struct sockaddr_in *)&ifr.ifr_addr;
	addr->sin_family = AF_INET;
	addr->sin_addr.s_addr = htonl(ipaddr);
	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
		err(1, "Setting %s interface address", devname);

	ifr.ifr_flags = IFF_UP;
	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
		err(1, "Bringing interface %s up", devname);

	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
		err(1, "getting hw address for %s", devname);
	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
}
/* Create a network device backed by a host tap interface.
 *
 * @arg is either an IPv4 address to give the host side of the tap device, or
 * "bridge:<name>" to attach the tap device to an existing bridge, in which
 * case the address used is INADDR_ANY.
 *
 * The device's private data is a flag recording whether the guest has sent
 * any output yet (see handle_tun_output()).  The tap interface's hardware
 * address is written at the start of the device page.  Failure to set up the
 * tap device or to allocate memory is fatal. */
static void setup_tun_net(const char *arg, struct device_list *devices)
{
	struct device *dev;
	struct ifreq ifr;
	int netfd, ipfd;
	u32 ip;
	const char *br_name = NULL;
	bool *seen_output;

	netfd = open_or_die("/dev/net/tun", O_RDWR);
	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
	strcpy(ifr.ifr_name, "tap%d");
	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
		err(1, "configuring /dev/net/tun");
	/* Result deliberately ignored, as before. */
	ioctl(netfd, TUNSETNOCSUM, 1);

	/* You will be peer 1: we should create enough jitter to randomize */
	dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
			 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
			 handle_tun_input, peer_offset(0), handle_tun_output);

	seen_output = malloc(sizeof(*seen_output));
	if (!seen_output)
		err(1, "allocating tun device state");
	*seen_output = false;
	dev->priv = seen_output;

	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	if (ipfd < 0)
		err(1, "opening IP socket");

	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
		ip = INADDR_ANY;
		br_name = arg + strlen(BRIDGE_PFX);
		add_to_bridge(ipfd, ifr.ifr_name, br_name);
	} else
		ip = str2ip(arg);

	/* We are peer 0, ie. first slot. */
	configure_device(ipfd, ifr.ifr_name, ip, dev->mem);

	/* Set "promisc" bit: we want every single packet. */
	*((u8 *)dev->mem) |= 0x1;

	close(ipfd);

	verbose("device %p: tun net %u.%u.%u.%u\n",
		(void *)(dev->desc->pfn * getpagesize()),
		(u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
	if (br_name)
		verbose("attached to bridge: %s\n", br_name);
}
/* Now we know how much memory we have, we copy in device descriptors.
 *
 * A zeroed page is mapped at @mem and every device's heap-allocated
 * descriptor is copied into it, in list order.  The heap copy is freed and
 * the device is pointed at its slot in the page.  Having more than
 * LGUEST_MAX_DEVICES devices is fatal. */
static void map_device_descriptors(struct device_list *devs, unsigned long mem)
{
	/* Device descriptor array sits just above top of normal memory */
	struct lguest_device_desc *descs = map_zeroed_pages(mem, 1);
	struct device *dev;
	unsigned int num = 0;

	for (dev = devs->dev; dev; dev = dev->next) {
		const char *kind;

		if (num == LGUEST_MAX_DEVICES)
			errx(1, "too many devices");

		if (dev->desc->type == LGUEST_DEVICE_T_NET)
			kind = "net";
		else if (dev->desc->type == LGUEST_DEVICE_T_CONSOLE)
			kind = "console";
		else if (dev->desc->type == LGUEST_DEVICE_T_BLOCK)
			kind = "block";
		else
			kind = "unknown";
		verbose("Device %i: %s\n", num, kind);

		descs[num] = *dev->desc;
		free(dev->desc);
		dev->desc = &descs[num];
		num++;
	}
}
/* Run the guest until it dies; never returns.
 *
 * Each read() of @lguest_fd runs the guest.  A full-sized result is a DMA the
 * guest sent (address and key) and is handed to handle_output().  Otherwise
 * errno says why the guest stopped: ENOENT means it has died, and the reason
 * text is read and reported; EAGAIN means input is pending, so the input
 * handlers run and the break is reset; anything else is fatal. */
static void __attribute__((noreturn))
run_guest(int lguest_fd, struct device_list *device_list)
{
	for (;;) {
		u32 args[] = { LHREQ_BREAK, 0 };
		unsigned long arr[2];
		/* We read from the /dev/lguest device to run the Guest. */
		int readval = read(lguest_fd, arr, sizeof(arr));

		if (readval == sizeof(arr)) {
			handle_output(lguest_fd, arr[0], arr[1], device_list);
			continue;
		}

		if (errno == ENOENT) {
			char reason[1024] = { 0 };

			read(lguest_fd, reason, sizeof(reason)-1);
			errx(1, "%s", reason);
		}
		if (errno != EAGAIN)
			err(1, "Running guest failed");

		handle_input(lguest_fd, device_list);
		if (write(lguest_fd, args, sizeof(args)) < 0)
			err(1, "Resetting break");
	}
}
/* Long options accepted by main(); each maps to the single character that
 * the option switch there acts upon.  Only --verbose takes no argument. */
static struct option opts[] = {
	{ .name = "verbose",  .has_arg = no_argument,       .flag = NULL, .val = 'v' },
	{ .name = "sharenet", .has_arg = required_argument, .flag = NULL, .val = 's' },
	{ .name = "tunnet",   .has_arg = required_argument, .flag = NULL, .val = 't' },
	{ .name = "block",    .has_arg = required_argument, .flag = NULL, .val = 'b' },
	{ .name = "initrd",   .has_arg = required_argument, .flag = NULL, .val = 'i' },
	{ .name = NULL,       .has_arg = 0,                 .flag = NULL, .val = 0 },
};
/* Print the command-line synopsis and exit with status 1. */
static void usage(void)
{
	static const char synopsis[] =
		"Usage: lguest [--verbose] "
		"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
		"|--block=<filename>|--initrd=<filename>]...\n"
		"<mem-in-mb> vmlinux [args...]";

	errx(1, "%s", synopsis);
}
int main(int argc, char *argv[])
{
unsigned long mem, pgdir, start, page_offset, initrd_size = 0;
int c, lguest_fd;
struct device_list device_list;
void *boot = (void *)0;
const char *initrd_name = NULL;
device_list.max_infd = -1;
device_list.dev = NULL;
device_list.lastdev = &device_list.dev;
FD_ZERO(&device_list.infds);
while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
switch (c) {
case 'v':
verbose = true;
break;
case 's':
setup_net_file(optarg, &device_list);
break;
case 't':
setup_tun_net(optarg, &device_list);
break;
case 'b':
setup_block_file(optarg, &device_list);
break;
case 'i':
initrd_name = optarg;
break;
default:
warnx("Unknown argument %s", argv[optind]);
usage();
}
}
if (optind + 2 > argc)
usage();
/* We need a console device */
setup_console(&device_list);
/* First we map /dev/zero over all of guest-physical memory. */
mem = atoi(argv[optind]) * 1024 * 1024;
map_zeroed_pages(0, mem / getpagesize());
/* Now we load the kernel */
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
&page_offset);
/* Write the device descriptors into memory. */
map_device_descriptors(&device_list, mem);
/* Map the initrd image if requested */
if (initrd_name) {
initrd_size = load_initrd(initrd_name, mem);
*(unsigned long *)(boot+0x218) = mem - initrd_size;
*(unsigned long *)(boot+0x21c) = initrd_size;
*(unsigned char *)(boot+0x210) = 0xFF;
}
/* Set up the initial linar pagetables. */
pgdir = setup_pagetables(mem, initrd_size, page_offset);
/* E820 memory map: ours is a simple, single region. */
*(char*)(boot+E820NR) = 1;
*((struct e820entry *)(boot+E820MAP))
= ((struct e820entry) { 0, mem, E820_RAM });
/* Command line pointer and command line (at 4096) */
*(void **)(boot + 0x228) = boot + 4096;
concat(boot + 4096, argv+optind+2);
/* Paravirt type: 1 == lguest */
*(int *)(boot + 0x23c) = 1;
lguest_fd = tell_kernel(pgdir, start, page_offset);
waker_fd = setup_waker(lguest_fd, &device_list);
run_guest(lguest_fd, &device_list);
}
Rusty's Remarkably Unreliable Guide to Lguest
- or, A Young Coder's Illustrated Hypervisor
http://lguest.ozlabs.org
Lguest is designed to be a minimal hypervisor for the Linux kernel, for
Linux developers and users to experiment with virtualization with the
minimum of complexity. Nonetheless, it should have sufficient
features to make it useful for specific tasks, and, of course, you are
encouraged to fork and enhance it.
Features:
- Kernel module which runs in a normal kernel.
- Simple I/O model for communication.
- Simple program to create new guests.
- Logo contains cute puppies: http://lguest.ozlabs.org
Developer features:
- Fun to hack on.
- No ABI: being tied to a specific kernel anyway, you can change anything.
- Many opportunities for improvement or feature implementation.
Running Lguest:
- Lguest runs the same kernel as guest and host. You can configure
them differently, but usually it's easiest not to.
You will need to configure your kernel with the following options:
CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
CONFIG_LGUEST=y/m ("Linux hypervisor example code")
and I recommend:
CONFIG_HZ=100 ("Timer frequency")[2]
- A tool called "lguest" is available in this directory: type "make"
to build it. If you didn't build your kernel in-tree, use "make
O=<builddir>".
- Create or find a root disk image. There are several useful ones
around, such as the xm-test tiny root image at
http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
For more serious work, I usually use a distribution ISO image and
install it under qemu, then make multiple copies:
dd if=/dev/zero of=rootfile bs=1M count=2048
qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
- "modprobe lg" if you built it as a module.
- Run an lguest as root:
Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
Explanation:
64m: the amount of memory to use, in megabytes.
vmlinux: the kernel image found in the top of your build directory. You
can also use a standard bzImage.
--tunnet=192.168.19.1: configures a "tap" device for networking with this
IP address.
--block=rootfile: a file or block device which becomes /dev/lgba
inside the guest.
root=/dev/lgba: this (and anything else after the kernel image on the
command line) is passed to the guest as kernel boot parameters.
- Configuring networking. I usually have the host masquerade, using
"iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
/proc/sys/net/ipv4/ip_forward". In this example, I would configure
eth0 inside the guest at 192.168.19.2.
Another method is to bridge the tap device to an external interface
using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
to obtain an IP address. The bridge needs to be configured first:
this option simply adds the tap interface to it.
A simple example on my system:
ifconfig eth0 0.0.0.0
brctl addbr lg0
ifconfig lg0 up
brctl addif lg0 eth0
dhclient lg0
Then use --tunnet=bridge:lg0 when launching the guest.
See http://linux-net.osdl.org/index.php/Bridge for general information
on how to get bridging working.
- You can also create an inter-guest network using
"--sharenet=<filename>": any two guests using the same file are on
the same network. This file is created if it does not exist.
Lguest I/O model:
Lguest uses a simplified DMA model plus shared memory for I/O. Guests
can communicate with each other if they share underlying memory
(usually by the lguest program mmaping the same file), but they can
use any non-shared memory to communicate with the lguest process.
Guests can register DMA buffers at any key (must be a valid physical
address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
hypercall. "dmabufs" is the physical address of an array of "num"
"struct lguest_dma": each contains a used_len, and an array of
physical addresses and lengths. When a transfer occurs, the
"used_len" field of one of the buffers which has used_len 0 will be
set to the length transferred and the irq will fire.
Using an irq value of 0 unbinds the dma buffers.
To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
and the number of bytes used is written to the used_len field. This can be
0 if no one else has bound a DMA buffer to that key, or on some other error.
DMA buffers bound by the same guest are ignored.
Cheers!
Rusty Russell rusty@rustcorp.com.au.
[1] These are in various places on the TODO list, waiting for you to
get annoyed enough at the limitation to fix it.
[2] Lguest is not yet tickless when idle. See [1].
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment