Commit 10682799 authored by Shane Hansen's avatar Shane Hansen Committed by David Symonds

archive/tar: read/write extended pax/gnu tar archives

Support reading pax archives and applying extended attributes
like long names, subsecond mtime resolutions, etc. Default to
writing pax archives for long file and link names.
Support reading gnu archives using the ././@LongLink extended
header for file name and link target.

Fixes #3300

R=dsymonds, dave, rogpeppe, remyoudompheng, chressie, rsc
CC=golang-dev
https://golang.org/cl/6700047
parent fece09e5
......@@ -9,6 +9,7 @@
// References:
// http://www.freebsd.org/cgi/man.cgi?query=tar&sektion=5
// http://www.gnu.org/software/tar/manual/html_node/Standard.html
// http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html
package tar
import (
......@@ -33,6 +34,8 @@ const (
TypeCont = '7' // reserved
TypeXHeader = 'x' // extended header
TypeXGlobalHeader = 'g' // global extended header
TypeGNULongName = 'L' // Next file has a long name
TypeGNULongLink = 'K' // Next file symlinks to a file w/ a long name
)
// A Header represents a single header in a tar archive.
......@@ -54,6 +57,12 @@ type Header struct {
ChangeTime time.Time // status change time
}
// File name constants from the tar spec.
const (
fileNameSize = 100 // Maximum number of bytes in a standard tar name.
fileNamePrefixSize = 155 // Maximum number of ustar extension bytes.
)
// sysStat, if non-nil, populates h from system-dependent fields of fi.
var sysStat func(fi os.FileInfo, h *Header) error
......
......@@ -14,6 +14,7 @@ import (
"io/ioutil"
"os"
"strconv"
"strings"
"time"
)
......@@ -21,6 +22,8 @@ var (
ErrHeader = errors.New("archive/tar: invalid tar header")
)
const maxNanoSecondIntSize = 9
// A Reader provides sequential access to the contents of a tar archive.
// A tar archive consists of a sequence of files.
// The Next method advances to the next file in the archive (including the first),
......@@ -41,13 +44,183 @@ func (tr *Reader) Next() (*Header, error) {
if tr.err == nil {
tr.skipUnread()
}
if tr.err == nil {
if tr.err != nil {
return hdr, tr.err
}
hdr = tr.readHeader()
if hdr == nil {
return hdr, tr.err
}
// Check for PAX/GNU header.
switch hdr.Typeflag {
case TypeXHeader:
// PAX extended header
headers, err := parsePAX(tr)
if err != nil {
return nil, err
}
// We actually read the whole file,
// but this skips alignment padding
tr.skipUnread()
hdr = tr.readHeader()
mergePAX(hdr, headers)
return hdr, nil
case TypeGNULongName:
// We have a GNU long name header. Its contents are the real file name.
realname, err := ioutil.ReadAll(tr)
if err != nil {
return nil, err
}
hdr, err := tr.Next()
hdr.Name = cString(realname)
return hdr, err
case TypeGNULongLink:
// We have a GNU long link header.
realname, err := ioutil.ReadAll(tr)
if err != nil {
return nil, err
}
hdr, err := tr.Next()
hdr.Linkname = cString(realname)
return hdr, err
}
return hdr, tr.err
}
// Parse bytes as a NUL-terminated C-style string.
// mergePAX merges well known headers according to PAX standard.
// In general headers with the same name as those found
// in the header struct overwrite those found in the header
// struct with higher precision or longer values. Esp. useful
// for name and linkname fields.
func mergePAX(hdr *Header, headers map[string]string) error {
for k, v := range headers {
switch k {
case "path":
hdr.Name = v
case "linkpath":
hdr.Linkname = v
case "gname":
hdr.Gname = v
case "uname":
hdr.Uname = v
case "uid":
uid, err := strconv.ParseInt(v, 10, 0)
if err != nil {
return err
}
hdr.Uid = int(uid)
case "gid":
gid, err := strconv.ParseInt(v, 10, 0)
if err != nil {
return err
}
hdr.Gid = int(gid)
case "atime":
t, err := parsePAXTime(v)
if err != nil {
return err
}
hdr.AccessTime = t
case "mtime":
t, err := parsePAXTime(v)
if err != nil {
return err
}
hdr.ModTime = t
case "ctime":
t, err := parsePAXTime(v)
if err != nil {
return err
}
hdr.ChangeTime = t
case "size":
size, err := strconv.ParseInt(v, 10, 0)
if err != nil {
return err
}
hdr.Size = int64(size)
}
}
return nil
}
// parsePAXTime takes a string of the form %d.%d as described in
// the PAX specification.
func parsePAXTime(t string) (time.Time, error) {
buf := []byte(t)
pos := bytes.IndexByte(buf, '.')
var seconds, nanoseconds int64
var err error
if pos == -1 {
seconds, err = strconv.ParseInt(t, 10, 0)
if err != nil {
return time.Time{}, err
}
} else {
seconds, err = strconv.ParseInt(string(buf[:pos]), 10, 0)
if err != nil {
return time.Time{}, err
}
nano_buf := string(buf[pos+1:])
// Pad as needed before converting to a decimal.
// For example .030 -> .030000000 -> 30000000 nanoseconds
if len(nano_buf) < maxNanoSecondIntSize {
// Right pad
nano_buf += strings.Repeat("0", maxNanoSecondIntSize-len(nano_buf))
} else if len(nano_buf) > maxNanoSecondIntSize {
// Right truncate
nano_buf = nano_buf[:maxNanoSecondIntSize]
}
nanoseconds, err = strconv.ParseInt(string(nano_buf), 10, 0)
if err != nil {
return time.Time{}, err
}
}
ts := time.Unix(seconds, nanoseconds)
return ts, nil
}
// parsePAX parses PAX headers.
// If an extended header (type 'x') is invalid, ErrHeader is returned
func parsePAX(r io.Reader) (map[string]string, error) {
buf, err := ioutil.ReadAll(r)
if err != nil {
return nil, err
}
headers := make(map[string]string)
// Each record is constructed as
// "%d %s=%s\n", length, keyword, value
for len(buf) > 0 {
// or the header was empty to start with.
var sp int
// The size field ends at the first space.
sp = bytes.IndexByte(buf, ' ')
if sp == -1 {
return nil, ErrHeader
}
// Parse the first token as a decimal integer.
n, err := strconv.ParseInt(string(buf[:sp]), 10, 0)
if err != nil {
return nil, ErrHeader
}
// Extract everything between the decimal and the n -1 on the
// beginning to to eat the ' ', -1 on the end to skip the newline.
var record []byte
record, buf = buf[sp+1:n-1], buf[n:]
// The first equals is guaranteed to mark the end of the key.
// Everything else is value.
eq := bytes.IndexByte(record, '=')
if eq == -1 {
return nil, ErrHeader
}
key, value := record[:eq], record[eq+1:]
headers[string(key)] = string(value)
}
return headers, nil
}
// cString parses bytes as a NUL-terminated C-style string.
// If a NUL byte is not found then the whole slice is returned as a string.
func cString(b []byte) string {
n := 0
......@@ -85,7 +258,7 @@ func (tr *Reader) octal(b []byte) int64 {
return int64(x)
}
// Skip any unread bytes in the existing file entry, as well as any alignment padding.
// skipUnread skips any unread bytes in the existing file entry, as well as any alignment padding.
func (tr *Reader) skipUnread() {
nr := tr.nb + tr.pad // number of bytes to skip
tr.nb, tr.pad = 0, 0
......
......@@ -10,6 +10,8 @@ import (
"fmt"
"io"
"os"
"reflect"
"strings"
"testing"
"time"
)
......@@ -108,6 +110,38 @@ var untarTests = []*untarTest{
},
},
},
{
file: "testdata/pax.tar",
headers: []*Header{
{
Name: "a/123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100",
Mode: 0664,
Uid: 1000,
Gid: 1000,
Uname: "shane",
Gname: "shane",
Size: 7,
ModTime: time.Unix(1350244992, 23960108),
ChangeTime: time.Unix(1350244992, 23960108),
AccessTime: time.Unix(1350244992, 23960108),
Typeflag: TypeReg,
},
{
Name: "a/b",
Mode: 0777,
Uid: 1000,
Gid: 1000,
Uname: "shane",
Gname: "shane",
Size: 0,
ModTime: time.Unix(1350266320, 910238425),
ChangeTime: time.Unix(1350266320, 910238425),
AccessTime: time.Unix(1350266320, 910238425),
Typeflag: TypeSymlink,
Linkname: "123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100",
},
},
},
}
func TestReader(t *testing.T) {
......@@ -133,7 +167,7 @@ testLoop:
}
hdr, err := tr.Next()
if err == io.EOF {
break
continue testLoop
}
if hdr != nil || err != nil {
t.Errorf("test %d: Unexpected entry or error: hdr=%v err=%v", i, hdr, err)
......@@ -260,3 +294,73 @@ func TestNonSeekable(t *testing.T) {
t.Errorf("Didn't process all files\nexpected: %d\nprocessed %d\n", len(test.headers), nread)
}
}
func TestParsePAXHeader(t *testing.T) {
paxTests := [][3]string{
{"a", "a=name", "10 a=name\n"}, // Test case involving multiple acceptable lengths
{"a", "a=name", "9 a=name\n"}, // Test case involving multiple acceptable length
{"mtime", "mtime=1350244992.023960108", "30 mtime=1350244992.023960108\n"}}
for _, test := range paxTests {
key, expected, raw := test[0], test[1], test[2]
reader := bytes.NewBuffer([]byte(raw))
headers, err := parsePAX(reader)
if err != nil {
t.Errorf("Couldn't parse correctly formatted headers: %v", err)
continue
}
if strings.EqualFold(headers[key], expected) {
t.Errorf("mtime header incorrectly parsed: got %s, wanted %s", headers[key], expected)
continue
}
trailer := make([]byte, 100)
n, err := reader.Read(trailer)
if err != io.EOF || n != 0 {
t.Error("Buffer wasn't consumed")
}
}
badHeader := bytes.NewBuffer([]byte("3 somelongkey="))
if _, err := parsePAX(badHeader); err != ErrHeader {
t.Fatal("Unexpected success when parsing bad header")
}
}
func TestParsePAXTime(t *testing.T) {
// Some valid PAX time values
timestamps := map[string]time.Time{
"1350244992.023960108": time.Unix(1350244992, 23960108), // The commoon case
"1350244992.02396010": time.Unix(1350244992, 23960100), // Lower precision value
"1350244992.0239601089": time.Unix(1350244992, 23960108), // Higher precision value
"1350244992": time.Unix(1350244992, 0), // Low precision value
}
for input, expected := range timestamps {
ts, err := parsePAXTime(input)
if err != nil {
t.Fatal(err)
}
if !ts.Equal(expected) {
t.Fatal("Time parsing failure %s %s", ts, expected)
}
}
}
func TestMergePAX(t *testing.T) {
hdr := new(Header)
// Test a string, integer, and time based value.
headers := map[string]string{
"path": "a/b/c",
"uid": "1000",
"mtime": "1350244992.023960108",
}
err := mergePAX(hdr, headers)
if err != nil {
t.Fatal(err)
}
want := &Header{
Name: "a/b/c",
Uid: 1000,
ModTime: time.Unix(1350244992, 23960108),
}
if !reflect.DeepEqual(hdr, want) {
t.Errorf("incorrect merge: got %+v, want %+v", hdr, want)
}
}
......@@ -8,10 +8,14 @@ package tar
// - catch more errors (no first header, etc.)
import (
"bytes"
"errors"
"fmt"
"io"
"os"
"path"
"strconv"
"strings"
"time"
)
......@@ -19,6 +23,7 @@ var (
ErrWriteTooLong = errors.New("archive/tar: write too long")
ErrFieldTooLong = errors.New("archive/tar: header field too long")
ErrWriteAfterClose = errors.New("archive/tar: write after close")
errNameTooLong = errors.New("archive/tar: name too long")
)
// A Writer provides sequential writing of a tar archive in POSIX.1 format.
......@@ -119,15 +124,31 @@ func (tw *Writer) WriteHeader(hdr *Header) error {
if tw.err != nil {
return tw.err
}
// Decide whether or not to use PAX extensions
// TODO(shanemhansen): we might want to use PAX headers for
// subsecond time resolution, but for now let's just capture
// the long name/long symlink use case.
suffix := hdr.Name
prefix := ""
if len(hdr.Name) > fileNameSize || len(hdr.Linkname) > fileNameSize {
var err error
prefix, suffix, err = tw.splitUSTARLongName(hdr.Name)
// Either we were unable to pack the long name into ustar format
// or the link name is too long; use PAX headers.
if err == errNameTooLong || len(hdr.Linkname) > fileNameSize {
if err := tw.writePAXHeader(hdr); err != nil {
return err
}
} else if err != nil {
return err
}
}
tw.nb = int64(hdr.Size)
tw.pad = -tw.nb & (blockSize - 1) // blockSize is a power of two
header := make([]byte, blockSize)
s := slicer(header)
// TODO(dsymonds): handle names longer than 100 chars
copy(s.next(100), []byte(hdr.Name))
tw.cString(s.next(fileNameSize), suffix)
// Handle out of range ModTime carefully.
var modTime int64
......@@ -148,11 +169,15 @@ func (tw *Writer) WriteHeader(hdr *Header) error {
tw.cString(s.next(32), hdr.Gname) // 297:329
tw.numeric(s.next(8), hdr.Devmajor) // 329:337
tw.numeric(s.next(8), hdr.Devminor) // 337:345
tw.cString(s.next(155), prefix) // 345:500
// Use the GNU magic instead of POSIX magic if we used any GNU extensions.
if tw.usedBinary {
copy(header[257:265], []byte("ustar \x00"))
}
// Use the ustar magic if we used ustar long names.
if len(prefix) > 0 {
copy(header[257:265], []byte("ustar\000"))
}
// The chksum field is terminated by a NUL and a space.
// This is different from the other octal fields.
......@@ -170,6 +195,78 @@ func (tw *Writer) WriteHeader(hdr *Header) error {
return tw.err
}
// writeUSTARLongName splits a USTAR long name hdr.Name.
// name must be < 256 characters. errNameTooLong is returned
// if hdr.Name can't be split. The splitting heuristic
// is compatible with gnu tar.
func (tw *Writer) splitUSTARLongName(name string) (prefix, suffix string, err error) {
length := len(name)
if length > fileNamePrefixSize+1 {
length = fileNamePrefixSize + 1
} else if name[length-1] == '/' {
length--
}
i := strings.LastIndex(name[:length], "/")
nlen := length - i - 1
if i <= 0 || nlen > fileNameSize || nlen == 0 {
err = errNameTooLong
return
}
prefix, suffix = name[:i], name[i+1:]
return
}
// writePaxHeader writes an extended pax header to the
// archive.
func (tw *Writer) writePAXHeader(hdr *Header) error {
// Prepare extended header
ext := new(Header)
ext.Typeflag = TypeXHeader
// Setting ModTime is required for reader parsing to
// succeed, and seems harmless enough.
ext.ModTime = hdr.ModTime
// The spec asks that we namespace our pseudo files
// with the current pid.
pid := os.Getpid()
dir, file := path.Split(hdr.Name)
ext.Name = path.Join(dir,
fmt.Sprintf("PaxHeaders.%d", pid), file)[0:100]
// Construct the body
var buf bytes.Buffer
if len(hdr.Name) > fileNameSize {
fmt.Fprint(&buf, paxHeader("path="+hdr.Name))
}
if len(hdr.Linkname) > fileNameSize {
fmt.Fprint(&buf, paxHeader("linkpath="+hdr.Linkname))
}
ext.Size = int64(len(buf.Bytes()))
if err := tw.WriteHeader(ext); err != nil {
return err
}
if _, err := tw.Write(buf.Bytes()); err != nil {
return err
}
if err := tw.Flush(); err != nil {
return err
}
return nil
}
// paxHeader formats a single pax record, prefixing it with the appropriate length
func paxHeader(msg string) string {
const padding = 2 // Extra padding for space and newline
size := len(msg) + padding
size += len(strconv.Itoa(size))
record := fmt.Sprintf("%d %s\n", size, msg)
if len(record) != size {
// Final adjustment if adding size increased
// the number of digits in size
size = len(record)
record = fmt.Sprintf("%d %s\n", size, msg)
}
return record
}
// Write writes to the current entry in the tar archive.
// Write returns the error ErrWriteTooLong if more than
// hdr.Size bytes are written after WriteHeader.
......
......@@ -9,6 +9,7 @@ import (
"fmt"
"io"
"io/ioutil"
"os"
"strings"
"testing"
"testing/iotest"
......@@ -101,6 +102,27 @@ var writerTests = []*writerTest{
},
},
},
// This file was produced using gnu tar 1.17
// gnutar -b 4 --format=ustar (longname/)*15 + file.txt
{
file: "testdata/ustar.tar",
entries: []*writerTestEntry{
{
header: &Header{
Name: strings.Repeat("longname/", 15) + "file.txt",
Mode: 0644,
Uid: 0765,
Gid: 024,
Size: 06,
ModTime: time.Unix(1360135598, 0),
Typeflag: '0',
Uname: "shane",
Gname: "staff",
},
contents: "hello\n",
},
},
},
}
// Render byte array in a two-character hexadecimal string, spaced for easy visual inspection.
......@@ -180,3 +202,61 @@ testLoop:
}
}
}
func TestPax(t *testing.T) {
// Create an archive with a large name
fileinfo, err := os.Stat("testdata/small.txt")
if err != nil {
t.Fatal(err)
}
hdr, err := FileInfoHeader(fileinfo, "")
if err != nil {
t.Fatalf("os.Stat: %v", err)
}
// Force a PAX long name to be written
longName := strings.Repeat("ab", 100)
contents := strings.Repeat(" ", int(hdr.Size))
hdr.Name = longName
var buf bytes.Buffer
writer := NewWriter(&buf)
if err := writer.WriteHeader(hdr); err != nil {
t.Fatal(err)
}
if _, err = writer.Write([]byte(contents)); err != nil {
t.Fatal(err)
}
if err := writer.Close(); err != nil {
t.Fatal(err)
}
// Simple test to make sure PAX extensions are in effect
if !bytes.Contains(buf.Bytes(), []byte("PaxHeaders.")) {
t.Fatal("Expected at least one PAX header to be written.")
}
// Test that we can get a long name back out of the archive.
reader := NewReader(&buf)
hdr, err = reader.Next()
if err != nil {
t.Fatal(err)
}
if hdr.Name != longName {
t.Fatal("Couldn't recover long file name")
}
}
func TestPAXHeader(t *testing.T) {
medName := strings.Repeat("CD", 50)
longName := strings.Repeat("AB", 100)
paxTests := [][2]string{
{"name=/etc/hosts", "19 name=/etc/hosts\n"},
{"a=b", "6 a=b\n"}, // Single digit length
{"a=names", "11 a=names\n"}, // Test case involving carries
{"name=" + longName, fmt.Sprintf("210 name=%s\n", longName)},
{"name=" + medName, fmt.Sprintf("110 name=%s\n", medName)}}
for _, test := range paxTests {
key, expected := test[0], test[1]
if result := paxHeader(key); result != expected {
t.Fatalf("paxHeader: got %s, expected %s", result, expected)
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment