misc/nacl: include parser.go for cmd/compile/internal/syntax tests

Fix suggested by Minux. Change-Id: Ia7aa8ccccc16535af4ec3ad23830ef0aa5d776ac Reviewed-on: https://go-review.googlesource.com/27193 Run-TryBot: Matthew Dempsky <mdempsky@google.com> Reviewed-by: Minux Ma <minux@golang.org>
cmd/compile/internal/syntax: match old parser errors and line numbers
2026-01-30 07:32:05 +03:00 · 2016-08-16 22:42:13 +00:00 · 2016-08-16 14:32:09 -07:00 · 2016-08-16 14:32:09 -07:00 · 2016-08-16 14:32:09 -07:00 · 2016-08-16 14:32:09 -07:00
221 changed files with 98597 additions and 10569 deletions
--- a/doc/contrib.html
+++ b/doc/contrib.html
@@ -34,6 +34,7 @@ We encourage all Go users to subscribe to
 <p>A <a href="/doc/devel/release.html">summary</a> of the changes between Go releases. Notes for the major releases:</p>

 <ul>
+	<li><a href="/doc/go1.7">Go 1.7</a> <small>(August 2016)</small></li>
 	<li><a href="/doc/go1.6">Go 1.6</a> <small>(February 2016)</small></li>
 	<li><a href="/doc/go1.5">Go 1.5</a> <small>(August 2015)</small></li>
 	<li><a href="/doc/go1.4">Go 1.4</a> <small>(December 2014)</small></li>
--- a/doc/devel/release.html
+++ b/doc/devel/release.html
@@ -30,6 +30,13 @@ to fix critical security problems in both Go 1.4 and Go 1.5 as they arise.
 See the <a href="/security">security policy</a> for more details.
 </p>

+<h2 id="go1.7">go1.7 (released 2016/08/15)</h2>
+
+<p>
+Go 1.7 is a major release of Go.
+Read the <a href="/doc/go1.7">Go 1.7 Release Notes</a> for more information.
+</p>
+
 <h2 id="go1.6">go1.6 (released 2016/02/17)</h2>

 <p>
--- a/doc/go1.7.html
+++ b/doc/go1.7.html
@@ -1,5 +1,5 @@
 <!--{
-	"Title": "Go 1.7 Release Notes DRAFT",
+	"Title": "Go 1.7 Release Notes",
 	"Path":  "/doc/go1.7",
 	"Template": true
 }-->
@@ -25,15 +25,6 @@ Do not send CLs removing the interior tags from such phrases.
 ul li { margin: 0.5em 0; }
 </style>

-<p>
-<!-- TODO: REMOVE THIS COMMENT -->
-<!-- TODO: Also remove "DRAFT" in the "Title" at the top of this file. -->
-<i>NOTE: This is a DRAFT of the Go 1.7 release notes, prepared for the Go 1.7 beta.
-Go 1.7 has NOT yet been released.
-By our regular schedule, it is expected some time in August 2016.
-</i>
-</p>
-
 <h2 id="introduction">Introduction to Go 1.7</h2>

 <p>
@@ -367,7 +358,7 @@ the code generation changes alone typically reduce program CPU time by 5-35%.
 </p>

 <p>
-<!-- git log &#45&#45grep '-[0-9][0-9]\.[0-9][0-9]%' go1.6.. -->
+<!-- git log -''-grep '-[0-9][0-9]\.[0-9][0-9]%' go1.6.. -->
 There have been significant optimizations bringing more than 10% improvements
 to implementations in the
 <a href="/pkg/crypto/sha1/"><code>crypto/sha1</code></a>,
@@ -562,10 +553,9 @@ The

 <dd>
 <p>
-As noted above,
-there are significant performance optimizations throughout the package.
+There are many performance optimizations throughout the package.
 Decompression speed is improved by about 10%,
-while compression speed for <code>DefaultCompression</code> is roughly doubled.
+while compression for <code>DefaultCompression</code> is twice as fast.
 </p>

 <p>
--- a/doc/install-source.html
+++ b/doc/install-source.html
@@ -203,7 +203,7 @@ To build without <code>cgo</code>, set the environment variable
 Change to the directory that will be its parent
 and make sure the <code>go</code> directory does not exist.
 Then clone the repository and check out the latest release tag
-(<code class="versionTag">go1.6</code>, for example):</p>
+(<code class="versionTag">go1.7</code>, for example):</p>

 <pre>
 $ git clone https://go.googlesource.com/go
@@ -391,7 +391,7 @@ New releases are announced on the
 <a href="//groups.google.com/group/golang-announce">golang-announce</a>
 mailing list.
 Each announcement mentions the latest release tag, for instance,
-<code class="versionTag">go1.6</code>.
+<code class="versionTag">go1.7</code>.
 </p>

 <p>
--- a/misc/nacl/testzip.proto
+++ b/misc/nacl/testzip.proto
@@ -18,6 +18,10 @@ go	src=..
 					asm
 						testdata
 							+
+			compile
+				internal
+					syntax
+						parser.go
 			doc
 				main.go
 				pkg.go
--- a/src/archive/tar/writer.go
+++ b/src/archive/tar/writer.go
@@ -317,7 +317,7 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
 	var buf bytes.Buffer

 	// Keys are sorted before writing to body to allow deterministic output.
-	var keys []string
+	keys := make([]string, 0, len(paxHeaders))
 	for k := range paxHeaders {
 		keys = append(keys, k)
 	}
--- a/src/bytes/example_test.go
+++ b/src/bytes/example_test.go
@@ -11,6 +11,7 @@ import (
 	"io"
 	"os"
 	"sort"
+	"unicode"
 )

 func ExampleBuffer() {
@@ -83,3 +84,205 @@ func ExampleTrimPrefix() {
 	fmt.Printf("Hello%s", b)
 	// Output: Hello, world!
 }
+
+func ExampleFields() {
+	fmt.Printf("Fields are: %q", bytes.Fields([]byte("  foo bar  baz   ")))
+	// Output: Fields are: ["foo" "bar" "baz"]
+}
+
+func ExampleFieldsFunc() {
+	f := func(c rune) bool {
+		return !unicode.IsLetter(c) && !unicode.IsNumber(c)
+	}
+	fmt.Printf("Fields are: %q", bytes.FieldsFunc([]byte("  foo1;bar2,baz3..."), f))
+	// Output: Fields are: ["foo1" "bar2" "baz3"]
+}
+
+func ExampleContains() {
+	fmt.Println(bytes.Contains([]byte("seafood"), []byte("foo")))
+	fmt.Println(bytes.Contains([]byte("seafood"), []byte("bar")))
+	fmt.Println(bytes.Contains([]byte("seafood"), []byte("")))
+	fmt.Println(bytes.Contains([]byte(""), []byte("")))
+	// Output:
+	// true
+	// false
+	// true
+	// true
+}
+
+func ExampleCount() {
+	fmt.Println(bytes.Count([]byte("cheese"), []byte("e")))
+	fmt.Println(bytes.Count([]byte("five"), []byte(""))) // before & after each rune
+	// Output:
+	// 3
+	// 5
+}
+
+func ExampleEqualFold() {
+	fmt.Println(bytes.EqualFold([]byte("Go"), []byte("go")))
+	// Output: true
+}
+
+func ExampleHasPrefix() {
+	fmt.Println(bytes.HasPrefix([]byte("Gopher"), []byte("Go")))
+	fmt.Println(bytes.HasPrefix([]byte("Gopher"), []byte("C")))
+	fmt.Println(bytes.HasPrefix([]byte("Gopher"), []byte("")))
+	// Output:
+	// true
+	// false
+	// true
+}
+
+func ExampleHasSuffix() {
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("go")))
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("O")))
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("Ami")))
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("")))
+	// Output:
+	// true
+	// false
+	// false
+	// true
+}
+
+func ExampleIndex() {
+	fmt.Println(bytes.Index([]byte("chicken"), []byte("ken")))
+	fmt.Println(bytes.Index([]byte("chicken"), []byte("dmr")))
+	// Output:
+	// 4
+	// -1
+}
+
+func ExampleIndexFunc() {
+	f := func(c rune) bool {
+		return unicode.Is(unicode.Han, c)
+	}
+	fmt.Println(bytes.IndexFunc([]byte("Hello, 世界"), f))
+	fmt.Println(bytes.IndexFunc([]byte("Hello, world"), f))
+	// Output:
+	// 7
+	// -1
+}
+
+func ExampleIndexAny() {
+	fmt.Println(bytes.IndexAny([]byte("chicken"), "aeiouy"))
+	fmt.Println(bytes.IndexAny([]byte("crwth"), "aeiouy"))
+	// Output:
+	// 2
+	// -1
+}
+
+func ExampleIndexRune() {
+	fmt.Println(bytes.IndexRune([]byte("chicken"), 'k'))
+	fmt.Println(bytes.IndexRune([]byte("chicken"), 'd'))
+	// Output:
+	// 4
+	// -1
+}
+
+func ExampleLastIndex() {
+	fmt.Println(bytes.Index([]byte("go gopher"), []byte("go")))
+	fmt.Println(bytes.LastIndex([]byte("go gopher"), []byte("go")))
+	fmt.Println(bytes.LastIndex([]byte("go gopher"), []byte("rodent")))
+	// Output:
+	// 0
+	// 3
+	// -1
+}
+
+func ExampleJoin() {
+	s := [][]byte{[]byte("foo"), []byte("bar"), []byte("baz")}
+	fmt.Printf("%s", bytes.Join(s, []byte(", ")))
+	// Output: foo, bar, baz
+}
+
+func ExampleRepeat() {
+	fmt.Printf("ba%s", bytes.Repeat([]byte("na"), 2))
+	// Output: banana
+}
+
+func ExampleReplace() {
+	fmt.Printf("%s\n", bytes.Replace([]byte("oink oink oink"), []byte("k"), []byte("ky"), 2))
+	fmt.Printf("%s\n", bytes.Replace([]byte("oink oink oink"), []byte("oink"), []byte("moo"), -1))
+	// Output:
+	// oinky oinky oink
+	// moo moo moo
+}
+
+func ExampleSplit() {
+	fmt.Printf("%q\n", bytes.Split([]byte("a,b,c"), []byte(",")))
+	fmt.Printf("%q\n", bytes.Split([]byte("a man a plan a canal panama"), []byte("a ")))
+	fmt.Printf("%q\n", bytes.Split([]byte(" xyz "), []byte("")))
+	fmt.Printf("%q\n", bytes.Split([]byte(""), []byte("Bernardo O'Higgins")))
+	// Output:
+	// ["a" "b" "c"]
+	// ["" "man " "plan " "canal panama"]
+	// [" " "x" "y" "z" " "]
+	// [""]
+}
+
+func ExampleSplitN() {
+	fmt.Printf("%q\n", bytes.SplitN([]byte("a,b,c"), []byte(","), 2))
+	z := bytes.SplitN([]byte("a,b,c"), []byte(","), 0)
+	fmt.Printf("%q (nil = %v)\n", z, z == nil)
+	// Output:
+	// ["a" "b,c"]
+	// [] (nil = true)
+}
+
+func ExampleSplitAfter() {
+	fmt.Printf("%q\n", bytes.SplitAfter([]byte("a,b,c"), []byte(",")))
+	// Output: ["a," "b," "c"]
+}
+
+func ExampleSplitAfterN() {
+	fmt.Printf("%q\n", bytes.SplitAfterN([]byte("a,b,c"), []byte(","), 2))
+	// Output: ["a," "b,c"]
+}
+
+func ExampleTitle() {
+	fmt.Printf("%s", bytes.Title([]byte("her royal highness")))
+	// Output: Her Royal Highness
+}
+
+func ExampleToTitle() {
+	fmt.Printf("%s\n", bytes.ToTitle([]byte("loud noises")))
+	fmt.Printf("%s\n", bytes.ToTitle([]byte("хлеб")))
+	// Output:
+	// LOUD NOISES
+	// ХЛЕБ
+}
+
+func ExampleTrim() {
+	fmt.Printf("[%q]", bytes.Trim([]byte(" !!! Achtung! Achtung! !!! "), "! "))
+	// Output: ["Achtung! Achtung"]
+}
+
+func ExampleMap() {
+	rot13 := func(r rune) rune {
+		switch {
+		case r >= 'A' && r <= 'Z':
+			return 'A' + (r-'A'+13)%26
+		case r >= 'a' && r <= 'z':
+			return 'a' + (r-'a'+13)%26
+		}
+		return r
+	}
+	fmt.Printf("%s", bytes.Map(rot13, []byte("'Twas brillig and the slithy gopher...")))
+	// Output: 'Gjnf oevyyvt naq gur fyvgul tbcure...
+}
+
+func ExampleTrimSpace() {
+	fmt.Printf("%s", bytes.TrimSpace([]byte(" \t\n a lone gopher \n\t\r\n")))
+	// Output: a lone gopher
+}
+
+func ExampleToUpper() {
+	fmt.Printf("%s", bytes.ToUpper([]byte("Gopher")))
+	// Output: GOPHER
+}
+
+func ExampleToLower() {
+	fmt.Printf("%s", bytes.ToLower([]byte("Gopher")))
+	// Output: gopher
+}
--- a/src/cmd/asm/internal/asm/asm.go
+++ b/src/cmd/asm/internal/asm/asm.go
@@ -403,7 +403,7 @@ func (p *Parser) asmJump(op obj.As, cond string, a []obj.Addr) {

 		fallthrough
 	default:
-		p.errorf("wrong number of arguments to %s instruction", obj.Aconv(op))
+		p.errorf("wrong number of arguments to %s instruction", op)
 		return
 	}
 	switch {
@@ -476,7 +476,7 @@ func (p *Parser) branch(jmp, target *obj.Prog) {
 // asmInstruction assembles an instruction.
 // MOVW R9, (R10)
 func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
-	// fmt.Printf("%s %+v\n", obj.Aconv(op), a)
+	// fmt.Printf("%s %+v\n", op, a)
 	prog := &obj.Prog{
 		Ctxt:   p.ctxt,
 		Lineno: p.histLineNum,
@@ -525,7 +525,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 					prog.To = a[1]
 					break
 				}
-				p.errorf("unrecognized addressing for %s", obj.Aconv(op))
+				p.errorf("unrecognized addressing for %s", op)
 				return
 			}
 			if arch.IsARMFloatCmp(op) {
@@ -572,7 +572,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			// Catch missing operand here, because we store immediate as part of From3, and can't distinguish
 			// missing operand from legal value 0 in obj/x86/asm6.
 			if arch.IsAMD4OP(op) {
-				p.errorf("4 operands required, but only 3 are provided for %s instruction", obj.Aconv(op))
+				p.errorf("4 operands required, but only 3 are provided for %s instruction", op)
 			}
 			prog.From = a[0]
 			prog.From3 = newAddr(a[1])
@@ -583,7 +583,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 				prog.From = a[0]
 				prog.To = a[1]
 				if a[2].Type != obj.TYPE_REG {
-					p.errorf("invalid addressing modes for third operand to %s instruction, must be register", obj.Aconv(op))
+					p.errorf("invalid addressing modes for third operand to %s instruction, must be register", op)
 					return
 				}
 				prog.RegTo2 = a[2].Reg
@@ -619,7 +619,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 				prog.From3 = newAddr(a[1])
 				prog.To = a[2]
 			default:
-				p.errorf("invalid addressing modes for %s instruction", obj.Aconv(op))
+				p.errorf("invalid addressing modes for %s instruction", op)
 				return
 			}
 		case sys.S390X:
@@ -656,10 +656,10 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.From = a[1]
 			prog.From3 = newAddr(a[2])
 			if a[0].Type != obj.TYPE_CONST {
-				p.errorf("first operand must be an immediate in %s instruction", obj.Aconv(op))
+				p.errorf("first operand must be an immediate in %s instruction", op)
 			}
 			if prog.From3.Type != obj.TYPE_REG {
-				p.errorf("third operand must be a register in %s instruction", obj.Aconv(op))
+				p.errorf("third operand must be a register in %s instruction", op)
 			}
 			prog.From3.Offset = int64(p.getImmediate(prog, op, &a[0]))
 			prog.To = a[3]
@@ -690,7 +690,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.To = a[3]
 			break
 		}
-		p.errorf("can't handle %s instruction with 4 operands", obj.Aconv(op))
+		p.errorf("can't handle %s instruction with 4 operands", op)
 		return
 	case 5:
 		if p.arch.Family == sys.PPC64 && arch.IsPPC64RLD(op) {
@@ -712,7 +712,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.To = a[4]
 			break
 		}
-		p.errorf("can't handle %s instruction with 5 operands", obj.Aconv(op))
+		p.errorf("can't handle %s instruction with 5 operands", op)
 		return
 	case 6:
 		if p.arch.Family == sys.ARM && arch.IsARMMRC(op) {
@@ -736,7 +736,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 		}
 		fallthrough
 	default:
-		p.errorf("can't handle %s instruction with %d operands", obj.Aconv(op), len(a))
+		p.errorf("can't handle %s instruction with %d operands", op, len(a))
 		return
 	}

@@ -771,7 +771,7 @@ func (p *Parser) getConstantPseudo(pseudo string, addr *obj.Addr) int64 {
 // getConstant checks that addr represents a plain constant and returns its value.
 func (p *Parser) getConstant(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 	if addr.Type != obj.TYPE_MEM || addr.Name != 0 || addr.Reg != 0 || addr.Index != 0 {
-		p.errorf("%s: expected integer constant; found %s", obj.Aconv(op), obj.Dconv(prog, addr))
+		p.errorf("%s: expected integer constant; found %s", op, obj.Dconv(prog, addr))
 	}
 	return addr.Offset
 }
@@ -779,7 +779,7 @@ func (p *Parser) getConstant(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 // getImmediate checks that addr represents an immediate constant and returns its value.
 func (p *Parser) getImmediate(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 	if addr.Type != obj.TYPE_CONST || addr.Name != 0 || addr.Reg != 0 || addr.Index != 0 {
-		p.errorf("%s: expected immediate constant; found %s", obj.Aconv(op), obj.Dconv(prog, addr))
+		p.errorf("%s: expected immediate constant; found %s", op, obj.Dconv(prog, addr))
 	}
 	return addr.Offset
 }
@@ -787,7 +787,7 @@ func (p *Parser) getImmediate(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 // getRegister checks that addr represents a register and returns its value.
 func (p *Parser) getRegister(prog *obj.Prog, op obj.As, addr *obj.Addr) int16 {
 	if addr.Type != obj.TYPE_REG || addr.Offset != 0 || addr.Name != 0 || addr.Index != 0 {
-		p.errorf("%s: expected register; found %s", obj.Aconv(op), obj.Dconv(prog, addr))
+		p.errorf("%s: expected register; found %s", op, obj.Dconv(prog, addr))
 	}
 	return addr.Reg
 }
--- a/src/cmd/asm/internal/asm/operand_test.go
+++ b/src/cmd/asm/internal/asm/operand_test.go
@@ -17,6 +17,7 @@ import (

 func setArch(goarch string) (*arch.Arch, *obj.Link) {
 	os.Setenv("GOOS", "linux") // obj can handle this OS for all architectures.
+	os.Setenv("GOARCH", goarch)
 	architecture := arch.Set(goarch)
 	if architecture == nil {
 		panic("asm: unrecognized architecture " + goarch)
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -239,89 +239,87 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		}
 		opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1]))

-	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW,
-		ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU,
-		ssa.OpAMD64MODQ, ssa.OpAMD64MODL, ssa.OpAMD64MODW,
-		ssa.OpAMD64MODQU, ssa.OpAMD64MODLU, ssa.OpAMD64MODWU:
+	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
+		// Arg[0] (the dividend) is in AX.
+		// Arg[1] (the divisor) can be in any other register.
+		// Result[0] (the quotient) is in AX.
+		// Result[1] (the remainder) is in DX.
+		r := gc.SSARegNum(v.Args[1])

-		// Arg[0] is already in AX as it's the only register we allow
-		// and AX is the only output
-		x := gc.SSARegNum(v.Args[1])
-
-		// CPU faults upon signed overflow, which occurs when most
-		// negative int is divided by -1.
-		var j *obj.Prog
-		if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL ||
-			v.Op == ssa.OpAMD64DIVW || v.Op == ssa.OpAMD64MODQ ||
-			v.Op == ssa.OpAMD64MODL || v.Op == ssa.OpAMD64MODW {
-
-			var c *obj.Prog
-			switch v.Op {
-			case ssa.OpAMD64DIVQ, ssa.OpAMD64MODQ:
-				c = gc.Prog(x86.ACMPQ)
-				j = gc.Prog(x86.AJEQ)
-				// go ahead and sign extend to save doing it later
-				gc.Prog(x86.ACQO)
-
-			case ssa.OpAMD64DIVL, ssa.OpAMD64MODL:
-				c = gc.Prog(x86.ACMPL)
-				j = gc.Prog(x86.AJEQ)
-				gc.Prog(x86.ACDQ)
-
-			case ssa.OpAMD64DIVW, ssa.OpAMD64MODW:
-				c = gc.Prog(x86.ACMPW)
-				j = gc.Prog(x86.AJEQ)
-				gc.Prog(x86.ACWD)
-			}
-			c.From.Type = obj.TYPE_REG
-			c.From.Reg = x
-			c.To.Type = obj.TYPE_CONST
-			c.To.Offset = -1
-
-			j.To.Type = obj.TYPE_BRANCH
-
-		}
-
-		// for unsigned ints, we sign extend by setting DX = 0
-		// signed ints were sign extended above
-		if v.Op == ssa.OpAMD64DIVQU || v.Op == ssa.OpAMD64MODQU ||
-			v.Op == ssa.OpAMD64DIVLU || v.Op == ssa.OpAMD64MODLU ||
-			v.Op == ssa.OpAMD64DIVWU || v.Op == ssa.OpAMD64MODWU {
-			c := gc.Prog(x86.AXORQ)
-			c.From.Type = obj.TYPE_REG
-			c.From.Reg = x86.REG_DX
-			c.To.Type = obj.TYPE_REG
-			c.To.Reg = x86.REG_DX
-		}
+		// Zero extend dividend.
+		c := gc.Prog(x86.AXORL)
+		c.From.Type = obj.TYPE_REG
+		c.From.Reg = x86.REG_DX
+		c.To.Type = obj.TYPE_REG
+		c.To.Reg = x86.REG_DX

+		// Issue divide.
 		p := gc.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
-		p.From.Reg = x
+		p.From.Reg = r

-		// signed division, rest of the check for -1 case
-		if j != nil {
-			j2 := gc.Prog(obj.AJMP)
-			j2.To.Type = obj.TYPE_BRANCH
+	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
+		// Arg[0] (the dividend) is in AX.
+		// Arg[1] (the divisor) can be in any other register.
+		// Result[0] (the quotient) is in AX.
+		// Result[1] (the remainder) is in DX.
+		r := gc.SSARegNum(v.Args[1])

-			var n *obj.Prog
-			if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL ||
-				v.Op == ssa.OpAMD64DIVW {
-				// n * -1 = -n
-				n = gc.Prog(x86.ANEGQ)
-				n.To.Type = obj.TYPE_REG
-				n.To.Reg = x86.REG_AX
-			} else {
-				// n % -1 == 0
-				n = gc.Prog(x86.AXORQ)
-				n.From.Type = obj.TYPE_REG
-				n.From.Reg = x86.REG_DX
-				n.To.Type = obj.TYPE_REG
-				n.To.Reg = x86.REG_DX
-			}
-
-			j.To.Val = n
-			j2.To.Val = s.Pc()
+		// CPU faults upon signed overflow, which occurs when the most
+		// negative int is divided by -1. Handle divide by -1 as a special case.
+		var c *obj.Prog
+		switch v.Op {
+		case ssa.OpAMD64DIVQ:
+			c = gc.Prog(x86.ACMPQ)
+		case ssa.OpAMD64DIVL:
+			c = gc.Prog(x86.ACMPL)
+		case ssa.OpAMD64DIVW:
+			c = gc.Prog(x86.ACMPW)
 		}
+		c.From.Type = obj.TYPE_REG
+		c.From.Reg = r
+		c.To.Type = obj.TYPE_CONST
+		c.To.Offset = -1
+		j1 := gc.Prog(x86.AJEQ)
+		j1.To.Type = obj.TYPE_BRANCH
+
+		// Sign extend dividend.
+		switch v.Op {
+		case ssa.OpAMD64DIVQ:
+			gc.Prog(x86.ACQO)
+		case ssa.OpAMD64DIVL:
+			gc.Prog(x86.ACDQ)
+		case ssa.OpAMD64DIVW:
+			gc.Prog(x86.ACWD)
+		}
+
+		// Issue divide.
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r
+
+		// Skip over -1 fixup code.
+		j2 := gc.Prog(obj.AJMP)
+		j2.To.Type = obj.TYPE_BRANCH
+
+		// Issue -1 fixup code.
+		// n / -1 = -n
+		n1 := gc.Prog(x86.ANEGQ)
+		n1.To.Type = obj.TYPE_REG
+		n1.To.Reg = x86.REG_AX
+
+		// n % -1 == 0
+		n2 := gc.Prog(x86.AXORL)
+		n2.From.Type = obj.TYPE_REG
+		n2.From.Reg = x86.REG_DX
+		n2.To.Type = obj.TYPE_REG
+		n2.To.Reg = x86.REG_DX
+
+		// TODO(khr): issue only the -1 fixup code we need.
+		// For instance, if only the quotient is used, no point in zeroing the remainder.
+
+		j1.To.Val = n1
+		j2.To.Val = s.Pc()

 	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULW, ssa.OpAMD64HMULB,
 		ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU, ssa.OpAMD64HMULWU, ssa.OpAMD64HMULBU:
@@ -500,8 +498,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		gc.AddAux(&p.From, v)
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = gc.SSARegNum(v)
-	case ssa.OpAMD64LEAQ:
-		p := gc.Prog(x86.ALEAQ)
+	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL:
+		p := gc.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = gc.SSARegNum(v.Args[0])
 		gc.AddAux(&p.From, v)
@@ -705,7 +703,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))
 		p.To.Offset = v.AuxInt

-	case ssa.OpCopy, ssa.OpAMD64MOVQconvert: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
+	case ssa.OpCopy, ssa.OpAMD64MOVQconvert, ssa.OpAMD64MOVLconvert: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
 		if v.Type.IsMemory() {
 			return
 		}
@@ -754,27 +752,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Name = obj.NAME_AUTO
 		}
 	case ssa.OpPhi:
-		// just check to make sure regalloc and stackalloc did it right
-		if v.Type.IsMemory() {
-			return
-		}
-		f := v.Block.Func
-		loc := f.RegAlloc[v.ID]
-		for _, a := range v.Args {
-			if aloc := f.RegAlloc[a.ID]; aloc != loc { // TODO: .Equal() instead?
-				v.Fatalf("phi arg at different location than phi: %v @ %v, but arg %v @ %v\n%s\n", v, loc, a, aloc, v.Block.Func)
-			}
-		}
+		gc.CheckLoweredPhi(v)
 	case ssa.OpInitMem:
 		// memory arg needs no code
 	case ssa.OpArg:
 		// input args need no code
 	case ssa.OpAMD64LoweredGetClosurePtr:
-		// Output is hardwired to DX only,
-		// and DX contains the closure pointer on
-		// closure entry, and this "instruction"
-		// is scheduled to the very beginning
-		// of the entry block.
+		// Closure pointer is DX.
+		gc.CheckLoweredGetClosurePtr(v)
 	case ssa.OpAMD64LoweredGetG:
 		r := gc.SSARegNum(v)
 		// See the comments in cmd/internal/obj/x86/obj6.go
@@ -871,6 +856,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Reg = gc.SSARegNum(v)
 	case ssa.OpSP, ssa.OpSB:
 		// nothing to do
+	case ssa.OpSelect0, ssa.OpSelect1:
+		// nothing to do
 	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
 		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
 		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
--- a/src/cmd/compile/internal/arm/peep.go
+++ b/src/cmd/compile/internal/arm/peep.go
@@ -538,7 +538,7 @@ gotit:
 	}

 	if gc.Debug['P'] != 0 {
-		fmt.Printf(" => %v\n", obj.Aconv(p.As))
+		fmt.Printf(" => %v\n", p.As)
 	}
 	return true
 }
@@ -1036,7 +1036,7 @@ func xtramodes(g *gc.Graph, r *gc.Flow, a *obj.Addr) bool {
 func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {
 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case arm.AMOVM:
--- a/src/cmd/compile/internal/arm/prog.go
+++ b/src/cmd/compile/internal/arm/prog.go
@@ -79,6 +79,8 @@ var progtable = [arm.ALAST & obj.AMask]obj.ProgInfo{
 	arm.AMULF & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | RightRdwr},
 	arm.ASUBD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | RightRdwr},
 	arm.ASUBF & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | RightRdwr},
+	arm.ANEGD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | RightRdwr},
+	arm.ANEGF & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | RightRdwr},
 	arm.ASQRTD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | RightRdwr},

 	// Conversions.
--- a/src/cmd/compile/internal/arm/ssa.go
+++ b/src/cmd/compile/internal/arm/ssa.go
--- a/src/cmd/compile/internal/arm64/galign.go
+++ b/src/cmd/compile/internal/arm64/galign.go
@@ -6,6 +6,7 @@ package arm64

 import (
 	"cmd/compile/internal/gc"
+	"cmd/compile/internal/ssa"
 	"cmd/internal/obj/arm64"
 )

@@ -61,6 +62,11 @@ func Main() {
 	gc.Thearch.Doregbits = doregbits
 	gc.Thearch.Regnames = regnames

+	gc.Thearch.SSARegToReg = ssaRegToReg
+	gc.Thearch.SSAMarkMoves = func(s *gc.SSAGenState, b *ssa.Block) {}
+	gc.Thearch.SSAGenValue = ssaGenValue
+	gc.Thearch.SSAGenBlock = ssaGenBlock
+
 	gc.Main()
 	gc.Exit(0)
 }
--- a/src/cmd/compile/internal/arm64/peep.go
+++ b/src/cmd/compile/internal/arm64/peep.go
@@ -162,7 +162,7 @@ loop1:
 			continue
 		}
 		if gc.Debug['P'] != 0 {
-			fmt.Printf("encoding $%d directly into %v in:\n%v\n%v\n", p.From.Offset, obj.Aconv(p1.As), p, p1)
+			fmt.Printf("encoding $%d directly into %v in:\n%v\n%v\n", p.From.Offset, p1.As, p, p1)
 		}
 		p1.From.Type = obj.TYPE_CONST
 		p1.From = p.From
@@ -423,7 +423,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case obj.ANOP, /* read p->from, write p->to */
--- a/src/cmd/compile/internal/arm64/prog.go
+++ b/src/cmd/compile/internal/arm64/prog.go
@@ -44,24 +44,37 @@ var progtable = [arm64.ALAST & obj.AMask]obj.ProgInfo{
 	// Integer
 	arm64.AADD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASUB & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.ANEG & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ANEG & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite}, // why RegRead? revisit once the old backend gone
 	arm64.AAND & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AORR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AEOR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ABIC & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AMVN & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite},
 	arm64.AMUL & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AMULW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASMULL & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AUMULL & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.ASMULH & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.AUMULH & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ASMULH & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUMULH & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASDIV & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AUDIV & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ASDIVW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUDIVW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AREM & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUREM & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AREMW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUREMW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ALSL & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ALSR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AASR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ACMP & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead},
+	arm64.ACMPW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead},
 	arm64.AADC & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.UseCarry},
 	arm64.AROR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ARORW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AADDS & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.SetCarry},
+	arm64.ACSET & obj.AMask:  {Flags: gc.SizeQ | gc.RightWrite},
+	arm64.ACSEL & obj.AMask:  {Flags: gc.SizeQ | gc.RegRead | gc.RightWrite},

 	// Floating point.
 	arm64.AFADDD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -0,0 +1,865 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package arm64
+
+import (
+	"math"
+
+	"cmd/compile/internal/gc"
+	"cmd/compile/internal/ssa"
+	"cmd/internal/obj"
+	"cmd/internal/obj/arm64"
+)
+
+var ssaRegToReg = []int16{
+	arm64.REG_R0,
+	arm64.REG_R1,
+	arm64.REG_R2,
+	arm64.REG_R3,
+	arm64.REG_R4,
+	arm64.REG_R5,
+	arm64.REG_R6,
+	arm64.REG_R7,
+	arm64.REG_R8,
+	arm64.REG_R9,
+	arm64.REG_R10,
+	arm64.REG_R11,
+	arm64.REG_R12,
+	arm64.REG_R13,
+	arm64.REG_R14,
+	arm64.REG_R15,
+	arm64.REG_R16,
+	arm64.REG_R17,
+	arm64.REG_R18, // platform register, not used
+	arm64.REG_R19,
+	arm64.REG_R20,
+	arm64.REG_R21,
+	arm64.REG_R22,
+	arm64.REG_R23,
+	arm64.REG_R24,
+	arm64.REG_R25,
+	arm64.REG_R26,
+	// R27 = REGTMP not used in regalloc
+	arm64.REGG,    // R28
+	arm64.REG_R29, // frame pointer, not used
+	// R30 = REGLINK not used in regalloc
+	arm64.REGSP, // R31
+
+	arm64.REG_F0,
+	arm64.REG_F1,
+	arm64.REG_F2,
+	arm64.REG_F3,
+	arm64.REG_F4,
+	arm64.REG_F5,
+	arm64.REG_F6,
+	arm64.REG_F7,
+	arm64.REG_F8,
+	arm64.REG_F9,
+	arm64.REG_F10,
+	arm64.REG_F11,
+	arm64.REG_F12,
+	arm64.REG_F13,
+	arm64.REG_F14,
+	arm64.REG_F15,
+	arm64.REG_F16,
+	arm64.REG_F17,
+	arm64.REG_F18,
+	arm64.REG_F19,
+	arm64.REG_F20,
+	arm64.REG_F21,
+	arm64.REG_F22,
+	arm64.REG_F23,
+	arm64.REG_F24,
+	arm64.REG_F25,
+	arm64.REG_F26,
+	arm64.REG_F27,
+	arm64.REG_F28,
+	arm64.REG_F29,
+	arm64.REG_F30,
+	arm64.REG_F31,
+
+	arm64.REG_NZCV, // flag
+	0,              // SB isn't a real register.  We fill an Addr.Reg field with 0 in this case.
+}
+
+// Smallest possible faulting page at address zero,
+// see ../../../../runtime/mheap.go:/minPhysPageSize
+const minZeroPage = 4096
+
+// loadByType returns the load instruction of the given type.
+func loadByType(t ssa.Type) obj.As {
+	if t.IsFloat() {
+		switch t.Size() {
+		case 4:
+			return arm64.AFMOVS
+		case 8:
+			return arm64.AFMOVD
+		}
+	} else {
+		switch t.Size() {
+		case 1:
+			if t.IsSigned() {
+				return arm64.AMOVB
+			} else {
+				return arm64.AMOVBU
+			}
+		case 2:
+			if t.IsSigned() {
+				return arm64.AMOVH
+			} else {
+				return arm64.AMOVHU
+			}
+		case 4:
+			if t.IsSigned() {
+				return arm64.AMOVW
+			} else {
+				return arm64.AMOVWU
+			}
+		case 8:
+			return arm64.AMOVD
+		}
+	}
+	panic("bad load type")
+}
+
+// storeByType returns the store instruction of the given type.
+func storeByType(t ssa.Type) obj.As {
+	if t.IsFloat() {
+		switch t.Size() {
+		case 4:
+			return arm64.AFMOVS
+		case 8:
+			return arm64.AFMOVD
+		}
+	} else {
+		switch t.Size() {
+		case 1:
+			return arm64.AMOVB
+		case 2:
+			return arm64.AMOVH
+		case 4:
+			return arm64.AMOVW
+		case 8:
+			return arm64.AMOVD
+		}
+	}
+	panic("bad store type")
+}
+
+// makeshift encodes a register shifted by a constant, used as an Offset in Prog
+func makeshift(reg int16, typ int64, s int64) int64 {
+	return int64(reg&31)<<16 | typ | (s&63)<<10
+}
+
+// genshift generates a Prog for r = r0 op (r1 shifted by s)
+func genshift(as obj.As, r0, r1, r int16, typ int64, s int64) *obj.Prog {
+	p := gc.Prog(as)
+	p.From.Type = obj.TYPE_SHIFT
+	p.From.Offset = makeshift(r1, typ, s)
+	p.Reg = r0
+	if r != 0 {
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+	}
+	return p
+}
+
+func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
+	s.SetLineno(v.Line)
+	switch v.Op {
+	case ssa.OpInitMem:
+		// memory arg needs no code
+	case ssa.OpArg:
+		// input args need no code
+	case ssa.OpSP, ssa.OpSB, ssa.OpGetG:
+		// nothing to do
+	case ssa.OpCopy, ssa.OpARM64MOVDconvert, ssa.OpARM64MOVDreg:
+		if v.Type.IsMemory() {
+			return
+		}
+		x := gc.SSARegNum(v.Args[0])
+		y := gc.SSARegNum(v)
+		if x == y {
+			return
+		}
+		as := arm64.AMOVD
+		if v.Type.IsFloat() {
+			switch v.Type.Size() {
+			case 4:
+				as = arm64.AFMOVS
+			case 8:
+				as = arm64.AFMOVD
+			default:
+				panic("bad float size")
+			}
+		}
+		p := gc.Prog(as)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = x
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = y
+	case ssa.OpARM64MOVDnop:
+		if gc.SSARegNum(v) != gc.SSARegNum(v.Args[0]) {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		// nothing to do
+	case ssa.OpLoadReg:
+		if v.Type.IsFlags() {
+			v.Unimplementedf("load flags not implemented: %v", v.LongString())
+			return
+		}
+		p := gc.Prog(loadByType(v.Type))
+		n, off := gc.AutoVar(v.Args[0])
+		p.From.Type = obj.TYPE_MEM
+		p.From.Node = n
+		p.From.Sym = gc.Linksym(n.Sym)
+		p.From.Offset = off
+		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
+			p.From.Name = obj.NAME_PARAM
+			p.From.Offset += n.Xoffset
+		} else {
+			p.From.Name = obj.NAME_AUTO
+		}
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpPhi:
+		gc.CheckLoweredPhi(v)
+	case ssa.OpStoreReg:
+		if v.Type.IsFlags() {
+			v.Unimplementedf("store flags not implemented: %v", v.LongString())
+			return
+		}
+		p := gc.Prog(storeByType(v.Type))
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		n, off := gc.AutoVar(v)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Node = n
+		p.To.Sym = gc.Linksym(n.Sym)
+		p.To.Offset = off
+		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
+			p.To.Name = obj.NAME_PARAM
+			p.To.Offset += n.Xoffset
+		} else {
+			p.To.Name = obj.NAME_AUTO
+		}
+	case ssa.OpARM64ADD,
+		ssa.OpARM64SUB,
+		ssa.OpARM64AND,
+		ssa.OpARM64OR,
+		ssa.OpARM64XOR,
+		ssa.OpARM64BIC,
+		ssa.OpARM64MUL,
+		ssa.OpARM64MULW,
+		ssa.OpARM64MULH,
+		ssa.OpARM64UMULH,
+		ssa.OpARM64MULL,
+		ssa.OpARM64UMULL,
+		ssa.OpARM64DIV,
+		ssa.OpARM64UDIV,
+		ssa.OpARM64DIVW,
+		ssa.OpARM64UDIVW,
+		ssa.OpARM64MOD,
+		ssa.OpARM64UMOD,
+		ssa.OpARM64MODW,
+		ssa.OpARM64UMODW,
+		ssa.OpARM64SLL,
+		ssa.OpARM64SRL,
+		ssa.OpARM64SRA,
+		ssa.OpARM64FADDS,
+		ssa.OpARM64FADDD,
+		ssa.OpARM64FSUBS,
+		ssa.OpARM64FSUBD,
+		ssa.OpARM64FMULS,
+		ssa.OpARM64FMULD,
+		ssa.OpARM64FDIVS,
+		ssa.OpARM64FDIVD:
+		r := gc.SSARegNum(v)
+		r1 := gc.SSARegNum(v.Args[0])
+		r2 := gc.SSARegNum(v.Args[1])
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r2
+		p.Reg = r1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+	case ssa.OpARM64ADDconst,
+		ssa.OpARM64SUBconst,
+		ssa.OpARM64ANDconst,
+		ssa.OpARM64ORconst,
+		ssa.OpARM64XORconst,
+		ssa.OpARM64BICconst,
+		ssa.OpARM64SLLconst,
+		ssa.OpARM64SRLconst,
+		ssa.OpARM64SRAconst,
+		ssa.OpARM64RORconst,
+		ssa.OpARM64RORWconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64ADDshiftLL,
+		ssa.OpARM64SUBshiftLL,
+		ssa.OpARM64ANDshiftLL,
+		ssa.OpARM64ORshiftLL,
+		ssa.OpARM64XORshiftLL,
+		ssa.OpARM64BICshiftLL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm64.SHIFT_LL, v.AuxInt)
+	case ssa.OpARM64ADDshiftRL,
+		ssa.OpARM64SUBshiftRL,
+		ssa.OpARM64ANDshiftRL,
+		ssa.OpARM64ORshiftRL,
+		ssa.OpARM64XORshiftRL,
+		ssa.OpARM64BICshiftRL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm64.SHIFT_LR, v.AuxInt)
+	case ssa.OpARM64ADDshiftRA,
+		ssa.OpARM64SUBshiftRA,
+		ssa.OpARM64ANDshiftRA,
+		ssa.OpARM64ORshiftRA,
+		ssa.OpARM64XORshiftRA,
+		ssa.OpARM64BICshiftRA:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm64.SHIFT_AR, v.AuxInt)
+	case ssa.OpARM64MOVDconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64FMOVSconst,
+		ssa.OpARM64FMOVDconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_FCONST
+		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64CMP,
+		ssa.OpARM64CMPW,
+		ssa.OpARM64CMN,
+		ssa.OpARM64CMNW,
+		ssa.OpARM64FCMPS,
+		ssa.OpARM64FCMPD:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[1])
+		p.Reg = gc.SSARegNum(v.Args[0])
+	case ssa.OpARM64CMPconst,
+		ssa.OpARM64CMPWconst,
+		ssa.OpARM64CMNconst,
+		ssa.OpARM64CMNWconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.Reg = gc.SSARegNum(v.Args[0])
+	case ssa.OpARM64CMPshiftLL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), 0, arm64.SHIFT_LL, v.AuxInt)
+	case ssa.OpARM64CMPshiftRL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), 0, arm64.SHIFT_LR, v.AuxInt)
+	case ssa.OpARM64CMPshiftRA:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), 0, arm64.SHIFT_AR, v.AuxInt)
+	case ssa.OpARM64MOVDaddr:
+		p := gc.Prog(arm64.AMOVD)
+		p.From.Type = obj.TYPE_ADDR
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+
+		var wantreg string
+		// MOVD $sym+off(base), R
+		// the assembler expands it as the following:
+		// - base is SP: add constant offset to SP (R13)
+		//               when constant is large, tmp register (R11) may be used
+		// - base is SB: load external address from constant pool (use relocation)
+		switch v.Aux.(type) {
+		default:
+			v.Fatalf("aux is of unknown type %T", v.Aux)
+		case *ssa.ExternSymbol:
+			wantreg = "SB"
+			gc.AddAux(&p.From, v)
+		case *ssa.ArgSymbol, *ssa.AutoSymbol:
+			wantreg = "SP"
+			gc.AddAux(&p.From, v)
+		case nil:
+			// No sym, just MOVD $off(SP), R
+			wantreg = "SP"
+			p.From.Reg = arm64.REGSP
+			p.From.Offset = v.AuxInt
+		}
+		if reg := gc.SSAReg(v.Args[0]); reg.Name() != wantreg {
+			v.Fatalf("bad reg %s for symbol type %T, want %s", reg.Name(), v.Aux, wantreg)
+		}
+	case ssa.OpARM64MOVBload,
+		ssa.OpARM64MOVBUload,
+		ssa.OpARM64MOVHload,
+		ssa.OpARM64MOVHUload,
+		ssa.OpARM64MOVWload,
+		ssa.OpARM64MOVWUload,
+		ssa.OpARM64MOVDload,
+		ssa.OpARM64FMOVSload,
+		ssa.OpARM64FMOVDload:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64MOVBstore,
+		ssa.OpARM64MOVHstore,
+		ssa.OpARM64MOVWstore,
+		ssa.OpARM64MOVDstore,
+		ssa.OpARM64FMOVSstore,
+		ssa.OpARM64FMOVDstore:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[1])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.To, v)
+	case ssa.OpARM64MOVBstorezero,
+		ssa.OpARM64MOVHstorezero,
+		ssa.OpARM64MOVWstorezero,
+		ssa.OpARM64MOVDstorezero:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = arm64.REGZERO
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.To, v)
+	case ssa.OpARM64MOVBreg,
+		ssa.OpARM64MOVBUreg,
+		ssa.OpARM64MOVHreg,
+		ssa.OpARM64MOVHUreg,
+		ssa.OpARM64MOVWreg,
+		ssa.OpARM64MOVWUreg:
+		a := v.Args[0]
+		for a.Op == ssa.OpCopy || a.Op == ssa.OpARM64MOVDreg {
+			a = a.Args[0]
+		}
+		if a.Op == ssa.OpLoadReg {
+			t := a.Type
+			switch {
+			case v.Op == ssa.OpARM64MOVBreg && t.Size() == 1 && t.IsSigned(),
+				v.Op == ssa.OpARM64MOVBUreg && t.Size() == 1 && !t.IsSigned(),
+				v.Op == ssa.OpARM64MOVHreg && t.Size() == 2 && t.IsSigned(),
+				v.Op == ssa.OpARM64MOVHUreg && t.Size() == 2 && !t.IsSigned(),
+				v.Op == ssa.OpARM64MOVWreg && t.Size() == 4 && t.IsSigned(),
+				v.Op == ssa.OpARM64MOVWUreg && t.Size() == 4 && !t.IsSigned():
+				// arg is a proper-typed load, already zero/sign-extended, don't extend again
+				if gc.SSARegNum(v) == gc.SSARegNum(v.Args[0]) {
+					return
+				}
+				p := gc.Prog(arm64.AMOVD)
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = gc.SSARegNum(v.Args[0])
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = gc.SSARegNum(v)
+				return
+			default:
+			}
+		}
+		fallthrough
+	case ssa.OpARM64MVN,
+		ssa.OpARM64NEG,
+		ssa.OpARM64FNEGS,
+		ssa.OpARM64FNEGD,
+		ssa.OpARM64FSQRTD,
+		ssa.OpARM64FCVTZSSW,
+		ssa.OpARM64FCVTZSDW,
+		ssa.OpARM64FCVTZUSW,
+		ssa.OpARM64FCVTZUDW,
+		ssa.OpARM64FCVTZSS,
+		ssa.OpARM64FCVTZSD,
+		ssa.OpARM64FCVTZUS,
+		ssa.OpARM64FCVTZUD,
+		ssa.OpARM64SCVTFWS,
+		ssa.OpARM64SCVTFWD,
+		ssa.OpARM64SCVTFS,
+		ssa.OpARM64SCVTFD,
+		ssa.OpARM64UCVTFWS,
+		ssa.OpARM64UCVTFWD,
+		ssa.OpARM64UCVTFS,
+		ssa.OpARM64UCVTFD,
+		ssa.OpARM64FCVTSD,
+		ssa.OpARM64FCVTDS:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64CSELULT,
+		ssa.OpARM64CSELULT0:
+		r1 := int16(arm64.REGZERO)
+		if v.Op == ssa.OpARM64CSELULT {
+			r1 = gc.SSARegNum(v.Args[1])
+		}
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG // assembler encodes conditional bits in Reg
+		p.From.Reg = arm64.COND_LO
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.From3 = &obj.Addr{Type: obj.TYPE_REG, Reg: r1}
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64DUFFZERO:
+		// runtime.duffzero expects start address - 8 in R16
+		p := gc.Prog(arm64.ASUB)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 8
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REG_R16
+		p = gc.Prog(obj.ADUFFZERO)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
+		p.To.Offset = v.AuxInt
+	case ssa.OpARM64LoweredZero:
+		// MOVD.P	ZR, 8(R16)
+		// CMP	Rarg1, R16
+		// BLE	-2(PC)
+		// arg1 is the address of the last element to zero
+		// auxint is alignment
+		var sz int64
+		var mov obj.As
+		switch {
+		case v.AuxInt%8 == 0:
+			sz = 8
+			mov = arm64.AMOVD
+		case v.AuxInt%4 == 0:
+			sz = 4
+			mov = arm64.AMOVW
+		case v.AuxInt%2 == 0:
+			sz = 2
+			mov = arm64.AMOVH
+		default:
+			sz = 1
+			mov = arm64.AMOVB
+		}
+		p := gc.Prog(mov)
+		p.Scond = arm64.C_XPOST
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = arm64.REGZERO
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = arm64.REG_R16
+		p.To.Offset = sz
+		p2 := gc.Prog(arm64.ACMP)
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = gc.SSARegNum(v.Args[1])
+		p2.Reg = arm64.REG_R16
+		p3 := gc.Prog(arm64.ABLE)
+		p3.To.Type = obj.TYPE_BRANCH
+		gc.Patch(p3, p)
+	case ssa.OpARM64LoweredMove:
+		// MOVD.P	8(R16), Rtmp
+		// MOVD.P	Rtmp, 8(R17)
+		// CMP	Rarg2, R16
+		// BLE	-3(PC)
+		// arg2 is the address of the last element of src
+		// auxint is alignment
+		var sz int64
+		var mov obj.As
+		switch {
+		case v.AuxInt%8 == 0:
+			sz = 8
+			mov = arm64.AMOVD
+		case v.AuxInt%4 == 0:
+			sz = 4
+			mov = arm64.AMOVW
+		case v.AuxInt%2 == 0:
+			sz = 2
+			mov = arm64.AMOVH
+		default:
+			sz = 1
+			mov = arm64.AMOVB
+		}
+		p := gc.Prog(mov)
+		p.Scond = arm64.C_XPOST
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = arm64.REG_R16
+		p.From.Offset = sz
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REGTMP
+		p2 := gc.Prog(mov)
+		p2.Scond = arm64.C_XPOST
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = arm64.REGTMP
+		p2.To.Type = obj.TYPE_MEM
+		p2.To.Reg = arm64.REG_R17
+		p2.To.Offset = sz
+		p3 := gc.Prog(arm64.ACMP)
+		p3.From.Type = obj.TYPE_REG
+		p3.From.Reg = gc.SSARegNum(v.Args[2])
+		p3.Reg = arm64.REG_R16
+		p4 := gc.Prog(arm64.ABLE)
+		p4.To.Type = obj.TYPE_BRANCH
+		gc.Patch(p4, p)
+	case ssa.OpARM64CALLstatic:
+		if v.Aux.(*gc.Sym) == gc.Deferreturn.Sym {
+			// Deferred calls will appear to be returning to
+			// the CALL deferreturn(SB) that we are about to emit.
+			// However, the stack trace code will show the line
+			// of the instruction byte before the return PC.
+			// To avoid that being an unrelated instruction,
+			// insert an actual hardware NOP that will have the right line number.
+			// This is different from obj.ANOP, which is a virtual no-op
+			// that doesn't make it into the instruction stream.
+			ginsnop()
+		}
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym))
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLclosure:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Offset = 0
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLdefer:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Deferproc.Sym)
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLgo:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Newproc.Sym)
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLinter:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Offset = 0
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64LoweredNilCheck:
+		// Optimization - if the subsequent block has a load or store
+		// at the same address, we don't need to issue this instruction.
+		mem := v.Args[1]
+		for _, w := range v.Block.Succs[0].Block().Values {
+			if w.Op == ssa.OpPhi {
+				if w.Type.IsMemory() {
+					mem = w
+				}
+				continue
+			}
+			if len(w.Args) == 0 || !w.Args[len(w.Args)-1].Type.IsMemory() {
+				// w doesn't use a store - can't be a memory op.
+				continue
+			}
+			if w.Args[len(w.Args)-1] != mem {
+				v.Fatalf("wrong store after nilcheck v=%s w=%s", v, w)
+			}
+			switch w.Op {
+			case ssa.OpARM64MOVBload, ssa.OpARM64MOVBUload, ssa.OpARM64MOVHload, ssa.OpARM64MOVHUload,
+				ssa.OpARM64MOVWload, ssa.OpARM64MOVWUload, ssa.OpARM64MOVDload,
+				ssa.OpARM64FMOVSload, ssa.OpARM64FMOVDload,
+				ssa.OpARM64MOVBstore, ssa.OpARM64MOVHstore, ssa.OpARM64MOVWstore, ssa.OpARM64MOVDstore,
+				ssa.OpARM64FMOVSstore, ssa.OpARM64FMOVDstore:
+				// arg0 is ptr, auxint is offset
+				if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			case ssa.OpARM64DUFFZERO, ssa.OpARM64LoweredZero:
+				// arg0 is ptr
+				if w.Args[0] == v.Args[0] {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			case ssa.OpARM64LoweredMove:
+				// arg0 is dst ptr, arg1 is src ptr
+				if w.Args[0] == v.Args[0] || w.Args[1] == v.Args[0] {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			default:
+			}
+			if w.Type.IsMemory() {
+				if w.Op == ssa.OpVarDef || w.Op == ssa.OpVarKill || w.Op == ssa.OpVarLive {
+					// these ops are OK
+					mem = w
+					continue
+				}
+				// We can't delay the nil check past the next store.
+				break
+			}
+		}
+		// Issue a load which will fault if arg is nil.
+		p := gc.Prog(arm64.AMOVB)
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REGTMP
+		if gc.Debug_checknil != 0 && v.Line > 1 { // v.Line==1 in generated wrappers
+			gc.Warnl(v.Line, "generated nil check")
+		}
+	case ssa.OpVarDef:
+		gc.Gvardef(v.Aux.(*gc.Node))
+	case ssa.OpVarKill:
+		gc.Gvarkill(v.Aux.(*gc.Node))
+	case ssa.OpVarLive:
+		gc.Gvarlive(v.Aux.(*gc.Node))
+	case ssa.OpKeepAlive:
+		if !v.Args[0].Type.IsPtrShaped() {
+			v.Fatalf("keeping non-pointer alive %v", v.Args[0])
+		}
+		n, off := gc.AutoVar(v.Args[0])
+		if n == nil {
+			v.Fatalf("KeepLive with non-spilled value %s %s", v, v.Args[0])
+		}
+		if off != 0 {
+			v.Fatalf("KeepLive with non-zero offset spill location %s:%d", n, off)
+		}
+		gc.Gvarlive(n)
+	case ssa.OpARM64Equal,
+		ssa.OpARM64NotEqual,
+		ssa.OpARM64LessThan,
+		ssa.OpARM64LessEqual,
+		ssa.OpARM64GreaterThan,
+		ssa.OpARM64GreaterEqual,
+		ssa.OpARM64LessThanU,
+		ssa.OpARM64LessEqualU,
+		ssa.OpARM64GreaterThanU,
+		ssa.OpARM64GreaterEqualU:
+		// generate boolean values using CSET
+		p := gc.Prog(arm64.ACSET)
+		p.From.Type = obj.TYPE_REG // assembler encodes conditional bits in Reg
+		p.From.Reg = condBits[v.Op]
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpSelect0, ssa.OpSelect1:
+		// nothing to do
+	case ssa.OpARM64LoweredGetClosurePtr:
+		// Closure pointer is R26 (arm64.REGCTXT).
+		gc.CheckLoweredGetClosurePtr(v)
+	case ssa.OpARM64FlagEQ,
+		ssa.OpARM64FlagLT_ULT,
+		ssa.OpARM64FlagLT_UGT,
+		ssa.OpARM64FlagGT_ULT,
+		ssa.OpARM64FlagGT_UGT:
+		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
+	case ssa.OpARM64InvertFlags:
+		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
+	default:
+		v.Unimplementedf("genValue not implemented: %s", v.LongString())
+	}
+}
+
+var condBits = map[ssa.Op]int16{
+	ssa.OpARM64Equal:         arm64.COND_EQ,
+	ssa.OpARM64NotEqual:      arm64.COND_NE,
+	ssa.OpARM64LessThan:      arm64.COND_LT,
+	ssa.OpARM64LessThanU:     arm64.COND_LO,
+	ssa.OpARM64LessEqual:     arm64.COND_LE,
+	ssa.OpARM64LessEqualU:    arm64.COND_LS,
+	ssa.OpARM64GreaterThan:   arm64.COND_GT,
+	ssa.OpARM64GreaterThanU:  arm64.COND_HI,
+	ssa.OpARM64GreaterEqual:  arm64.COND_GE,
+	ssa.OpARM64GreaterEqualU: arm64.COND_HS,
+}
+
+var blockJump = map[ssa.BlockKind]struct {
+	asm, invasm obj.As
+}{
+	ssa.BlockARM64EQ:  {arm64.ABEQ, arm64.ABNE},
+	ssa.BlockARM64NE:  {arm64.ABNE, arm64.ABEQ},
+	ssa.BlockARM64LT:  {arm64.ABLT, arm64.ABGE},
+	ssa.BlockARM64GE:  {arm64.ABGE, arm64.ABLT},
+	ssa.BlockARM64LE:  {arm64.ABLE, arm64.ABGT},
+	ssa.BlockARM64GT:  {arm64.ABGT, arm64.ABLE},
+	ssa.BlockARM64ULT: {arm64.ABLO, arm64.ABHS},
+	ssa.BlockARM64UGE: {arm64.ABHS, arm64.ABLO},
+	ssa.BlockARM64UGT: {arm64.ABHI, arm64.ABLS},
+	ssa.BlockARM64ULE: {arm64.ABLS, arm64.ABHI},
+}
+
+func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
+	s.SetLineno(b.Line)
+
+	switch b.Kind {
+	case ssa.BlockPlain, ssa.BlockCall, ssa.BlockCheck:
+		if b.Succs[0].Block() != next {
+			p := gc.Prog(obj.AJMP)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		}
+
+	case ssa.BlockDefer:
+		// defer returns in R0:
+		// 0 if we should continue executing
+		// 1 if we should jump to deferreturn call
+		p := gc.Prog(arm64.ACMP)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 0
+		p.Reg = arm64.REG_R0
+		p = gc.Prog(arm64.ABNE)
+		p.To.Type = obj.TYPE_BRANCH
+		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
+		if b.Succs[0].Block() != next {
+			p := gc.Prog(obj.AJMP)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		}
+
+	case ssa.BlockExit:
+		gc.Prog(obj.AUNDEF) // tell plive.go that we never reach here
+
+	case ssa.BlockRet:
+		gc.Prog(obj.ARET)
+
+	case ssa.BlockRetJmp:
+		p := gc.Prog(obj.ARET)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(b.Aux.(*gc.Sym))
+
+	case ssa.BlockARM64EQ, ssa.BlockARM64NE,
+		ssa.BlockARM64LT, ssa.BlockARM64GE,
+		ssa.BlockARM64LE, ssa.BlockARM64GT,
+		ssa.BlockARM64ULT, ssa.BlockARM64UGT,
+		ssa.BlockARM64ULE, ssa.BlockARM64UGE:
+		jmp := blockJump[b.Kind]
+		var p *obj.Prog
+		switch next {
+		case b.Succs[0].Block():
+			p = gc.Prog(jmp.invasm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
+		case b.Succs[1].Block():
+			p = gc.Prog(jmp.asm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		default:
+			p = gc.Prog(jmp.asm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+			q := gc.Prog(obj.AJMP)
+			q.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: q, B: b.Succs[1].Block()})
+		}
+
+	default:
+		b.Unimplementedf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
+	}
+}
--- a/src/cmd/compile/internal/big/arith_test.go
+++ b/src/cmd/compile/internal/big/arith_test.go
@@ -5,6 +5,7 @@
 package big

 import (
+	"fmt"
 	"math/rand"
 	"testing"
 )
@@ -118,28 +119,22 @@ func rndV(n int) []Word {
 	return v
 }

-func benchmarkFunVV(b *testing.B, f funVV, n int) {
-	x := rndV(n)
-	y := rndV(n)
-	z := make([]Word, n)
-	b.SetBytes(int64(n * _W))
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		f(z, x, y)
+var benchSizes = []int{1, 2, 3, 4, 5, 1e1, 1e2, 1e3, 1e4, 1e5}
+
+func BenchmarkAddVV(b *testing.B) {
+	for _, n := range benchSizes {
+		x := rndV(n)
+		y := rndV(n)
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _W))
+			for i := 0; i < b.N; i++ {
+				addVV(z, x, y)
+			}
+		})
 	}
 }

-func BenchmarkAddVV_1(b *testing.B)   { benchmarkFunVV(b, addVV, 1) }
-func BenchmarkAddVV_2(b *testing.B)   { benchmarkFunVV(b, addVV, 2) }
-func BenchmarkAddVV_3(b *testing.B)   { benchmarkFunVV(b, addVV, 3) }
-func BenchmarkAddVV_4(b *testing.B)   { benchmarkFunVV(b, addVV, 4) }
-func BenchmarkAddVV_5(b *testing.B)   { benchmarkFunVV(b, addVV, 5) }
-func BenchmarkAddVV_1e1(b *testing.B) { benchmarkFunVV(b, addVV, 1e1) }
-func BenchmarkAddVV_1e2(b *testing.B) { benchmarkFunVV(b, addVV, 1e2) }
-func BenchmarkAddVV_1e3(b *testing.B) { benchmarkFunVV(b, addVV, 1e3) }
-func BenchmarkAddVV_1e4(b *testing.B) { benchmarkFunVV(b, addVV, 1e4) }
-func BenchmarkAddVV_1e5(b *testing.B) { benchmarkFunVV(b, addVV, 1e5) }
-
 type funVW func(z, x []Word, y Word) (c Word)
 type argVW struct {
 	z, x nat
@@ -236,28 +231,20 @@ func TestFunVW(t *testing.T) {
 	}
 }

-func benchmarkFunVW(b *testing.B, f funVW, n int) {
-	x := rndV(n)
-	y := rndW()
-	z := make([]Word, n)
-	b.SetBytes(int64(n * _S))
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		f(z, x, y)
+func BenchmarkAddVW(b *testing.B) {
+	for _, n := range benchSizes {
+		x := rndV(n)
+		y := rndW()
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _S))
+			for i := 0; i < b.N; i++ {
+				addVW(z, x, y)
+			}
+		})
 	}
 }

-func BenchmarkAddVW_1(b *testing.B)   { benchmarkFunVW(b, addVW, 1) }
-func BenchmarkAddVW_2(b *testing.B)   { benchmarkFunVW(b, addVW, 2) }
-func BenchmarkAddVW_3(b *testing.B)   { benchmarkFunVW(b, addVW, 3) }
-func BenchmarkAddVW_4(b *testing.B)   { benchmarkFunVW(b, addVW, 4) }
-func BenchmarkAddVW_5(b *testing.B)   { benchmarkFunVW(b, addVW, 5) }
-func BenchmarkAddVW_1e1(b *testing.B) { benchmarkFunVW(b, addVW, 1e1) }
-func BenchmarkAddVW_1e2(b *testing.B) { benchmarkFunVW(b, addVW, 1e2) }
-func BenchmarkAddVW_1e3(b *testing.B) { benchmarkFunVW(b, addVW, 1e3) }
-func BenchmarkAddVW_1e4(b *testing.B) { benchmarkFunVW(b, addVW, 1e4) }
-func BenchmarkAddVW_1e5(b *testing.B) { benchmarkFunVW(b, addVW, 1e5) }
-
 type funVWW func(z, x []Word, y, r Word) (c Word)
 type argVWW struct {
 	z, x nat
@@ -382,28 +369,20 @@ func TestMulAddWWW(t *testing.T) {
 	}
 }

-func benchmarkAddMulVVW(b *testing.B, n int) {
-	x := rndV(n)
-	y := rndW()
-	z := make([]Word, n)
-	b.SetBytes(int64(n * _W))
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		addMulVVW(z, x, y)
+func BenchmarkAddMulVVW(b *testing.B) {
+	for _, n := range benchSizes {
+		x := rndV(n)
+		y := rndW()
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _W))
+			for i := 0; i < b.N; i++ {
+				addMulVVW(z, x, y)
+			}
+		})
 	}
 }

-func BenchmarkAddMulVVW_1(b *testing.B)   { benchmarkAddMulVVW(b, 1) }
-func BenchmarkAddMulVVW_2(b *testing.B)   { benchmarkAddMulVVW(b, 2) }
-func BenchmarkAddMulVVW_3(b *testing.B)   { benchmarkAddMulVVW(b, 3) }
-func BenchmarkAddMulVVW_4(b *testing.B)   { benchmarkAddMulVVW(b, 4) }
-func BenchmarkAddMulVVW_5(b *testing.B)   { benchmarkAddMulVVW(b, 5) }
-func BenchmarkAddMulVVW_1e1(b *testing.B) { benchmarkAddMulVVW(b, 1e1) }
-func BenchmarkAddMulVVW_1e2(b *testing.B) { benchmarkAddMulVVW(b, 1e2) }
-func BenchmarkAddMulVVW_1e3(b *testing.B) { benchmarkAddMulVVW(b, 1e3) }
-func BenchmarkAddMulVVW_1e4(b *testing.B) { benchmarkAddMulVVW(b, 1e4) }
-func BenchmarkAddMulVVW_1e5(b *testing.B) { benchmarkAddMulVVW(b, 1e5) }
-
 func testWordBitLen(t *testing.T, fname string, f func(Word) int) {
 	for i := 0; i <= _W; i++ {
 		x := Word(1) << uint(i-1) // i == 0 => x == 0
@@ -420,23 +399,15 @@ func TestWordBitLen(t *testing.T) {
 }

 // runs b.N iterations of bitLen called on a Word containing (1 << nbits)-1.
-func benchmarkBitLenN(b *testing.B, nbits uint) {
-	testword := Word((uint64(1) << nbits) - 1)
-	for i := 0; i < b.N; i++ {
-		bitLen(testword)
+func BenchmarkBitLen(b *testing.B) {
+	// Individual bitLen tests. Numbers chosen to examine both sides
+	// of powers-of-two boundaries.
+	for _, nbits := range []uint{0, 1, 2, 3, 4, 5, 8, 9, 16, 17, 31} {
+		testword := Word((uint64(1) << nbits) - 1)
+		b.Run(fmt.Sprint(nbits), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				bitLen(testword)
+			}
+		})
 	}
 }
-
-// Individual bitLen tests. Numbers chosen to examine both sides
-// of powers-of-two boundaries.
-func BenchmarkBitLen0(b *testing.B)  { benchmarkBitLenN(b, 0) }
-func BenchmarkBitLen1(b *testing.B)  { benchmarkBitLenN(b, 1) }
-func BenchmarkBitLen2(b *testing.B)  { benchmarkBitLenN(b, 2) }
-func BenchmarkBitLen3(b *testing.B)  { benchmarkBitLenN(b, 3) }
-func BenchmarkBitLen4(b *testing.B)  { benchmarkBitLenN(b, 4) }
-func BenchmarkBitLen5(b *testing.B)  { benchmarkBitLenN(b, 5) }
-func BenchmarkBitLen8(b *testing.B)  { benchmarkBitLenN(b, 8) }
-func BenchmarkBitLen9(b *testing.B)  { benchmarkBitLenN(b, 9) }
-func BenchmarkBitLen16(b *testing.B) { benchmarkBitLenN(b, 16) }
-func BenchmarkBitLen17(b *testing.B) { benchmarkBitLenN(b, 17) }
-func BenchmarkBitLen31(b *testing.B) { benchmarkBitLenN(b, 31) }
--- a/src/cmd/compile/internal/big/decimal_test.go
+++ b/src/cmd/compile/internal/big/decimal_test.go
@@ -105,12 +105,14 @@ func TestDecimalRounding(t *testing.T) {
 	}
 }

+var sink string
+
 func BenchmarkDecimalConversion(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		for shift := -100; shift <= +100; shift++ {
 			var d decimal
 			d.init(natOne, shift)
-			d.String()
+			sink = d.String()
 		}
 	}
 }
--- a/src/cmd/compile/internal/big/doc.go
+++ b/src/cmd/compile/internal/big/doc.go
@@ -0,0 +1,99 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package big implements arbitrary-precision arithmetic (big numbers).
+The following numeric types are supported:
+
+	Int    signed integers
+	Rat    rational numbers
+	Float  floating-point numbers
+
+The zero value for an Int, Rat, or Float correspond to 0. Thus, new
+values can be declared in the usual ways and denote 0 without further
+initialization:
+
+	var x Int        // &x is an *Int of value 0
+	var r = &Rat{}   // r is a *Rat of value 0
+	y := new(Float)  // y is a *Float of value 0
+
+Alternatively, new values can be allocated and initialized with factory
+functions of the form:
+
+	func NewT(v V) *T
+
+For instance, NewInt(x) returns an *Int set to the value of the int64
+argument x, NewRat(a, b) returns a *Rat set to the fraction a/b where
+a and b are int64 values, and NewFloat(f) returns a *Float initialized
+to the float64 argument f. More flexibility is provided with explicit
+setters, for instance:
+
+	var z1 Int
+	z1.SetUint64(123)                 // z1 := 123
+	z2 := new(Rat).SetFloat64(1.2)    // z2 := 6/5
+	z3 := new(Float).SetInt(z1)       // z3 := 123.0
+
+Setters, numeric operations and predicates are represented as methods of
+the form:
+
+	func (z *T) SetV(v V) *T          // z = v
+	func (z *T) Unary(x *T) *T        // z = unary x
+	func (z *T) Binary(x, y *T) *T    // z = x binary y
+	func (x *T) Pred() P              // p = pred(x)
+
+with T one of Int, Rat, or Float. For unary and binary operations, the
+result is the receiver (usually named z in that case; see below); if it
+is one of the operands x or y it may be safely overwritten (and its memory
+reused).
+
+Arithmetic expressions are typically written as a sequence of individual
+method calls, with each call corresponding to an operation. The receiver
+denotes the result and the method arguments are the operation's operands.
+For instance, given three *Int values a, b and c, the invocation
+
+	c.Add(a, b)
+
+computes the sum a + b and stores the result in c, overwriting whatever
+value was held in c before. Unless specified otherwise, operations permit
+aliasing of parameters, so it is perfectly ok to write
+
+	sum.Add(sum, x)
+
+to accumulate values x in a sum.
+
+(By always passing in a result value via the receiver, memory use can be
+much better controlled. Instead of having to allocate new memory for each
+result, an operation can reuse the space allocated for the result value,
+and overwrite that value with the new result in the process.)
+
+Notational convention: Incoming method parameters (including the receiver)
+are named consistently in the API to clarify their use. Incoming operands
+are usually named x, y, a, b, and so on, but never z. A parameter specifying
+the result is named z (typically the receiver).
+
+For instance, the arguments for (*Int).Add are named x and y, and because
+the receiver specifies the result destination, it is called z:
+
+	func (z *Int) Add(x, y *Int) *Int
+
+Methods of this form typically return the incoming receiver as well, to
+enable simple call chaining.
+
+Methods which don't require a result value to be passed in (for instance,
+Int.Sign), simply return the result. In this case, the receiver is typically
+the first operand, named x:
+
+	func (x *Int) Sign() int
+
+Various methods support conversions between strings and corresponding
+numeric values, and vice versa: *Int, *Rat, and *Float values implement
+the Stringer interface for a (default) string representation of the value,
+but also provide SetString methods to initialize a value from a string in
+a variety of supported formats (see the respective SetString documentation).
+
+Finally, *Int, *Rat, and *Float satisfy the fmt package's Scanner interface
+for scanning and (except for *Rat) the Formatter interface for formatted
+printing.
+*/
+package big
--- a/src/cmd/compile/internal/big/float.go
+++ b/src/cmd/compile/internal/big/float.go
@@ -1008,9 +1008,9 @@ func (x *Float) Float64() (float64, Accuracy) {
 		if r.form == inf || e > emax {
 			// overflow
 			if x.neg {
-				return float64(math.Inf(-1)), Below
+				return math.Inf(-1), Below
 			}
-			return float64(math.Inf(+1)), Above
+			return math.Inf(+1), Above
 		}
 		// e <= emax

--- a/src/cmd/compile/internal/big/floatconv_test.go
+++ b/src/cmd/compile/internal/big/floatconv_test.go
@@ -290,6 +290,11 @@ func TestFloat64Text(t *testing.T) {
 		// Issue 2625.
 		{383260575764816448, 'f', 0, "383260575764816448"},
 		{383260575764816448, 'g', -1, "3.8326057576481645e+17"},
+
+		// Issue 15918.
+		{1, 'f', -10, "1"},
+		{1, 'f', -11, "1"},
+		{1, 'f', -12, "1"},
 	} {
 		// The test cases are from the strconv package which tests float64 values.
 		// When formatting values with prec = -1 (shortest representation),
--- a/src/cmd/compile/internal/big/floatexample_test.go
+++ b/src/cmd/compile/internal/big/floatexample_test.go
@@ -11,7 +11,7 @@ import (
 )

 func ExampleFloat_Add() {
-	// Operating on numbers of different precision.
+	// Operate on numbers of different precision.
 	var x, y, z big.Float
 	x.SetInt64(1000)          // x is automatically set to 64bit precision
 	y.SetFloat64(2.718281828) // y is automatically set to 53bit precision
@@ -26,8 +26,8 @@ func ExampleFloat_Add() {
 	// z = 1002.718282 (0x.faadf854p+10, prec = 32, acc = Below)
 }

-func Example_Shift() {
-	// Implementing Float "shift" by modifying the (binary) exponents directly.
+func ExampleFloat_shift() {
+	// Implement Float "shift" by modifying the (binary) exponents directly.
 	for s := -5; s <= 5; s++ {
 		x := big.NewFloat(0.5)
 		x.SetMantExp(x, x.MantExp(nil)+s) // shift x by s
--- a/src/cmd/compile/internal/big/floatmarsh.go
+++ b/src/cmd/compile/internal/big/floatmarsh.go
@@ -6,7 +6,94 @@

 package big

-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+)
+
+// Gob codec version. Permits backward-compatible changes to the encoding.
+const floatGobVersion byte = 1
+
+// GobEncode implements the gob.GobEncoder interface.
+// The Float value and all its attributes (precision,
+// rounding mode, accuracy) are marshalled.
+func (x *Float) GobEncode() ([]byte, error) {
+	if x == nil {
+		return nil, nil
+	}
+
+	// determine max. space (bytes) required for encoding
+	sz := 1 + 1 + 4 // version + mode|acc|form|neg (3+2+2+1bit) + prec
+	n := 0          // number of mantissa words
+	if x.form == finite {
+		// add space for mantissa and exponent
+		n = int((x.prec + (_W - 1)) / _W) // required mantissa length in words for given precision
+		// actual mantissa slice could be shorter (trailing 0's) or longer (unused bits):
+		// - if shorter, only encode the words present
+		// - if longer, cut off unused words when encoding in bytes
+		//   (in practice, this should never happen since rounding
+		//   takes care of it, but be safe and do it always)
+		if len(x.mant) < n {
+			n = len(x.mant)
+		}
+		// len(x.mant) >= n
+		sz += 4 + n*_S // exp + mant
+	}
+	buf := make([]byte, sz)
+
+	buf[0] = floatGobVersion
+	b := byte(x.mode&7)<<5 | byte((x.acc+1)&3)<<3 | byte(x.form&3)<<1
+	if x.neg {
+		b |= 1
+	}
+	buf[1] = b
+	binary.BigEndian.PutUint32(buf[2:], x.prec)
+
+	if x.form == finite {
+		binary.BigEndian.PutUint32(buf[6:], uint32(x.exp))
+		x.mant[len(x.mant)-n:].bytes(buf[10:]) // cut off unused trailing words
+	}
+
+	return buf, nil
+}
+
+// GobDecode implements the gob.GobDecoder interface.
+// The result is rounded per the precision and rounding mode of
+// z unless z's precision is 0, in which case z is set exactly
+// to the decoded value.
+func (z *Float) GobDecode(buf []byte) error {
+	if len(buf) == 0 {
+		// Other side sent a nil or default value.
+		*z = Float{}
+		return nil
+	}
+
+	if buf[0] != floatGobVersion {
+		return fmt.Errorf("Float.GobDecode: encoding version %d not supported", buf[0])
+	}
+
+	oldPrec := z.prec
+	oldMode := z.mode
+
+	b := buf[1]
+	z.mode = RoundingMode((b >> 5) & 7)
+	z.acc = Accuracy((b>>3)&3) - 1
+	z.form = form((b >> 1) & 3)
+	z.neg = b&1 != 0
+	z.prec = binary.BigEndian.Uint32(buf[2:])
+
+	if z.form == finite {
+		z.exp = int32(binary.BigEndian.Uint32(buf[6:]))
+		z.mant = z.mant.setBytes(buf[10:])
+	}
+
+	if oldPrec != 0 {
+		z.mode = oldMode
+		z.SetPrec(uint(oldPrec))
+	}
+
+	return nil
+}

 // MarshalText implements the encoding.TextMarshaler interface.
 // Only the Float value is marshaled (in full precision), other
--- a/src/cmd/compile/internal/big/floatmarsh_test.go
+++ b/src/cmd/compile/internal/big/floatmarsh_test.go
@@ -5,7 +5,10 @@
 package big

 import (
+	"bytes"
+	"encoding/gob"
 	"encoding/json"
+	"io"
 	"testing"
 )

@@ -23,6 +26,85 @@ var floatVals = []string{
 	"Inf",
 }

+func TestFloatGobEncoding(t *testing.T) {
+	var medium bytes.Buffer
+	enc := gob.NewEncoder(&medium)
+	dec := gob.NewDecoder(&medium)
+	for _, test := range floatVals {
+		for _, sign := range []string{"", "+", "-"} {
+			for _, prec := range []uint{0, 1, 2, 10, 53, 64, 100, 1000} {
+				for _, mode := range []RoundingMode{ToNearestEven, ToNearestAway, ToZero, AwayFromZero, ToNegativeInf, ToPositiveInf} {
+					medium.Reset() // empty buffer for each test case (in case of failures)
+					x := sign + test
+
+					var tx Float
+					_, _, err := tx.SetPrec(prec).SetMode(mode).Parse(x, 0)
+					if err != nil {
+						t.Errorf("parsing of %s (%dbits, %v) failed (invalid test case): %v", x, prec, mode, err)
+						continue
+					}
+
+					// If tx was set to prec == 0, tx.Parse(x, 0) assumes precision 64. Correct it.
+					if prec == 0 {
+						tx.SetPrec(0)
+					}
+
+					if err := enc.Encode(&tx); err != nil {
+						t.Errorf("encoding of %v (%dbits, %v) failed: %v", &tx, prec, mode, err)
+						continue
+					}
+
+					var rx Float
+					if err := dec.Decode(&rx); err != nil {
+						t.Errorf("decoding of %v (%dbits, %v) failed: %v", &tx, prec, mode, err)
+						continue
+					}
+
+					if rx.Cmp(&tx) != 0 {
+						t.Errorf("transmission of %s failed: got %s want %s", x, rx.String(), tx.String())
+						continue
+					}
+
+					if rx.Prec() != prec {
+						t.Errorf("transmission of %s's prec failed: got %d want %d", x, rx.Prec(), prec)
+					}
+
+					if rx.Mode() != mode {
+						t.Errorf("transmission of %s's mode failed: got %s want %s", x, rx.Mode(), mode)
+					}
+
+					if rx.Acc() != tx.Acc() {
+						t.Errorf("transmission of %s's accuracy failed: got %s want %s", x, rx.Acc(), tx.Acc())
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestFloatCorruptGob(t *testing.T) {
+	var buf bytes.Buffer
+	tx := NewFloat(4 / 3).SetPrec(1000).SetMode(ToPositiveInf)
+	if err := gob.NewEncoder(&buf).Encode(tx); err != nil {
+		t.Fatal(err)
+	}
+	b := buf.Bytes()
+
+	var rx Float
+	if err := gob.NewDecoder(bytes.NewReader(b)).Decode(&rx); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := gob.NewDecoder(bytes.NewReader(b[:10])).Decode(&rx); err != io.ErrUnexpectedEOF {
+		t.Errorf("got %v want EOF", err)
+	}
+
+	b[1] = 0
+	if err := gob.NewDecoder(bytes.NewReader(b)).Decode(&rx); err == nil {
+		t.Fatal("got nil want version error")
+	}
+}
+
 func TestFloatJSONEncoding(t *testing.T) {
 	for _, test := range floatVals {
 		for _, sign := range []string{"", "+", "-"} {
--- a/src/cmd/compile/internal/big/ftoa.go
+++ b/src/cmd/compile/internal/big/ftoa.go
@@ -41,8 +41,11 @@ import (
 // x.Prec() mantissa bits.
 // The prec value is ignored for the 'b' or 'p' format.
 func (x *Float) Text(format byte, prec int) string {
-	const extra = 10 // TODO(gri) determine a good/better value here
-	return string(x.Append(make([]byte, 0, prec+extra), format, prec))
+	cap := 10 // TODO(gri) determine a good/better value here
+	if prec > 0 {
+		cap += prec
+	}
+	return string(x.Append(make([]byte, 0, cap), format, prec))
 }

 // String formats x like x.Text('g', 10).
--- a/src/cmd/compile/internal/big/gcd_test.go
+++ b/src/cmd/compile/internal/big/gcd_test.go
@@ -20,13 +20,27 @@ func randInt(r *rand.Rand, size uint) *Int {
 }

 func runGCD(b *testing.B, aSize, bSize uint) {
+	b.Run("WithoutXY", func(b *testing.B) {
+		runGCDExt(b, aSize, bSize, false)
+	})
+	b.Run("WithXY", func(b *testing.B) {
+		runGCDExt(b, aSize, bSize, true)
+	})
+}
+
+func runGCDExt(b *testing.B, aSize, bSize uint, calcXY bool) {
 	b.StopTimer()
 	var r = rand.New(rand.NewSource(1234))
 	aa := randInt(r, aSize)
 	bb := randInt(r, bSize)
+	var x, y *Int
+	if calcXY {
+		x = new(Int)
+		y = new(Int)
+	}
 	b.StartTimer()
 	for i := 0; i < b.N; i++ {
-		new(Int).GCD(nil, nil, aa, bb)
+		new(Int).GCD(x, y, aa, bb)
 	}
 }

--- a/src/cmd/compile/internal/big/int.go
+++ b/src/cmd/compile/internal/big/int.go
@@ -459,11 +459,11 @@ func (z *Int) GCD(x, y, a, b *Int) *Int {
 	q := new(Int)
 	temp := new(Int)

+	r := new(Int)
 	for len(B.abs) > 0 {
-		r := new(Int)
 		q, r = q.QuoRem(A, B, r)

-		A, B = B, r
+		A, B, r = B, r, A

 		temp.Set(X)
 		X.Mul(X, q)
--- a/src/cmd/compile/internal/big/nat.go
+++ b/src/cmd/compile/internal/big/nat.go
@@ -8,7 +8,10 @@

 package big

-import "math/rand"
+import (
+	"math/rand"
+	"sync"
+)

 // An unsigned integer x of the form
 //
@@ -539,6 +542,21 @@ func (z nat) div(z2, u, v nat) (q, r nat) {
 	return
 }

+// getNat returns a nat of len n. The contents may not be zero.
+func getNat(n int) nat {
+	var z nat
+	if v := natPool.Get(); v != nil {
+		z = v.(nat)
+	}
+	return z.make(n)
+}
+
+func putNat(x nat) {
+	natPool.Put(x)
+}
+
+var natPool sync.Pool
+
 // q = (uIn-r)/v, with 0 <= r < y
 // Uses z as storage for q, and u as storage for r if possible.
 // See Knuth, Volume 2, section 4.3.1, Algorithm D.
@@ -557,7 +575,7 @@ func (z nat) divLarge(u, uIn, v nat) (q, r nat) {
 	}
 	q = z.make(m + 1)

-	qhatv := make(nat, n+1)
+	qhatv := getNat(n + 1)
 	if alias(u, uIn) || alias(u, v) {
 		u = nil // u is an alias for uIn or v - cannot reuse
 	}
@@ -565,10 +583,11 @@ func (z nat) divLarge(u, uIn, v nat) (q, r nat) {
 	u.clear() // TODO(gri) no need to clear if we allocated a new u

 	// D1.
+	var v1 nat
 	shift := nlz(v[n-1])
 	if shift > 0 {
 		// do not modify v, it may be used by another goroutine simultaneously
-		v1 := make(nat, n)
+		v1 = getNat(n)
 		shlVU(v1, v, shift)
 		v = v1
 	}
@@ -609,6 +628,10 @@ func (z nat) divLarge(u, uIn, v nat) (q, r nat) {

 		q[j] = qhat
 	}
+	if v1 != nil {
+		putNat(v1)
+	}
+	putNat(qhatv)

 	q = q.norm()
 	shrVU(u, u, shift)
--- a/src/cmd/compile/internal/big/nat_test.go
+++ b/src/cmd/compile/internal/big/nat_test.go
@@ -5,6 +5,7 @@
 package big

 import (
+	"fmt"
 	"runtime"
 	"strings"
 	"testing"
@@ -509,24 +510,20 @@ func TestExpNN(t *testing.T) {
 	}
 }

-func ExpHelper(b *testing.B, x, y Word) {
-	var z nat
-	for i := 0; i < b.N; i++ {
-		z.expWW(x, y)
+func BenchmarkExp3Power(b *testing.B) {
+	const x = 3
+	for _, y := range []Word{
+		0x10, 0x40, 0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x40000, 0x100000, 0x400000,
+	} {
+		b.Run(fmt.Sprintf("%#x", y), func(b *testing.B) {
+			var z nat
+			for i := 0; i < b.N; i++ {
+				z.expWW(x, y)
+			}
+		})
 	}
 }

-func BenchmarkExp3Power0x10(b *testing.B)     { ExpHelper(b, 3, 0x10) }
-func BenchmarkExp3Power0x40(b *testing.B)     { ExpHelper(b, 3, 0x40) }
-func BenchmarkExp3Power0x100(b *testing.B)    { ExpHelper(b, 3, 0x100) }
-func BenchmarkExp3Power0x400(b *testing.B)    { ExpHelper(b, 3, 0x400) }
-func BenchmarkExp3Power0x1000(b *testing.B)   { ExpHelper(b, 3, 0x1000) }
-func BenchmarkExp3Power0x4000(b *testing.B)   { ExpHelper(b, 3, 0x4000) }
-func BenchmarkExp3Power0x10000(b *testing.B)  { ExpHelper(b, 3, 0x10000) }
-func BenchmarkExp3Power0x40000(b *testing.B)  { ExpHelper(b, 3, 0x40000) }
-func BenchmarkExp3Power0x100000(b *testing.B) { ExpHelper(b, 3, 0x100000) }
-func BenchmarkExp3Power0x400000(b *testing.B) { ExpHelper(b, 3, 0x400000) }
-
 func fibo(n int) nat {
 	switch n {
 	case 0:
--- a/src/cmd/compile/internal/big/natconv.go
+++ b/src/cmd/compile/internal/big/natconv.go
@@ -302,7 +302,7 @@ func (x nat) itoa(neg bool, base int) []byte {
 		}

 	} else {
-		bb, ndigits := maxPow(Word(b))
+		bb, ndigits := maxPow(b)

 		// construct table of successive squares of bb*leafSize to use in subdivisions
 		// result (table != nil) <=> (len(x) > leafSize > 0)
@@ -391,7 +391,7 @@ func (q nat) convertWords(s []byte, b Word, ndigits int, bb Word, table []diviso
 				// this appears to be faster for BenchmarkString10000Base10
 				// and smaller strings (but a bit slower for larger ones)
 				t := r / 10
-				s[i] = '0' + byte(r-t<<3-t-t) // TODO(gri) replace w/ t*10 once compiler produces better code
+				s[i] = '0' + byte(r-t*10)
 				r = t
 			}
 		}
--- a/src/cmd/compile/internal/big/natconv_test.go
+++ b/src/cmd/compile/internal/big/natconv_test.go
@@ -6,6 +6,7 @@ package big

 import (
 	"bytes"
+	"fmt"
 	"io"
 	"strings"
 	"testing"
@@ -273,101 +274,57 @@ func BenchmarkStringPiParallel(b *testing.B) {
 	})
 }

-func BenchmarkScan10Base2(b *testing.B)     { ScanHelper(b, 2, 10, 10) }
-func BenchmarkScan100Base2(b *testing.B)    { ScanHelper(b, 2, 10, 100) }
-func BenchmarkScan1000Base2(b *testing.B)   { ScanHelper(b, 2, 10, 1000) }
-func BenchmarkScan10000Base2(b *testing.B)  { ScanHelper(b, 2, 10, 10000) }
-func BenchmarkScan100000Base2(b *testing.B) { ScanHelper(b, 2, 10, 100000) }
+func BenchmarkScan(b *testing.B) {
+	const x = 10
+	for _, base := range []int{2, 8, 10, 16} {
+		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
+			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
+				b.StopTimer()
+				var z nat
+				z = z.expWW(x, y)

-func BenchmarkScan10Base8(b *testing.B)     { ScanHelper(b, 8, 10, 10) }
-func BenchmarkScan100Base8(b *testing.B)    { ScanHelper(b, 8, 10, 100) }
-func BenchmarkScan1000Base8(b *testing.B)   { ScanHelper(b, 8, 10, 1000) }
-func BenchmarkScan10000Base8(b *testing.B)  { ScanHelper(b, 8, 10, 10000) }
-func BenchmarkScan100000Base8(b *testing.B) { ScanHelper(b, 8, 10, 100000) }
+				s := z.utoa(base)
+				if t := itoa(z, base); !bytes.Equal(s, t) {
+					b.Fatalf("scanning: got %s; want %s", s, t)
+				}
+				b.StartTimer()

-func BenchmarkScan10Base10(b *testing.B)     { ScanHelper(b, 10, 10, 10) }
-func BenchmarkScan100Base10(b *testing.B)    { ScanHelper(b, 10, 10, 100) }
-func BenchmarkScan1000Base10(b *testing.B)   { ScanHelper(b, 10, 10, 1000) }
-func BenchmarkScan10000Base10(b *testing.B)  { ScanHelper(b, 10, 10, 10000) }
-func BenchmarkScan100000Base10(b *testing.B) { ScanHelper(b, 10, 10, 100000) }
-
-func BenchmarkScan10Base16(b *testing.B)     { ScanHelper(b, 16, 10, 10) }
-func BenchmarkScan100Base16(b *testing.B)    { ScanHelper(b, 16, 10, 100) }
-func BenchmarkScan1000Base16(b *testing.B)   { ScanHelper(b, 16, 10, 1000) }
-func BenchmarkScan10000Base16(b *testing.B)  { ScanHelper(b, 16, 10, 10000) }
-func BenchmarkScan100000Base16(b *testing.B) { ScanHelper(b, 16, 10, 100000) }
-
-func ScanHelper(b *testing.B, base int, x, y Word) {
-	b.StopTimer()
-	var z nat
-	z = z.expWW(x, y)
-
-	s := z.utoa(base)
-	if t := itoa(z, base); !bytes.Equal(s, t) {
-		b.Fatalf("scanning: got %s; want %s", s, t)
-	}
-	b.StartTimer()
-
-	for i := 0; i < b.N; i++ {
-		z.scan(bytes.NewReader(s), base, false)
+				for i := 0; i < b.N; i++ {
+					z.scan(bytes.NewReader(s), base, false)
+				}
+			})
+		}
 	}
 }

-func BenchmarkString10Base2(b *testing.B)     { StringHelper(b, 2, 10, 10) }
-func BenchmarkString100Base2(b *testing.B)    { StringHelper(b, 2, 10, 100) }
-func BenchmarkString1000Base2(b *testing.B)   { StringHelper(b, 2, 10, 1000) }
-func BenchmarkString10000Base2(b *testing.B)  { StringHelper(b, 2, 10, 10000) }
-func BenchmarkString100000Base2(b *testing.B) { StringHelper(b, 2, 10, 100000) }
+func BenchmarkString(b *testing.B) {
+	const x = 10
+	for _, base := range []int{2, 8, 10, 16} {
+		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
+			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
+				b.StopTimer()
+				var z nat
+				z = z.expWW(x, y)
+				z.utoa(base) // warm divisor cache
+				b.StartTimer()

-func BenchmarkString10Base8(b *testing.B)     { StringHelper(b, 8, 10, 10) }
-func BenchmarkString100Base8(b *testing.B)    { StringHelper(b, 8, 10, 100) }
-func BenchmarkString1000Base8(b *testing.B)   { StringHelper(b, 8, 10, 1000) }
-func BenchmarkString10000Base8(b *testing.B)  { StringHelper(b, 8, 10, 10000) }
-func BenchmarkString100000Base8(b *testing.B) { StringHelper(b, 8, 10, 100000) }
-
-func BenchmarkString10Base10(b *testing.B)     { StringHelper(b, 10, 10, 10) }
-func BenchmarkString100Base10(b *testing.B)    { StringHelper(b, 10, 10, 100) }
-func BenchmarkString1000Base10(b *testing.B)   { StringHelper(b, 10, 10, 1000) }
-func BenchmarkString10000Base10(b *testing.B)  { StringHelper(b, 10, 10, 10000) }
-func BenchmarkString100000Base10(b *testing.B) { StringHelper(b, 10, 10, 100000) }
-
-func BenchmarkString10Base16(b *testing.B)     { StringHelper(b, 16, 10, 10) }
-func BenchmarkString100Base16(b *testing.B)    { StringHelper(b, 16, 10, 100) }
-func BenchmarkString1000Base16(b *testing.B)   { StringHelper(b, 16, 10, 1000) }
-func BenchmarkString10000Base16(b *testing.B)  { StringHelper(b, 16, 10, 10000) }
-func BenchmarkString100000Base16(b *testing.B) { StringHelper(b, 16, 10, 100000) }
-
-func StringHelper(b *testing.B, base int, x, y Word) {
-	b.StopTimer()
-	var z nat
-	z = z.expWW(x, y)
-	z.utoa(base) // warm divisor cache
-	b.StartTimer()
-
-	for i := 0; i < b.N; i++ {
-		_ = z.utoa(base)
+				for i := 0; i < b.N; i++ {
+					_ = z.utoa(base)
+				}
+			})
+		}
 	}
 }

-func BenchmarkLeafSize0(b *testing.B)  { LeafSizeHelper(b, 10, 0) } // test without splitting
-func BenchmarkLeafSize1(b *testing.B)  { LeafSizeHelper(b, 10, 1) }
-func BenchmarkLeafSize2(b *testing.B)  { LeafSizeHelper(b, 10, 2) }
-func BenchmarkLeafSize3(b *testing.B)  { LeafSizeHelper(b, 10, 3) }
-func BenchmarkLeafSize4(b *testing.B)  { LeafSizeHelper(b, 10, 4) }
-func BenchmarkLeafSize5(b *testing.B)  { LeafSizeHelper(b, 10, 5) }
-func BenchmarkLeafSize6(b *testing.B)  { LeafSizeHelper(b, 10, 6) }
-func BenchmarkLeafSize7(b *testing.B)  { LeafSizeHelper(b, 10, 7) }
-func BenchmarkLeafSize8(b *testing.B)  { LeafSizeHelper(b, 10, 8) }
-func BenchmarkLeafSize9(b *testing.B)  { LeafSizeHelper(b, 10, 9) }
-func BenchmarkLeafSize10(b *testing.B) { LeafSizeHelper(b, 10, 10) }
-func BenchmarkLeafSize11(b *testing.B) { LeafSizeHelper(b, 10, 11) }
-func BenchmarkLeafSize12(b *testing.B) { LeafSizeHelper(b, 10, 12) }
-func BenchmarkLeafSize13(b *testing.B) { LeafSizeHelper(b, 10, 13) }
-func BenchmarkLeafSize14(b *testing.B) { LeafSizeHelper(b, 10, 14) }
-func BenchmarkLeafSize15(b *testing.B) { LeafSizeHelper(b, 10, 15) }
-func BenchmarkLeafSize16(b *testing.B) { LeafSizeHelper(b, 10, 16) }
-func BenchmarkLeafSize32(b *testing.B) { LeafSizeHelper(b, 10, 32) } // try some large lengths
-func BenchmarkLeafSize64(b *testing.B) { LeafSizeHelper(b, 10, 64) }
+func BenchmarkLeafSize(b *testing.B) {
+	for n := 0; n <= 16; n++ {
+		b.Run(fmt.Sprint(n), func(b *testing.B) { LeafSizeHelper(b, 10, n) })
+	}
+	// Try some large lengths
+	for _, n := range []int{32, 64} {
+		b.Run(fmt.Sprint(n), func(b *testing.B) { LeafSizeHelper(b, 10, n) })
+	}
+}

 func LeafSizeHelper(b *testing.B, base, size int) {
 	b.StopTimer()
--- a/src/cmd/compile/internal/big/ratconv.go
+++ b/src/cmd/compile/internal/big/ratconv.go
@@ -88,6 +88,12 @@ func (z *Rat) SetString(s string) (*Rat, bool) {
 		return nil, false
 	}

+	// special-case 0 (see also issue #16176)
+	if len(z.a.abs) == 0 {
+		return z, true
+	}
+	// len(z.a.abs) > 0
+
 	// correct exponent
 	if ecorr < 0 {
 		exp += int64(ecorr)
@@ -178,7 +184,7 @@ func scanExponent(r io.ByteScanner, binExpOk bool) (exp int64, base int, err err
 			}
 			break // i > 0
 		}
-		digits = append(digits, byte(ch))
+		digits = append(digits, ch)
 	}
 	// i > 0 => we have at least one digit

--- a/src/cmd/compile/internal/big/ratconv_test.go
+++ b/src/cmd/compile/internal/big/ratconv_test.go
@@ -48,6 +48,7 @@ var setStringTests = []StringTest{
 	{"53/70893980658822810696", "53/70893980658822810696", true},
 	{"106/141787961317645621392", "53/70893980658822810696", true},
 	{"204211327800791583.81095", "4084226556015831676219/20000", true},
+	{"0e9999999999", "0", true}, // issue #16176
 	{in: "1/0"},
 }

--- a/src/cmd/compile/internal/gc/bexport.go
+++ b/src/cmd/compile/internal/gc/bexport.go
@@ -197,6 +197,9 @@ type exporter struct {
 	written int // bytes written
 	indent  int // for p.trace
 	trace   bool
+
+	// work-around for issue #16369 only
+	nesting int // amount of "nesting" of interface types
 }

 // export writes the exportlist for localpkg to out and returns the number of bytes written.
@@ -790,11 +793,39 @@ func (p *exporter) typ(t *Type) {

 	case TINTER:
 		p.tag(interfaceTag)
-
 		// gc doesn't separate between embedded interfaces
 		// and methods declared explicitly with an interface
 		p.int(0) // no embedded interfaces
+
+		// Because the compiler flattens interfaces containing
+		// embedded interfaces, it is possible to create interface
+		// types that recur through an unnamed type.
+		// If trackAllTypes is disabled, such recursion is not
+		// detected, leading to a stack overflow during export
+		// (issue #16369).
+		// As a crude work-around we terminate deep recursion
+		// through interface types with an empty interface and
+		// report an error.
+		// This will catch endless recursion, but is unlikely
+		// to trigger for valid, deeply nested types given the
+		// high threshold.
+		// It would be ok to continue without reporting an error
+		// since the export format is valid. But a subsequent
+		// import would import an incorrect type. The textual
+		// exporter does not report an error but importing the
+		// resulting package will lead to a syntax error during
+		// import.
+		// TODO(gri) remove this once we have a permanent fix
+		// for the issue.
+		if p.nesting > 100 {
+			p.int(0) // 0 methods to indicate empty interface
+			yyerrorl(t.Lineno, "cannot export unnamed recursive interface")
+			break
+		}
+
+		p.nesting++
 		p.methodList(t)
+		p.nesting--

 	case TMAP:
 		p.tag(mapTag)
--- a/src/cmd/compile/internal/gc/builtin.go
+++ b/src/cmd/compile/internal/gc/builtin.go
@@ -95,14 +95,15 @@ const runtimeimport = "" +
 	"4div\x00\x03\n\x00\n\x00\x01\n\x00\t\x11uint64div\x00\x03\x14\x00\x14\x00\x01\x14\x00\t\x0fint64" +
 	"mod\x00\x03\n\x00\n\x00\x01\n\x00\t\x11uint64mod\x00\x03\x14\x00\x14\x00\x01\x14\x00\t\x1bfloat6" +
 	"4toint64\x00\x01\x1a\x00\x01\n\x00\t\x1dfloat64touint64\x00\x01\x1a\x00\x01\x14\x00\t" +
-	"\x1bint64tofloat64\x00\x01\n\x00\x01\x1a\x00\t\x1duint64tofloat64\x00" +
-	"\x01\x14\x00\x01\x1a\x00\t\x19complex128div\x00\x04\x1e\vnum·2\x00\x00\x1e\vden·" +
-	"3\x00\x00\x02\x1e\vquo·1\x00\x00\t\x19racefuncenter\x00\x01\x16d\x00\t\x17race" +
-	"funcexit\x00\x00\x00\t\x0fraceread\x00\x01\x16d\x00\t\x11racewrite\x00\x01\x16" +
-	"d\x00\t\x19racereadrange\x00\x04\x16\raddr·1\x00d\x16\rsize·2\x00" +
-	"d\x00\t\x1bracewriterange\x00\x04\x16\x94\x03\x00d\x16\x96\x03\x00d\x00\t\x0fmsanrea" +
-	"d\x00\x04\x16\x94\x03\x00d\x16\x96\x03\x00d\x00\t\x11msanwrite\x00\x04\x16\x94\x03\x00d\x16\x96\x03\x00d\x00\v\xf4" +
-	"\x01\x02\v\x00\x01\x00\n$$\n"
+	"\x1dfloat64touint32\x00\x01\x1a\x00\x01\x12\x00\t\x1bint64tofloat64\x00" +
+	"\x01\n\x00\x01\x1a\x00\t\x1duint64tofloat64\x00\x01\x14\x00\x01\x1a\x00\t\x1duint32to" +
+	"float64\x00\x01\x12\x00\x01\x1a\x00\t\x19complex128div\x00\x04\x1e\vnum·2\x00" +
+	"\x00\x1e\vden·3\x00\x00\x02\x1e\vquo·1\x00\x00\t\x19racefuncenter\x00\x01\x16" +
+	"d\x00\t\x17racefuncexit\x00\x00\x00\t\x0fraceread\x00\x01\x16d\x00\t\x11race" +
+	"write\x00\x01\x16d\x00\t\x19racereadrange\x00\x04\x16\raddr·1\x00d\x16\r" +
+	"size·2\x00d\x00\t\x1bracewriterange\x00\x04\x16\x98\x03\x00d\x16\x9a\x03\x00d\x00\t" +
+	"\x0fmsanread\x00\x04\x16\x98\x03\x00d\x16\x9a\x03\x00d\x00\t\x11msanwrite\x00\x04\x16\x98\x03\x00d" +
+	"\x16\x9a\x03\x00d\x00\v\xf8\x01\x02\v\x00\x01\x00\n$$\n"

 const unsafeimport = "" +
 	"cn\x00\x03v1\x01\vunsafe\x00\x05\r\rPointer\x00\x16\x00\t\x0fOffsetof\x00\x01" +
--- a/src/cmd/compile/internal/gc/builtin/runtime.go
+++ b/src/cmd/compile/internal/gc/builtin/runtime.go
@@ -150,8 +150,10 @@ func int64mod(int64, int64) int64
 func uint64mod(uint64, uint64) uint64
 func float64toint64(float64) int64
 func float64touint64(float64) uint64
+func float64touint32(float64) uint32
 func int64tofloat64(int64) float64
 func uint64tofloat64(uint64) float64
+func uint32tofloat64(uint32) float64

 func complex128div(num complex128, den complex128) (quo complex128)

--- a/src/cmd/compile/internal/gc/const.go
+++ b/src/cmd/compile/internal/gc/const.go
@@ -61,6 +61,34 @@ func (v Val) Ctype() Ctype {
 	}
 }

+func eqval(a, b Val) bool {
+	if a.Ctype() != b.Ctype() {
+		return false
+	}
+	switch x := a.U.(type) {
+	default:
+		Fatalf("unexpected Ctype for %T", a.U)
+		panic("not reached")
+	case *NilVal:
+		return true
+	case bool:
+		y := b.U.(bool)
+		return x == y
+	case *Mpint:
+		y := b.U.(*Mpint)
+		return x.Cmp(y) == 0
+	case *Mpflt:
+		y := b.U.(*Mpflt)
+		return x.Cmp(y) == 0
+	case *Mpcplx:
+		y := b.U.(*Mpcplx)
+		return x.Real.Cmp(&y.Real) == 0 && x.Imag.Cmp(&y.Imag) == 0
+	case string:
+		y := b.U.(string)
+		return x == y
+	}
+}
+
 type NilVal struct{}

 // IntLiteral returns the Node's literal value as an integer.
--- a/src/cmd/compile/internal/gc/constFold_test.go
+++ b/src/cmd/compile/internal/gc/constFold_test.go
--- a/src/cmd/compile/internal/gc/float_test.go
+++ b/src/cmd/compile/internal/gc/float_test.go
@@ -74,6 +74,27 @@ func cvt8(a float32) int32 {
 	return int32(a)
 }

+// make sure to cover int, uint cases (issue #16738)
+//go:noinline
+func cvt9(a float64) int {
+	return int(a)
+}
+
+//go:noinline
+func cvt10(a float64) uint {
+	return uint(a)
+}
+
+//go:noinline
+func cvt11(a float32) int {
+	return int(a)
+}
+
+//go:noinline
+func cvt12(a float32) uint {
+	return uint(a)
+}
+
 func TestFloatConvert(t *testing.T) {
 	if got := cvt1(3.5); got != 3 {
 		t.Errorf("cvt1 got %d, wanted 3", got)
@@ -99,4 +120,16 @@ func TestFloatConvert(t *testing.T) {
 	if got := cvt8(3.5); got != 3 {
 		t.Errorf("cvt8 got %d, wanted 3", got)
 	}
+	if got := cvt9(3.5); got != 3 {
+		t.Errorf("cvt9 got %d, wanted 3", got)
+	}
+	if got := cvt10(3.5); got != 3 {
+		t.Errorf("cvt10 got %d, wanted 3", got)
+	}
+	if got := cvt11(3.5); got != 3 {
+		t.Errorf("cvt11 got %d, wanted 3", got)
+	}
+	if got := cvt12(3.5); got != 3 {
+		t.Errorf("cvt12 got %d, wanted 3", got)
+	}
 }
--- a/src/cmd/compile/internal/gc/go.go
+++ b/src/cmd/compile/internal/gc/go.go
@@ -156,8 +156,6 @@ var Debug_typeassert int

 var localpkg *Pkg // package being compiled

-var autopkg *Pkg // fake package for allocating auto variables
-
 var importpkg *Pkg // package being imported

 var itabpkg *Pkg // fake pkg for itab entries
--- a/src/cmd/compile/internal/gc/inl.go
+++ b/src/cmd/compile/internal/gc/inl.go
@@ -766,7 +766,9 @@ func mkinlcall1(n *Node, fn *Node, isddd bool) *Node {
 		ninit.Append(as)
 	}

-	retlabel := newlabel_inl()
+	retlabel := autolabel(".i")
+	retlabel.Etype = 1 // flag 'safe' for escape analysis (no backjumps)
+
 	inlgen++

 	subst := inlsubst{
@@ -876,15 +878,6 @@ func argvar(t *Type, i int) *Node {
 	return n
 }

-var newlabel_inl_label int
-
-func newlabel_inl() *Node {
-	newlabel_inl_label++
-	n := newname(LookupN(".inlret", newlabel_inl_label))
-	n.Etype = 1 // flag 'safe' for escape analysis (no backjumps)
-	return n
-}
-
 // The inlsubst type implements the actual inlining of a single
 // function call.
 type inlsubst struct {
--- a/src/cmd/compile/internal/gc/main.go
+++ b/src/cmd/compile/internal/gc/main.go
@@ -29,6 +29,8 @@ var (
 	goarch  string
 	goroot  string
 	buildid string
+
+	flag_newparser bool
 )

 var (
@@ -108,8 +110,6 @@ func Main() {

 	localpkg = mkpkg("")
 	localpkg.Prefix = "\"\""
-	autopkg = mkpkg("")
-	autopkg.Prefix = "\"\""

 	// pseudo-package, for scoping
 	builtinpkg = mkpkg("go.builtin")
@@ -184,6 +184,7 @@ func Main() {
 	obj.Flagcount("live", "debug liveness analysis", &debuglive)
 	obj.Flagcount("m", "print optimization decisions", &Debug['m'])
 	flag.BoolVar(&flag_msan, "msan", false, "build code compatible with C/C++ memory sanitizer")
+	flag.BoolVar(&flag_newparser, "newparser", false, "use new parser")
 	flag.BoolVar(&newexport, "newexport", true, "use new export format") // TODO(gri) remove eventually (issue 15323)
 	flag.BoolVar(&nolocalimports, "nolocalimports", false, "reject local (relative) imports")
 	flag.StringVar(&outfile, "o", "", "write output to `file`")
@@ -313,25 +314,14 @@ func Main() {
 		}

 		linehistpush(infile)
-
-		f, err := os.Open(infile)
-		if err != nil {
-			fmt.Printf("open %s: %v\n", infile, err)
-			errorexit()
-		}
-		bin := bufio.NewReader(f)
-
-		// Skip initial BOM if present.
-		if r, _, _ := bin.ReadRune(); r != BOM {
-			bin.UnreadRune()
-		}
-
 		block = 1
 		iota_ = -1000000
-
 		imported_unsafe = false
-
-		parse_file(bin)
+		if flag_newparser {
+			parseFile(infile)
+		} else {
+			oldParseFile(infile)
+		}
 		if nsyntaxerrors != 0 {
 			errorexit()
 		}
@@ -340,9 +330,7 @@ func Main() {
 		// for the line history to work, and which then has to be corrected elsewhere,
 		// just add a line here.
 		lexlineno++
-
 		linehistpop()
-		f.Close()
 	}

 	testdclstack()
--- a/src/cmd/compile/internal/gc/noder.go
+++ b/src/cmd/compile/internal/gc/noder.go
--- a/src/cmd/compile/internal/gc/parser.go
+++ b/src/cmd/compile/internal/gc/parser.go
@@ -15,6 +15,7 @@ package gc
 import (
 	"bufio"
 	"fmt"
+	"os"
 	"strconv"
 	"strings"
 )
@@ -26,8 +27,20 @@ func parse_import(bin *bufio.Reader, indent []byte) {
 	newparser(bin, indent).import_package()
 }

-// parse_file parses a single Go source file.
-func parse_file(bin *bufio.Reader) {
+// oldParseFile parses a single Go source file.
+func oldParseFile(infile string) {
+	f, err := os.Open(infile)
+	if err != nil {
+		fmt.Printf("open %s: %v\n", infile, err)
+		errorexit()
+	}
+	defer f.Close()
+	bin := bufio.NewReader(f)
+
+	// Skip initial BOM if present.
+	if r, _, _ := bin.ReadRune(); r != BOM {
+		bin.UnreadRune()
+	}
 	newparser(bin, nil).file()
 }

--- a/src/cmd/compile/internal/gc/sinit.go
+++ b/src/cmd/compile/internal/gc/sinit.go
@@ -866,7 +866,7 @@ func maplit(ctxt int, n *Node, var_ *Node, init *Nodes) {
 	nerr := nerrors

 	a := Nod(OMAKE, nil, nil)
-	a.List.Set1(typenod(n.Type))
+	a.List.Set2(typenod(n.Type), Nodintconst(int64(len(n.List.Slice()))))
 	litas(var_, a, init)

 	// count the initializers
--- a/src/cmd/compile/internal/gc/sizeof_test.go
+++ b/src/cmd/compile/internal/gc/sizeof_test.go
@@ -23,7 +23,7 @@ func TestSizeof(t *testing.T) {
 		_64bit uintptr     // size on 64bit platforms
 	}{
 		{Flow{}, 52, 88},
-		{Func{}, 92, 160},
+		{Func{}, 96, 168},
 		{Name{}, 52, 80},
 		{Node{}, 92, 144},
 		{Sym{}, 60, 112},
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -26,6 +26,9 @@ func initssa() *ssa.Config {
 	ssaExp.mustImplement = true
 	if ssaConfig == nil {
 		ssaConfig = ssa.NewConfig(Thearch.LinkArch.Name, &ssaExp, Ctxt, Debug['N'] == 0)
+		if Thearch.LinkArch.Name == "386" {
+			ssaConfig.Set387(Thearch.Use387)
+		}
 	}
 	return ssaConfig
 }
@@ -37,8 +40,8 @@ func shouldssa(fn *Node) bool {
 		if os.Getenv("SSATEST") == "" {
 			return false
 		}
+	case "amd64", "amd64p32", "arm", "386", "arm64":
 		// Generally available.
-	case "amd64":
 	}
 	if !ssaEnabled {
 		return false
@@ -1146,6 +1149,7 @@ var opToSSA = map[opAndType]ssa.Op{
 	opAndType{OEQ, TFUNC}:      ssa.OpEqPtr,
 	opAndType{OEQ, TMAP}:       ssa.OpEqPtr,
 	opAndType{OEQ, TCHAN}:      ssa.OpEqPtr,
+	opAndType{OEQ, TPTR32}:     ssa.OpEqPtr,
 	opAndType{OEQ, TPTR64}:     ssa.OpEqPtr,
 	opAndType{OEQ, TUINTPTR}:   ssa.OpEqPtr,
 	opAndType{OEQ, TUNSAFEPTR}: ssa.OpEqPtr,
@@ -1166,6 +1170,7 @@ var opToSSA = map[opAndType]ssa.Op{
 	opAndType{ONE, TFUNC}:      ssa.OpNeqPtr,
 	opAndType{ONE, TMAP}:       ssa.OpNeqPtr,
 	opAndType{ONE, TCHAN}:      ssa.OpNeqPtr,
+	opAndType{ONE, TPTR32}:     ssa.OpNeqPtr,
 	opAndType{ONE, TPTR64}:     ssa.OpNeqPtr,
 	opAndType{ONE, TUINTPTR}:   ssa.OpNeqPtr,
 	opAndType{ONE, TUNSAFEPTR}: ssa.OpNeqPtr,
@@ -1330,6 +1335,15 @@ var fpConvOpToSSA = map[twoTypes]twoOpsAndType{
 	twoTypes{TFLOAT32, TFLOAT64}: twoOpsAndType{ssa.OpCvt32Fto64F, ssa.OpCopy, TFLOAT64},
 }

+// this map is used only for 32-bit arch, and only includes the difference
+// on 32-bit arch, don't use int64<->float conversion for uint32
+var fpConvOpToSSA32 = map[twoTypes]twoOpsAndType{
+	twoTypes{TUINT32, TFLOAT32}: twoOpsAndType{ssa.OpCopy, ssa.OpCvt32Uto32F, TUINT32},
+	twoTypes{TUINT32, TFLOAT64}: twoOpsAndType{ssa.OpCopy, ssa.OpCvt32Uto64F, TUINT32},
+	twoTypes{TFLOAT32, TUINT32}: twoOpsAndType{ssa.OpCvt32Fto32U, ssa.OpCopy, TUINT32},
+	twoTypes{TFLOAT64, TUINT32}: twoOpsAndType{ssa.OpCvt64Fto32U, ssa.OpCopy, TUINT32},
+}
+
 var shiftOpToSSA = map[opAndTwoTypes]ssa.Op{
 	opAndTwoTypes{OLSH, TINT8, TUINT8}:   ssa.OpLsh8x8,
 	opAndTwoTypes{OLSH, TUINT8, TUINT8}:  ssa.OpLsh8x8,
@@ -1646,6 +1660,11 @@ func (s *state) expr(n *Node) *ssa.Value {

 		if ft.IsFloat() || tt.IsFloat() {
 			conv, ok := fpConvOpToSSA[twoTypes{s.concreteEtype(ft), s.concreteEtype(tt)}]
+			if s.config.IntSize == 4 && Thearch.LinkArch.Name != "amd64p32" {
+				if conv1, ok1 := fpConvOpToSSA32[twoTypes{s.concreteEtype(ft), s.concreteEtype(tt)}]; ok1 {
+					conv = conv1
+				}
+			}
 			if !ok {
 				s.Fatalf("weird float conversion %s -> %s", ft, tt)
 			}
@@ -1954,7 +1973,7 @@ func (s *state) expr(n *Node) *ssa.Value {
 		case n.Left.Type.IsString():
 			a := s.expr(n.Left)
 			i := s.expr(n.Right)
-			i = s.extendIndex(i)
+			i = s.extendIndex(i, Panicindex)
 			if !n.Bounded {
 				len := s.newValue1(ssa.OpStringLen, Types[TINT], a)
 				s.boundsCheck(i, len)
@@ -2043,13 +2062,13 @@ func (s *state) expr(n *Node) *ssa.Value {
 		var i, j, k *ssa.Value
 		low, high, max := n.SliceBounds()
 		if low != nil {
-			i = s.extendIndex(s.expr(low))
+			i = s.extendIndex(s.expr(low), panicslice)
 		}
 		if high != nil {
-			j = s.extendIndex(s.expr(high))
+			j = s.extendIndex(s.expr(high), panicslice)
 		}
 		if max != nil {
-			k = s.extendIndex(s.expr(max))
+			k = s.extendIndex(s.expr(max), panicslice)
 		}
 		p, l, c := s.slice(n.Left.Type, v, i, j, k)
 		return s.newValue3(ssa.OpSliceMake, n.Type, p, l, c)
@@ -2059,10 +2078,10 @@ func (s *state) expr(n *Node) *ssa.Value {
 		var i, j *ssa.Value
 		low, high, _ := n.SliceBounds()
 		if low != nil {
-			i = s.extendIndex(s.expr(low))
+			i = s.extendIndex(s.expr(low), panicslice)
 		}
 		if high != nil {
-			j = s.extendIndex(s.expr(high))
+			j = s.extendIndex(s.expr(high), panicslice)
 		}
 		p, l, _ := s.slice(n.Left.Type, v, i, j, nil)
 		return s.newValue2(ssa.OpStringMake, n.Type, p, l)
@@ -2246,7 +2265,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
 			if haspointers(et) {
 				s.insertWBmove(et, addr, arg.v, n.Lineno, arg.isVolatile)
 			} else {
-				s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, et.Size(), addr, arg.v, s.mem())
+				s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(et), addr, arg.v, s.mem())
 			}
 		}
 	}
@@ -2379,14 +2398,14 @@ func (s *state) assign(left *Node, right *ssa.Value, wb, deref bool, line int32,
 	if deref {
 		// Treat as a mem->mem move.
 		if right == nil {
-			s.vars[&memVar] = s.newValue2I(ssa.OpZero, ssa.TypeMem, t.Size(), addr, s.mem())
+			s.vars[&memVar] = s.newValue2I(ssa.OpZero, ssa.TypeMem, SizeAlignAuxInt(t), addr, s.mem())
 			return
 		}
 		if wb {
 			s.insertWBmove(t, addr, right, line, rightIsVolatile)
 			return
 		}
-		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, t.Size(), addr, right, s.mem())
+		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(t), addr, right, s.mem())
 		return
 	}
 	// Treat as a store.
@@ -2585,7 +2604,7 @@ func (s *state) call(n *Node, k callKind) *ssa.Value {
 			s.nilCheck(itab)
 		}
 		itabidx := fn.Xoffset + 3*int64(Widthptr) + 8 // offset of fun field in runtime.itab
-		itab = s.newValue1I(ssa.OpOffPtr, Types[TUINTPTR], itabidx, itab)
+		itab = s.newValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), itabidx, itab)
 		if k == callNormal {
 			codeptr = s.newValue2(ssa.OpLoad, Types[TUINTPTR], itab, s.mem())
 		} else {
@@ -2608,16 +2627,18 @@ func (s *state) call(n *Node, k callKind) *ssa.Value {
 		if k != callNormal {
 			argStart += int64(2 * Widthptr)
 		}
-		addr := s.entryNewValue1I(ssa.OpOffPtr, Types[TUINTPTR], argStart, s.sp)
+		addr := s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), argStart, s.sp)
 		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, int64(Widthptr), addr, rcvr, s.mem())
 	}

 	// Defer/go args
 	if k != callNormal {
 		// Write argsize and closure (args to Newproc/Deferproc).
+		argStart := Ctxt.FixedFrameSize()
 		argsize := s.constInt32(Types[TUINT32], int32(stksize))
-		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, 4, s.sp, argsize, s.mem())
-		addr := s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), int64(Widthptr), s.sp)
+		addr := s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINT32]), argStart, s.sp)
+		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, 4, addr, argsize, s.mem())
+		addr = s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), argStart+int64(Widthptr), s.sp)
 		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, int64(Widthptr), addr, closure, s.mem())
 		stksize += 2 * int64(Widthptr)
 	}
@@ -2762,7 +2783,7 @@ func (s *state) addr(n *Node, bounded bool) (*ssa.Value, bool) {
 		if n.Left.Type.IsSlice() {
 			a := s.expr(n.Left)
 			i := s.expr(n.Right)
-			i = s.extendIndex(i)
+			i = s.extendIndex(i, Panicindex)
 			len := s.newValue1(ssa.OpSliceLen, Types[TINT], a)
 			if !n.Bounded {
 				s.boundsCheck(i, len)
@@ -2772,7 +2793,7 @@ func (s *state) addr(n *Node, bounded bool) (*ssa.Value, bool) {
 		} else { // array
 			a, isVolatile := s.addr(n.Left, bounded)
 			i := s.expr(n.Right)
-			i = s.extendIndex(i)
+			i = s.extendIndex(i, Panicindex)
 			len := s.constInt(Types[TINT], n.Left.Type.NumElem())
 			if !n.Bounded {
 				s.boundsCheck(i, len)
@@ -2913,12 +2934,11 @@ func (s *state) nilCheck(ptr *ssa.Value) {

 // boundsCheck generates bounds checking code. Checks if 0 <= idx < len, branches to exit if not.
 // Starts a new block on return.
+// idx is already converted to full int width.
 func (s *state) boundsCheck(idx, len *ssa.Value) {
 	if Debug['B'] != 0 {
 		return
 	}
-	// TODO: convert index to full width?
-	// TODO: if index is 64-bit and we're compiling to 32-bit, check that high 32 bits are zero.

 	// bounds check
 	cmp := s.newValue2(ssa.OpIsInBounds, Types[TBOOL], idx, len)
@@ -2927,19 +2947,18 @@ func (s *state) boundsCheck(idx, len *ssa.Value) {

 // sliceBoundsCheck generates slice bounds checking code. Checks if 0 <= idx <= len, branches to exit if not.
 // Starts a new block on return.
+// idx and len are already converted to full int width.
 func (s *state) sliceBoundsCheck(idx, len *ssa.Value) {
 	if Debug['B'] != 0 {
 		return
 	}
-	// TODO: convert index to full width?
-	// TODO: if index is 64-bit and we're compiling to 32-bit, check that high 32 bits are zero.

 	// bounds check
 	cmp := s.newValue2(ssa.OpIsSliceInBounds, Types[TBOOL], idx, len)
 	s.check(cmp, panicslice)
 }

-// If cmp (a bool) is true, panic using the given function.
+// If cmp (a bool) is false, panic using the given function.
 func (s *state) check(cmp *ssa.Value, fn *Node) {
 	b := s.endBlock()
 	b.Kind = ssa.BlockIf
@@ -2969,19 +2988,23 @@ func (s *state) check(cmp *ssa.Value, fn *Node) {
 // is started to load the return values.
 func (s *state) rtcall(fn *Node, returns bool, results []*Type, args ...*ssa.Value) []*ssa.Value {
 	// Write args to the stack
-	var off int64 // TODO: arch-dependent starting offset?
+	off := Ctxt.FixedFrameSize()
 	for _, arg := range args {
 		t := arg.Type
 		off = Rnd(off, t.Alignment())
 		ptr := s.sp
 		if off != 0 {
-			ptr = s.newValue1I(ssa.OpOffPtr, Types[TUINTPTR], off, s.sp)
+			ptr = s.newValue1I(ssa.OpOffPtr, t.PtrTo(), off, s.sp)
 		}
 		size := t.Size()
 		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, size, ptr, arg, s.mem())
 		off += size
 	}
 	off = Rnd(off, int64(Widthptr))
+	if Thearch.LinkArch.Name == "amd64p32" {
+		// amd64p32 wants 8-byte alignment of the start of the return values.
+		off = Rnd(off, 8)
+	}

 	// Issue call
 	call := s.newValue1A(ssa.OpStaticCall, ssa.TypeMem, fn.Sym, s.mem())
@@ -2992,7 +3015,7 @@ func (s *state) rtcall(fn *Node, returns bool, results []*Type, args ...*ssa.Val
 	if !returns {
 		b.Kind = ssa.BlockExit
 		b.SetControl(call)
-		call.AuxInt = off
+		call.AuxInt = off - Ctxt.FixedFrameSize()
 		if len(results) > 0 {
 			Fatalf("panic call can't have results")
 		}
@@ -3015,7 +3038,7 @@ func (s *state) rtcall(fn *Node, returns bool, results []*Type, args ...*ssa.Val
 		off = Rnd(off, t.Alignment())
 		ptr := s.sp
 		if off != 0 {
-			ptr = s.newValue1I(ssa.OpOffPtr, Types[TUINTPTR], off, s.sp)
+			ptr = s.newValue1I(ssa.OpOffPtr, Ptrto(t), off, s.sp)
 		}
 		res[i] = s.newValue2(ssa.OpLoad, t, ptr, s.mem())
 		off += t.Size()
@@ -3049,10 +3072,9 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, line int32, rightI

 	aux := &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: syslook("writeBarrier").Sym}
 	flagaddr := s.newValue1A(ssa.OpAddr, Ptrto(Types[TUINT32]), aux, s.sb)
-	// TODO: select the .enabled field. It is currently first, so not needed for now.
-	// Load word, test byte, avoiding partial register write from load byte.
+	// Load word, test word, avoiding partial register write from load byte.
 	flag := s.newValue2(ssa.OpLoad, Types[TUINT32], flagaddr, s.mem())
-	flag = s.newValue1(ssa.OpTrunc64to8, Types[TBOOL], flag)
+	flag = s.newValue2(ssa.OpNeq32, Types[TBOOL], flag, s.constInt32(Types[TUINT32], 0))
 	b := s.endBlock()
 	b.Kind = ssa.BlockIf
 	b.Likely = ssa.BranchUnlikely
@@ -3073,7 +3095,7 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, line int32, rightI
 		tmp := temp(t)
 		s.vars[&memVar] = s.newValue1A(ssa.OpVarDef, ssa.TypeMem, tmp, s.mem())
 		tmpaddr, _ := s.addr(tmp, true)
-		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, t.Size(), tmpaddr, right, s.mem())
+		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(t), tmpaddr, right, s.mem())
 		// Issue typedmemmove call.
 		taddr := s.newValue1A(ssa.OpAddr, Types[TUINTPTR], &ssa.ExternSymbol{Typ: Types[TUINTPTR], Sym: typenamesym(t)}, s.sb)
 		s.rtcall(typedmemmove, true, nil, taddr, left, tmpaddr)
@@ -3083,7 +3105,7 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, line int32, rightI
 	s.endBlock().AddEdgeTo(bEnd)

 	s.startBlock(bElse)
-	s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, t.Size(), left, right, s.mem())
+	s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(t), left, right, s.mem())
 	s.endBlock().AddEdgeTo(bEnd)

 	s.startBlock(bEnd)
@@ -3117,10 +3139,9 @@ func (s *state) insertWBstore(t *Type, left, right *ssa.Value, line int32, skip

 	aux := &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: syslook("writeBarrier").Sym}
 	flagaddr := s.newValue1A(ssa.OpAddr, Ptrto(Types[TUINT32]), aux, s.sb)
-	// TODO: select the .enabled field. It is currently first, so not needed for now.
-	// Load word, test byte, avoiding partial register write from load byte.
+	// Load word, test word, avoiding partial register write from load byte.
 	flag := s.newValue2(ssa.OpLoad, Types[TUINT32], flagaddr, s.mem())
-	flag = s.newValue1(ssa.OpTrunc64to8, Types[TBOOL], flag)
+	flag = s.newValue2(ssa.OpNeq32, Types[TBOOL], flag, s.constInt32(Types[TUINT32], 0))
 	b := s.endBlock()
 	b.Kind = ssa.BlockIf
 	b.Likely = ssa.BranchUnlikely
@@ -3930,6 +3951,11 @@ type SSAGenState struct {

 	// bstart remembers where each block starts (indexed by block ID)
 	bstart []*obj.Prog
+
+	// 387 port: maps from SSE registers (REG_X?) to 387 registers (REG_F?)
+	SSEto387 map[int16]int16
+	// Some architectures require a 64-bit temporary for FP-related register shuffling. Examples include x86-387, PPC, and Sparc V8.
+	ScratchFpMem *Node
 }

 // Pc returns the current Prog.
@@ -3966,6 +3992,13 @@ func genssa(f *ssa.Func, ptxt *obj.Prog, gcargs, gclocals *Sym) {
 		blockProgs[Pc] = f.Blocks[0]
 	}

+	if Thearch.Use387 {
+		s.SSEto387 = map[int16]int16{}
+	}
+	if f.Config.NeedsFpScratch {
+		s.ScratchFpMem = temp(Types[TUINT64])
+	}
+
 	// Emit basic blocks
 	for i, b := range f.Blocks {
 		s.bstart[b.ID] = Pc
@@ -4128,12 +4161,25 @@ func SSAGenFPJump(s *SSAGenState, b, next *ssa.Block, jumps *[2][2]FloatingEQNEJ
 	}
 }

+func AuxOffset(v *ssa.Value) (offset int64) {
+	if v.Aux == nil {
+		return 0
+	}
+	switch sym := v.Aux.(type) {
+
+	case *ssa.AutoSymbol:
+		n := sym.Node.(*Node)
+		return n.Xoffset
+	}
+	return 0
+}
+
 // AddAux adds the offset in the aux fields (AuxInt and Aux) of v to a.
 func AddAux(a *obj.Addr, v *ssa.Value) {
 	AddAux2(a, v, v.AuxInt)
 }
 func AddAux2(a *obj.Addr, v *ssa.Value, offset int64) {
-	if a.Type != obj.TYPE_MEM {
+	if a.Type != obj.TYPE_MEM && a.Type != obj.TYPE_ADDR {
 		v.Fatalf("bad AddAux addr %v", a)
 	}
 	// add integer offset
@@ -4171,17 +4217,27 @@ func AddAux2(a *obj.Addr, v *ssa.Value, offset int64) {
 	}
 }

+// SizeAlignAuxInt returns an AuxInt encoding the size and alignment of type t.
+func SizeAlignAuxInt(t *Type) int64 {
+	return ssa.MakeSizeAndAlign(t.Size(), t.Alignment()).Int64()
+}
+
 // extendIndex extends v to a full int width.
-func (s *state) extendIndex(v *ssa.Value) *ssa.Value {
+// panic using the given function if v does not fit in an int (only on 32-bit archs).
+func (s *state) extendIndex(v *ssa.Value, panicfn *Node) *ssa.Value {
 	size := v.Type.Size()
 	if size == s.config.IntSize {
 		return v
 	}
 	if size > s.config.IntSize {
-		// TODO: truncate 64-bit indexes on 32-bit pointer archs. We'd need to test
-		// the high word and branch to out-of-bounds failure if it is not 0.
-		s.Unimplementedf("64->32 index truncation not implemented")
-		return v
+		// truncate 64-bit indexes on 32-bit pointer archs. Test the
+		// high word and branch to out-of-bounds failure if it is not 0.
+		if Debug['B'] == 0 {
+			hi := s.newValue1(ssa.OpInt64Hi, Types[TUINT32], v)
+			cmp := s.newValue2(ssa.OpEq32, Types[TBOOL], hi, s.constInt32(Types[TUINT32], 0))
+			s.check(cmp, panicfn)
+		}
+		return s.newValue1(ssa.OpTrunc64to32, Types[TINT], v)
 	}

 	// Extend value to the required size
@@ -4220,17 +4276,74 @@ func (s *state) extendIndex(v *ssa.Value) *ssa.Value {
 	return s.newValue1(op, Types[TINT], v)
 }

-// SSARegNum returns the register (in cmd/internal/obj numbering) to
-// which v has been allocated. Panics if v is not assigned to a
-// register.
-// TODO: Make this panic again once it stops happening routinely.
-func SSARegNum(v *ssa.Value) int16 {
+// SSAReg returns the register to which v has been allocated.
+func SSAReg(v *ssa.Value) *ssa.Register {
 	reg := v.Block.Func.RegAlloc[v.ID]
 	if reg == nil {
-		v.Unimplementedf("nil regnum for value: %s\n%s\n", v.LongString(), v.Block.Func)
-		return 0
+		v.Fatalf("nil register for value: %s\n%s\n", v.LongString(), v.Block.Func)
+	}
+	return reg.(*ssa.Register)
+}
+
+// SSAReg0 returns the register to which the first output of v has been allocated.
+func SSAReg0(v *ssa.Value) *ssa.Register {
+	reg := v.Block.Func.RegAlloc[v.ID].(ssa.LocPair)[0]
+	if reg == nil {
+		v.Fatalf("nil first register for value: %s\n%s\n", v.LongString(), v.Block.Func)
+	}
+	return reg.(*ssa.Register)
+}
+
+// SSAReg1 returns the register to which the second output of v has been allocated.
+func SSAReg1(v *ssa.Value) *ssa.Register {
+	reg := v.Block.Func.RegAlloc[v.ID].(ssa.LocPair)[1]
+	if reg == nil {
+		v.Fatalf("nil second register for value: %s\n%s\n", v.LongString(), v.Block.Func)
+	}
+	return reg.(*ssa.Register)
+}
+
+// SSARegNum returns the register number (in cmd/internal/obj numbering) to which v has been allocated.
+func SSARegNum(v *ssa.Value) int16 {
+	return Thearch.SSARegToReg[SSAReg(v).Num]
+}
+
+// SSARegNum0 returns the register number (in cmd/internal/obj numbering) to which the first output of v has been allocated.
+func SSARegNum0(v *ssa.Value) int16 {
+	return Thearch.SSARegToReg[SSAReg0(v).Num]
+}
+
+// SSARegNum1 returns the register number (in cmd/internal/obj numbering) to which the second output of v has been allocated.
+func SSARegNum1(v *ssa.Value) int16 {
+	return Thearch.SSARegToReg[SSAReg1(v).Num]
+}
+
+// CheckLoweredPhi checks that regalloc and stackalloc correctly handled phi values.
+// Called during ssaGenValue.
+func CheckLoweredPhi(v *ssa.Value) {
+	if v.Op != ssa.OpPhi {
+		v.Fatalf("CheckLoweredPhi called with non-phi value: %v", v.LongString())
+	}
+	if v.Type.IsMemory() {
+		return
+	}
+	f := v.Block.Func
+	loc := f.RegAlloc[v.ID]
+	for _, a := range v.Args {
+		if aloc := f.RegAlloc[a.ID]; aloc != loc { // TODO: .Equal() instead?
+			v.Fatalf("phi arg at different location than phi: %v @ %v, but arg %v @ %v\n%s\n", v, loc, a, aloc, v.Block.Func)
+		}
+	}
+}
+
+// CheckLoweredGetClosurePtr checks that v is the first instruction in the function's entry block.
+// The output of LoweredGetClosurePtr is generally hardwired to the correct register.
+// That register contains the closure pointer on closure entry.
+func CheckLoweredGetClosurePtr(v *ssa.Value) {
+	entry := v.Block.Func.Entry
+	if entry != v.Block || entry.Values[0] != v {
+		Fatalf("in %s, badly placed LoweredGetClosurePtr: %v %v", v.Block.Func.Name, v.Block, v)
 	}
-	return Thearch.SSARegToReg[reg.(*ssa.Register).Num]
 }

 // AutoVar returns a *Node and int64 representing the auto variable and offset within it
@@ -4372,6 +4485,25 @@ func (e *ssaExport) SplitComplex(name ssa.LocalSlot) (ssa.LocalSlot, ssa.LocalSl
 	return ssa.LocalSlot{N: n, Type: t, Off: name.Off}, ssa.LocalSlot{N: n, Type: t, Off: name.Off + s}
 }

+func (e *ssaExport) SplitInt64(name ssa.LocalSlot) (ssa.LocalSlot, ssa.LocalSlot) {
+	n := name.N.(*Node)
+	var t *Type
+	if name.Type.IsSigned() {
+		t = Types[TINT32]
+	} else {
+		t = Types[TUINT32]
+	}
+	if n.Class == PAUTO && !n.Addrtaken {
+		// Split this int64 up into two separate variables.
+		h := e.namedAuto(n.Sym.Name+".hi", t)
+		l := e.namedAuto(n.Sym.Name+".lo", Types[TUINT32])
+		return ssa.LocalSlot{N: h, Type: t, Off: 0}, ssa.LocalSlot{N: l, Type: Types[TUINT32], Off: 0}
+	}
+	// Return the two parts of the larger variable.
+	// Assuming little endian (we don't support big endian 32-bit architecture yet)
+	return ssa.LocalSlot{N: n, Type: t, Off: name.Off + 4}, ssa.LocalSlot{N: n, Type: Types[TUINT32], Off: name.Off}
+}
+
 func (e *ssaExport) SplitStruct(name ssa.LocalSlot, i int) ssa.LocalSlot {
 	n := name.N.(*Node)
 	st := name.Type
@@ -4389,7 +4521,7 @@ func (e *ssaExport) SplitStruct(name ssa.LocalSlot, i int) ssa.LocalSlot {
 // namedAuto returns a new AUTO variable with the given name and type.
 func (e *ssaExport) namedAuto(name string, typ ssa.Type) ssa.GCNode {
 	t := typ.(*Type)
-	s := &Sym{Name: name, Pkg: autopkg}
+	s := &Sym{Name: name, Pkg: localpkg}
 	n := Nod(ONAME, nil, nil)
 	s.Def = n
 	s.Def.Used = true
--- a/src/cmd/compile/internal/gc/subr.go
+++ b/src/cmd/compile/internal/gc/subr.go
@@ -246,6 +246,25 @@ func LookupN(prefix string, n int) *Sym {
 	return LookupBytes(b)
 }

+// autolabel generates a new Name node for use with
+// an automatically generated label.
+// prefix is a short mnemonic (e.g. ".s" for switch)
+// to help with debugging.
+// It should begin with "." to avoid conflicts with
+// user labels.
+func autolabel(prefix string) *Node {
+	if prefix[0] != '.' {
+		Fatalf("autolabel prefix must start with '.', have %q", prefix)
+	}
+	fn := Curfn
+	if Curfn == nil {
+		Fatalf("autolabel outside function")
+	}
+	n := fn.Func.Label
+	fn.Func.Label++
+	return newname(LookupN(prefix, int(n)))
+}
+
 var initSyms []*Sym

 var nopkg = &Pkg{
@@ -2298,6 +2317,16 @@ func isdirectiface(t *Type) bool {
 	return false
 }

+// itabType loads the _type field from a runtime.itab struct.
+func itabType(itab *Node) *Node {
+	typ := NodSym(ODOTPTR, itab, nil)
+	typ.Type = Ptrto(Types[TUINT8])
+	typ.Typecheck = 1
+	typ.Xoffset = int64(Widthptr) // offset of _type in runtime.itab
+	typ.Bounded = true            // guaranteed not to fault
+	return typ
+}
+
 // iet returns 'T' if t is a concrete type,
 // 'I' if t is an interface type, and 'E' if t is an empty interface type.
 // It is used to build calls to the conv* and assert* runtime routines.
--- a/src/cmd/compile/internal/gc/swt.go
+++ b/src/cmd/compile/internal/gc/swt.go
@@ -4,10 +4,7 @@

 package gc

-import (
-	"sort"
-	"strconv"
-)
+import "sort"

 const (
 	// expression switch
@@ -361,7 +358,7 @@ func casebody(sw *Node, typeswvar *Node) {
 		n.Op = OCASE
 		needvar := n.List.Len() != 1 || n.List.First().Op == OLITERAL

-		jmp := Nod(OGOTO, newCaseLabel(), nil)
+		jmp := Nod(OGOTO, autolabel(".s"), nil)
 		if n.List.Len() == 0 {
 			if def != nil {
 				Yyerror("more than one default case")
@@ -424,16 +421,6 @@ func casebody(sw *Node, typeswvar *Node) {
 	lineno = lno
 }

-// nSwitchLabel is the number of switch labels generated.
-// This should be per-function, but it is a global counter for now.
-var nSwitchLabel int
-
-func newCaseLabel() *Node {
-	label := strconv.Itoa(nSwitchLabel)
-	nSwitchLabel++
-	return newname(Lookup(label))
-}
-
 // caseClauses generates a slice of caseClauses
 // corresponding to the clauses in the switch statement sw.
 // Kind is the kind of switch statement.
@@ -590,7 +577,7 @@ func (s *typeSwitch) walk(sw *Node) {
 		i.Nbody.Set1(typenil)
 	} else {
 		// Jump to default case.
-		lbl := newCaseLabel()
+		lbl := autolabel(".s")
 		i.Nbody.Set1(Nod(OGOTO, lbl, nil))
 		// Wrap default case with label.
 		blk := Nod(OBLOCK, nil, nil)
@@ -602,11 +589,7 @@ func (s *typeSwitch) walk(sw *Node) {

 	if !cond.Right.Type.IsEmptyInterface() {
 		// Load type from itab.
-		typ = NodSym(ODOTPTR, typ, nil)
-		typ.Type = Ptrto(Types[TUINT8])
-		typ.Typecheck = 1
-		typ.Xoffset = int64(Widthptr) // offset of _type in runtime.itab
-		typ.Bounded = true            // guaranteed not to fault
+		typ = itabType(typ)
 	}
 	// Load hash from type.
 	h := NodSym(ODOTPTR, typ, nil)
--- a/src/cmd/compile/internal/gc/syntax.go
+++ b/src/cmd/compile/internal/gc/syntax.go
@@ -290,6 +290,8 @@ type Func struct {
 	InlCost int32
 	Depth   int32

+	Label int32 // largest auto-generated label in this function
+
 	Endlineno int32
 	WBLineno  int32 // line number of first write barrier

@@ -543,6 +545,11 @@ func (n *Nodes) Set1(node *Node) {
 	n.slice = &[]*Node{node}
 }

+// Set2 sets n to a slice containing two nodes.
+func (n *Nodes) Set2(n1, n2 *Node) {
+	n.slice = &[]*Node{n1, n2}
+}
+
 // MoveNodes sets n to the contents of n2, then clears n2.
 func (n *Nodes) MoveNodes(n2 *Nodes) {
 	n.slice = n2.slice
--- a/src/cmd/compile/internal/gc/testdata/arith_ssa.go
+++ b/src/cmd/compile/internal/gc/testdata/arith_ssa.go
@@ -553,6 +553,445 @@ func testOrPhi() {
 	}
 }

+//go:noinline
+func addshiftLL_ssa(a, b uint32) uint32 {
+	return a + b<<3
+}
+
+//go:noinline
+func subshiftLL_ssa(a, b uint32) uint32 {
+	return a - b<<3
+}
+
+//go:noinline
+func rsbshiftLL_ssa(a, b uint32) uint32 {
+	return a<<3 - b
+}
+
+//go:noinline
+func andshiftLL_ssa(a, b uint32) uint32 {
+	return a & (b << 3)
+}
+
+//go:noinline
+func orshiftLL_ssa(a, b uint32) uint32 {
+	return a | b<<3
+}
+
+//go:noinline
+func xorshiftLL_ssa(a, b uint32) uint32 {
+	return a ^ b<<3
+}
+
+//go:noinline
+func bicshiftLL_ssa(a, b uint32) uint32 {
+	return a &^ (b << 3)
+}
+
+//go:noinline
+func notshiftLL_ssa(a uint32) uint32 {
+	return ^(a << 3)
+}
+
+//go:noinline
+func addshiftRL_ssa(a, b uint32) uint32 {
+	return a + b>>3
+}
+
+//go:noinline
+func subshiftRL_ssa(a, b uint32) uint32 {
+	return a - b>>3
+}
+
+//go:noinline
+func rsbshiftRL_ssa(a, b uint32) uint32 {
+	return a>>3 - b
+}
+
+//go:noinline
+func andshiftRL_ssa(a, b uint32) uint32 {
+	return a & (b >> 3)
+}
+
+//go:noinline
+func orshiftRL_ssa(a, b uint32) uint32 {
+	return a | b>>3
+}
+
+//go:noinline
+func xorshiftRL_ssa(a, b uint32) uint32 {
+	return a ^ b>>3
+}
+
+//go:noinline
+func bicshiftRL_ssa(a, b uint32) uint32 {
+	return a &^ (b >> 3)
+}
+
+//go:noinline
+func notshiftRL_ssa(a uint32) uint32 {
+	return ^(a >> 3)
+}
+
+//go:noinline
+func addshiftRA_ssa(a, b int32) int32 {
+	return a + b>>3
+}
+
+//go:noinline
+func subshiftRA_ssa(a, b int32) int32 {
+	return a - b>>3
+}
+
+//go:noinline
+func rsbshiftRA_ssa(a, b int32) int32 {
+	return a>>3 - b
+}
+
+//go:noinline
+func andshiftRA_ssa(a, b int32) int32 {
+	return a & (b >> 3)
+}
+
+//go:noinline
+func orshiftRA_ssa(a, b int32) int32 {
+	return a | b>>3
+}
+
+//go:noinline
+func xorshiftRA_ssa(a, b int32) int32 {
+	return a ^ b>>3
+}
+
+//go:noinline
+func bicshiftRA_ssa(a, b int32) int32 {
+	return a &^ (b >> 3)
+}
+
+//go:noinline
+func notshiftRA_ssa(a int32) int32 {
+	return ^(a >> 3)
+}
+
+//go:noinline
+func addshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a + b<<s
+}
+
+//go:noinline
+func subshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a - b<<s
+}
+
+//go:noinline
+func rsbshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a<<s - b
+}
+
+//go:noinline
+func andshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a & (b << s)
+}
+
+//go:noinline
+func orshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a | b<<s
+}
+
+//go:noinline
+func xorshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a ^ b<<s
+}
+
+//go:noinline
+func bicshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a &^ (b << s)
+}
+
+//go:noinline
+func notshiftLLreg_ssa(a uint32, s uint8) uint32 {
+	return ^(a << s)
+}
+
+//go:noinline
+func addshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a + b>>s
+}
+
+//go:noinline
+func subshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a - b>>s
+}
+
+//go:noinline
+func rsbshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a>>s - b
+}
+
+//go:noinline
+func andshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a & (b >> s)
+}
+
+//go:noinline
+func orshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a | b>>s
+}
+
+//go:noinline
+func xorshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a ^ b>>s
+}
+
+//go:noinline
+func bicshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a &^ (b >> s)
+}
+
+//go:noinline
+func notshiftRLreg_ssa(a uint32, s uint8) uint32 {
+	return ^(a >> s)
+}
+
+//go:noinline
+func addshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a + b>>s
+}
+
+//go:noinline
+func subshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a - b>>s
+}
+
+//go:noinline
+func rsbshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a>>s - b
+}
+
+//go:noinline
+func andshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a & (b >> s)
+}
+
+//go:noinline
+func orshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a | b>>s
+}
+
+//go:noinline
+func xorshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a ^ b>>s
+}
+
+//go:noinline
+func bicshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a &^ (b >> s)
+}
+
+//go:noinline
+func notshiftRAreg_ssa(a int32, s uint8) int32 {
+	return ^(a >> s)
+}
+
+// test ARM shifted ops
+func testShiftedOps() {
+	a, b := uint32(10), uint32(42)
+	if want, got := a+b<<3, addshiftLL_ssa(a, b); got != want {
+		println("addshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b<<3, subshiftLL_ssa(a, b); got != want {
+		println("subshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a<<3-b, rsbshiftLL_ssa(a, b); got != want {
+		println("rsbshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b<<3), andshiftLL_ssa(a, b); got != want {
+		println("andshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b<<3, orshiftLL_ssa(a, b); got != want {
+		println("orshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b<<3, xorshiftLL_ssa(a, b); got != want {
+		println("xorshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b<<3), bicshiftLL_ssa(a, b); got != want {
+		println("bicshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a << 3), notshiftLL_ssa(a); got != want {
+		println("notshiftLL_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a+b>>3, addshiftRL_ssa(a, b); got != want {
+		println("addshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b>>3, subshiftRL_ssa(a, b); got != want {
+		println("subshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a>>3-b, rsbshiftRL_ssa(a, b); got != want {
+		println("rsbshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b>>3), andshiftRL_ssa(a, b); got != want {
+		println("andshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b>>3, orshiftRL_ssa(a, b); got != want {
+		println("orshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b>>3, xorshiftRL_ssa(a, b); got != want {
+		println("xorshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b>>3), bicshiftRL_ssa(a, b); got != want {
+		println("bicshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a >> 3), notshiftRL_ssa(a); got != want {
+		println("notshiftRL_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	c, d := int32(10), int32(-42)
+	if want, got := c+d>>3, addshiftRA_ssa(c, d); got != want {
+		println("addshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c-d>>3, subshiftRA_ssa(c, d); got != want {
+		println("subshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c>>3-d, rsbshiftRA_ssa(c, d); got != want {
+		println("rsbshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&(d>>3), andshiftRA_ssa(c, d); got != want {
+		println("andshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c|d>>3, orshiftRA_ssa(c, d); got != want {
+		println("orshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c^d>>3, xorshiftRA_ssa(c, d); got != want {
+		println("xorshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&^(d>>3), bicshiftRA_ssa(c, d); got != want {
+		println("bicshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(d >> 3), notshiftRA_ssa(d); got != want {
+		println("notshiftRA_ssa(-42) =", got, " want ", want)
+		failed = true
+	}
+	s := uint8(3)
+	if want, got := a+b<<s, addshiftLLreg_ssa(a, b, s); got != want {
+		println("addshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b<<s, subshiftLLreg_ssa(a, b, s); got != want {
+		println("subshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a<<s-b, rsbshiftLLreg_ssa(a, b, s); got != want {
+		println("rsbshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b<<s), andshiftLLreg_ssa(a, b, s); got != want {
+		println("andshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b<<s, orshiftLLreg_ssa(a, b, s); got != want {
+		println("orshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b<<s, xorshiftLLreg_ssa(a, b, s); got != want {
+		println("xorshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b<<s), bicshiftLLreg_ssa(a, b, s); got != want {
+		println("bicshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a << s), notshiftLLreg_ssa(a, s); got != want {
+		println("notshiftLLreg_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a+b>>s, addshiftRLreg_ssa(a, b, s); got != want {
+		println("addshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b>>s, subshiftRLreg_ssa(a, b, s); got != want {
+		println("subshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a>>s-b, rsbshiftRLreg_ssa(a, b, s); got != want {
+		println("rsbshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b>>s), andshiftRLreg_ssa(a, b, s); got != want {
+		println("andshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b>>s, orshiftRLreg_ssa(a, b, s); got != want {
+		println("orshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b>>s, xorshiftRLreg_ssa(a, b, s); got != want {
+		println("xorshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b>>s), bicshiftRLreg_ssa(a, b, s); got != want {
+		println("bicshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a >> s), notshiftRLreg_ssa(a, s); got != want {
+		println("notshiftRLreg_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c+d>>s, addshiftRAreg_ssa(c, d, s); got != want {
+		println("addshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c-d>>s, subshiftRAreg_ssa(c, d, s); got != want {
+		println("subshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c>>s-d, rsbshiftRAreg_ssa(c, d, s); got != want {
+		println("rsbshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&(d>>s), andshiftRAreg_ssa(c, d, s); got != want {
+		println("andshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c|d>>s, orshiftRAreg_ssa(c, d, s); got != want {
+		println("orshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c^d>>s, xorshiftRAreg_ssa(c, d, s); got != want {
+		println("xorshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&^(d>>s), bicshiftRAreg_ssa(c, d, s); got != want {
+		println("bicshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(d >> s), notshiftRAreg_ssa(d, s); got != want {
+		println("notshiftRAreg_ssa(-42, 3) =", got, " want ", want)
+		failed = true
+	}
+}
+
 var failed = false

 func main() {
@@ -573,6 +1012,7 @@ func main() {
 	testLoadCombine()
 	testLoadSymCombine()
 	testShiftRemoval()
+	testShiftedOps()

 	if failed {
 		panic("failed")
--- a/src/cmd/compile/internal/gc/testdata/gen/constFoldGen.go
+++ b/src/cmd/compile/internal/gc/testdata/gen/constFoldGen.go
@@ -144,7 +144,7 @@ func main() {
 					fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 					want := ansU(c, d, s.name, o.symbol)
 					fmt.Fprintf(w, "\tif r != %s {\n", want)
-					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 					fmt.Fprintf(w, "\t}\n")
 				}
 			}
@@ -159,7 +159,7 @@ func main() {
 					fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 					want := ansS(c, d, s.name, o.symbol)
 					fmt.Fprintf(w, "\tif r != %s {\n", want)
-					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 					fmt.Fprintf(w, "\t}\n")
 				}
 			}
@@ -188,7 +188,7 @@ func main() {
 						fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 						want := ansU(c, d, ls.name, o.symbol)
 						fmt.Fprintf(w, "\tif r != %s {\n", want)
-						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 						fmt.Fprintf(w, "\t}\n")
 					}
 				}
@@ -200,7 +200,7 @@ func main() {
 						fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 						want := ansS(c, int64(d), ls.name, o.symbol)
 						fmt.Fprintf(w, "\tif r != %s {\n", want)
-						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 						fmt.Fprintf(w, "\t}\n")
 					}
 				}
--- a/src/cmd/compile/internal/gc/testdata/string_ssa.go
+++ b/src/cmd/compile/internal/gc/testdata/string_ssa.go
@@ -110,6 +110,67 @@ func testSmallIndexType() {
 	}
 }

+//go:noinline
+func testInt64Index_ssa(s string, i int64) byte {
+	return s[i]
+}
+
+//go:noinline
+func testInt64Slice_ssa(s string, i, j int64) string {
+	return s[i:j]
+}
+
+func testInt64Index() {
+	tests := []struct {
+		i int64
+		j int64
+		b byte
+		s string
+	}{
+		{0, 5, 'B', "Below"},
+		{5, 10, 'E', "Exact"},
+		{10, 15, 'A', "Above"},
+	}
+
+	str := "BelowExactAbove"
+	for i, t := range tests {
+		if got := testInt64Index_ssa(str, t.i); got != t.b {
+			println("#", i, "got ", got, ", wanted", t.b)
+			failed = true
+		}
+		if got := testInt64Slice_ssa(str, t.i, t.j); got != t.s {
+			println("#", i, "got ", got, ", wanted", t.s)
+			failed = true
+		}
+	}
+}
+
+func testInt64IndexPanic() {
+	defer func() {
+		if r := recover(); r != nil {
+			println("paniced as expected")
+		}
+	}()
+
+	str := "foobar"
+	println("got ", testInt64Index_ssa(str, 1<<32+1))
+	println("expected to panic, but didn't")
+	failed = true
+}
+
+func testInt64SlicePanic() {
+	defer func() {
+		if r := recover(); r != nil {
+			println("paniced as expected")
+		}
+	}()
+
+	str := "foobar"
+	println("got ", testInt64Slice_ssa(str, 1<<32, 1<<32+1))
+	println("expected to panic, but didn't")
+	failed = true
+}
+
 //go:noinline
 func testStringElem_ssa(s string, i int) byte {
 	return s[i]
@@ -153,6 +214,9 @@ func main() {
 	testSmallIndexType()
 	testStringElem()
 	testStringElemConst()
+	testInt64Index()
+	testInt64IndexPanic()
+	testInt64SlicePanic()

 	if failed {
 		panic("failed")
--- a/src/cmd/compile/internal/gc/type.go
+++ b/src/cmd/compile/internal/gc/type.go
@@ -1207,6 +1207,7 @@ func (t *Type) ChanDir() ChanDir {
 func (t *Type) IsMemory() bool { return false }
 func (t *Type) IsFlags() bool  { return false }
 func (t *Type) IsVoid() bool   { return false }
+func (t *Type) IsTuple() bool  { return false }

 // IsUntyped reports whether t is an untyped type.
 func (t *Type) IsUntyped() bool {
--- a/src/cmd/compile/internal/gc/typecheck.go
+++ b/src/cmd/compile/internal/gc/typecheck.go
@@ -3185,6 +3185,9 @@ func samesafeexpr(l *Node, r *Node) bool {

 	case OINDEX:
 		return samesafeexpr(l.Left, r.Left) && samesafeexpr(l.Right, r.Right)
+
+	case OLITERAL:
+		return eqval(l.Val(), r.Val())
 	}

 	return false
--- a/src/cmd/compile/internal/gc/walk.go
+++ b/src/cmd/compile/internal/gc/walk.go
@@ -506,7 +506,9 @@ func walkexpr(n *Node, init *Nodes) *Node {
 	if n.Op == ONAME && n.Class == PAUTOHEAP {
 		nn := Nod(OIND, n.Name.Heapaddr, nil)
 		nn = typecheck(nn, Erv)
-		return walkexpr(nn, init)
+		nn = walkexpr(nn, init)
+		nn.Left.NonNil = true
+		return nn
 	}

 opswitch:
@@ -958,19 +960,27 @@ opswitch:
 		fromKind := from.Type.iet()
 		toKind := t.iet()

+		res := n.List.First()
+
 		// Avoid runtime calls in a few cases of the form _, ok := i.(T).
 		// This is faster and shorter and allows the corresponding assertX2X2
 		// routines to skip nil checks on their last argument.
-		if isblank(n.List.First()) {
+		if isblank(res) {
 			var fast *Node
-			switch {
-			case fromKind == 'E' && toKind == 'T':
-				tab := Nod(OITAB, from, nil) // type:eface::tab:iface
-				typ := Nod(OCONVNOP, typename(t), nil)
-				typ.Type = Ptrto(Types[TUINTPTR])
-				fast = Nod(OEQ, tab, typ)
-			case fromKind == 'I' && toKind == 'E',
-				fromKind == 'E' && toKind == 'E':
+			switch toKind {
+			case 'T':
+				tab := Nod(OITAB, from, nil)
+				if fromKind == 'E' {
+					typ := Nod(OCONVNOP, typename(t), nil)
+					typ.Type = Ptrto(Types[TUINTPTR])
+					fast = Nod(OEQ, tab, typ)
+					break
+				}
+				fast = Nod(OANDAND,
+					Nod(ONE, nodnil(), tab),
+					Nod(OEQ, itabType(tab), typename(t)),
+				)
+			case 'E':
 				tab := Nod(OITAB, from, nil)
 				fast = Nod(ONE, nodnil(), tab)
 			}
@@ -985,10 +995,10 @@ opswitch:
 		}

 		var resptr *Node // &res
-		if isblank(n.List.First()) {
+		if isblank(res) {
 			resptr = nodnil()
 		} else {
-			resptr = Nod(OADDR, n.List.First(), nil)
+			resptr = Nod(OADDR, res, nil)
 		}
 		resptr.Etype = 1 // addr does not escape

@@ -1094,12 +1104,45 @@ opswitch:

 			if n.Type.IsFloat() {
 				if n.Left.Type.Etype == TINT64 {
-					n = mkcall("int64tofloat64", n.Type, init, conv(n.Left, Types[TINT64]))
+					n = conv(mkcall("int64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TINT64])), n.Type)
 					break
 				}

 				if n.Left.Type.Etype == TUINT64 {
-					n = mkcall("uint64tofloat64", n.Type, init, conv(n.Left, Types[TUINT64]))
+					n = conv(mkcall("uint64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TUINT64])), n.Type)
+					break
+				}
+			}
+		}
+
+		if Thearch.LinkArch.Family == sys.I386 {
+			if n.Left.Type.IsFloat() {
+				if n.Type.Etype == TINT64 {
+					n = mkcall("float64toint64", n.Type, init, conv(n.Left, Types[TFLOAT64]))
+					break
+				}
+
+				if n.Type.Etype == TUINT64 {
+					n = mkcall("float64touint64", n.Type, init, conv(n.Left, Types[TFLOAT64]))
+					break
+				}
+				if n.Type.Etype == TUINT32 || n.Type.Etype == TUINT || n.Type.Etype == TUINTPTR {
+					n = mkcall("float64touint32", n.Type, init, conv(n.Left, Types[TFLOAT64]))
+					break
+				}
+			}
+			if n.Type.IsFloat() {
+				if n.Left.Type.Etype == TINT64 {
+					n = conv(mkcall("int64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TINT64])), n.Type)
+					break
+				}
+
+				if n.Left.Type.Etype == TUINT64 {
+					n = conv(mkcall("uint64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TUINT64])), n.Type)
+					break
+				}
+				if n.Left.Type.Etype == TUINT32 || n.Left.Type.Etype == TUINT || n.Left.Type.Etype == TUINTPTR {
+					n = conv(mkcall("uint32tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TUINT32])), n.Type)
 					break
 				}
 			}
@@ -3303,6 +3346,7 @@ func samecheap(a *Node, b *Node) bool {
 // The result of walkrotate MUST be assigned back to n, e.g.
 // 	n.Left = walkrotate(n.Left)
 func walkrotate(n *Node) *Node {
+	//TODO: enable LROT on ARM64 once the old backend is gone
 	if Thearch.LinkArch.InFamily(sys.MIPS64, sys.ARM64, sys.PPC64) {
 		return n
 	}
@@ -3496,16 +3540,6 @@ func walkdiv(n *Node, init *Nodes) *Node {
 			goto ret
 		}

-		// TODO(zhongwei) Test shows that TUINT8, TINT8, TUINT16 and TINT16's "quick division" method
-		// on current arm64 backend is slower than hardware div instruction on ARM64 due to unnecessary
-		// data movement between registers. It could be enabled when generated code is good enough.
-		if Thearch.LinkArch.Family == sys.ARM64 {
-			switch Simtype[nl.Type.Etype] {
-			case TUINT8, TINT8, TUINT16, TINT16:
-				return n
-			}
-		}
-
 		switch Simtype[nl.Type.Etype] {
 		default:
 			return n
--- a/src/cmd/compile/internal/mips64/peep.go
+++ b/src/cmd/compile/internal/mips64/peep.go
@@ -402,7 +402,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case obj.ANOP, /* read p->from, write p->to */
--- a/src/cmd/compile/internal/ppc64/galign.go
+++ b/src/cmd/compile/internal/ppc64/galign.go
@@ -66,6 +66,11 @@ func Main() {
 	gc.Thearch.Doregbits = doregbits
 	gc.Thearch.Regnames = regnames

+	gc.Thearch.SSARegToReg = ssaRegToReg
+	gc.Thearch.SSAMarkMoves = ssaMarkMoves
+	gc.Thearch.SSAGenValue = ssaGenValue
+	gc.Thearch.SSAGenBlock = ssaGenBlock
+
 	initvariants()
 	initproginfo()

--- a/src/cmd/compile/internal/ppc64/peep.go
+++ b/src/cmd/compile/internal/ppc64/peep.go
@@ -601,7 +601,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case obj.ANOP, /* read p->from, write p->to */
--- a/src/cmd/compile/internal/ppc64/prog.go
+++ b/src/cmd/compile/internal/ppc64/prog.go
@@ -42,22 +42,34 @@ var progtable = [ppc64.ALAST & obj.AMask]obj.ProgInfo{

 	// Integer
 	ppc64.AADD & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AADDC & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASUB & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AADDME & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ANEG & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AAND & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AANDN & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AOR & obj.AMask:     {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AORN & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AXOR & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AEQV & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULLD & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULLW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULHD & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULHDU & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ADIVD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ADIVDU & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ADIVW & obj.AMask:   {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ADIVWU & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASLD & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASRD & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASRAD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ASLW & obj.AMask:    {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ASRW & obj.AMask:    {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ASRAW & obj.AMask:   {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ACMP & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RightRead},
 	ppc64.ACMPU & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightRead},
+	ppc64.ACMPW & obj.AMask:   {Flags: gc.SizeL | gc.LeftRead | gc.RightRead},
+	ppc64.ACMPWU & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RightRead},
 	ppc64.ATD & obj.AMask:     {Flags: gc.SizeQ | gc.RightRead},

 	// Floating point.
@@ -70,11 +82,13 @@ var progtable = [ppc64.ALAST & obj.AMask]obj.ProgInfo{
 	ppc64.AFDIV & obj.AMask:   {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFDIVS & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCTIDZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AFCTIWZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCFID & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCFIDU & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCMPU & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RightRead},
 	ppc64.AFRSP & obj.AMask:   {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv},
 	ppc64.AFSQRT & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite},
+	ppc64.AFNEG & obj.AMask:   {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite},

 	// Moves
 	ppc64.AMOVB & obj.AMask:  {Flags: gc.SizeB | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
@@ -91,6 +105,8 @@ var progtable = [ppc64.ALAST & obj.AMask]obj.ProgInfo{
 	ppc64.AMOVD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite | gc.Move},
 	ppc64.AMOVDU & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite | gc.Move | gc.PostInc},
 	ppc64.AFMOVS & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
+	ppc64.AFMOVSX & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
+	ppc64.AFMOVSZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
 	ppc64.AFMOVD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Move},

 	// Jumps
@@ -297,7 +313,7 @@ func as2variant(as obj.As) int {
 			return i
 		}
 	}
-	gc.Fatalf("as2variant: instruction %v is not a variant of itself", obj.Aconv(as&obj.AMask))
+	gc.Fatalf("as2variant: instruction %v is not a variant of itself", as&obj.AMask)
 	return 0
 }

--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
--- a/src/cmd/compile/internal/s390x/peep.go
+++ b/src/cmd/compile/internal/s390x/peep.go
@@ -419,7 +419,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) usage {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return _ReadWriteSame

 	case // read p.From, write p.To
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@@ -20,11 +20,18 @@ type Config struct {
 	lowerBlock      func(*Block) bool          // lowering function
 	lowerValue      func(*Value, *Config) bool // lowering function
 	registers       []Register                 // machine registers
+	gpRegMask       regMask                    // general purpose integer register mask
+	fpRegMask       regMask                    // floating point register mask
+	FPReg           int8                       // register number of frame pointer, -1 if not used
+	hasGReg         bool                       // has hardware g register
 	fe              Frontend                   // callbacks into compiler frontend
 	HTML            *HTMLWriter                // html writer, for debugging
 	ctxt            *obj.Link                  // Generic arch information
 	optimize        bool                       // Do optimization
 	noDuffDevice    bool                       // Don't use Duff's device
+	nacl            bool                       // GOOS=nacl
+	use387          bool                       // GO386=387
+	NeedsFpScratch  bool                       // No direct move between GP and FP register sets
 	sparsePhiCutoff uint64                     // Sparse phi location algorithm used above this #blocks*#variables score
 	curFunc         *Func

@@ -106,6 +113,7 @@ type Frontend interface {
 	SplitSlice(LocalSlot) (LocalSlot, LocalSlot, LocalSlot)
 	SplitComplex(LocalSlot) (LocalSlot, LocalSlot)
 	SplitStruct(LocalSlot, int) LocalSlot
+	SplitInt64(LocalSlot) (LocalSlot, LocalSlot) // returns (hi, lo)

 	// Line returns a string describing the given line number.
 	Line(int32) string
@@ -128,29 +136,88 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
 		c.lowerBlock = rewriteBlockAMD64
 		c.lowerValue = rewriteValueAMD64
 		c.registers = registersAMD64[:]
-	case "386":
+		c.gpRegMask = gpRegMaskAMD64
+		c.fpRegMask = fpRegMaskAMD64
+		c.FPReg = framepointerRegAMD64
+		c.hasGReg = false
+	case "amd64p32":
 		c.IntSize = 4
 		c.PtrSize = 4
 		c.lowerBlock = rewriteBlockAMD64
-		c.lowerValue = rewriteValueAMD64 // TODO(khr): full 32-bit support
+		c.lowerValue = rewriteValueAMD64
+		c.registers = registersAMD64[:]
+		c.gpRegMask = gpRegMaskAMD64
+		c.fpRegMask = fpRegMaskAMD64
+		c.FPReg = framepointerRegAMD64
+		c.hasGReg = false
+		c.noDuffDevice = true
+	case "386":
+		c.IntSize = 4
+		c.PtrSize = 4
+		c.lowerBlock = rewriteBlock386
+		c.lowerValue = rewriteValue386
+		c.registers = registers386[:]
+		c.gpRegMask = gpRegMask386
+		c.fpRegMask = fpRegMask386
+		c.FPReg = framepointerReg386
+		c.hasGReg = false
 	case "arm":
 		c.IntSize = 4
 		c.PtrSize = 4
 		c.lowerBlock = rewriteBlockARM
 		c.lowerValue = rewriteValueARM
 		c.registers = registersARM[:]
+		c.gpRegMask = gpRegMaskARM
+		c.fpRegMask = fpRegMaskARM
+		c.FPReg = framepointerRegARM
+		c.hasGReg = true
+	case "arm64":
+		c.IntSize = 8
+		c.PtrSize = 8
+		c.lowerBlock = rewriteBlockARM64
+		c.lowerValue = rewriteValueARM64
+		c.registers = registersARM64[:]
+		c.gpRegMask = gpRegMaskARM64
+		c.fpRegMask = fpRegMaskARM64
+		c.FPReg = framepointerRegARM64
+		c.hasGReg = true
+		c.noDuffDevice = obj.Getgoos() == "darwin" // darwin linker cannot handle BR26 reloc with non-zero addend
+	case "ppc64le":
+		c.IntSize = 8
+		c.PtrSize = 8
+		c.lowerBlock = rewriteBlockPPC64
+		c.lowerValue = rewriteValuePPC64
+		c.registers = registersPPC64[:]
+		c.gpRegMask = gpRegMaskPPC64
+		c.fpRegMask = fpRegMaskPPC64
+		c.FPReg = framepointerRegPPC64
+		c.noDuffDevice = true // TODO: Resolve PPC64 DuffDevice (has zero, but not copy)
+		c.NeedsFpScratch = true
+		c.hasGReg = true
 	default:
 		fe.Unimplementedf(0, "arch %s not implemented", arch)
 	}
 	c.ctxt = ctxt
 	c.optimize = optimize
+	c.nacl = obj.Getgoos() == "nacl"

-	// Don't use Duff's device on Plan 9, because floating
+	// Don't use Duff's device on Plan 9 AMD64, because floating
 	// point operations are not allowed in note handler.
-	if obj.Getgoos() == "plan9" {
+	if obj.Getgoos() == "plan9" && arch == "amd64" {
 		c.noDuffDevice = true
 	}

+	if c.nacl {
+		c.noDuffDevice = true // Don't use Duff's device on NaCl
+
+		// ARM assembler rewrites DIV/MOD to runtime calls, which
+		// clobber R12 on nacl
+		opcodeTable[OpARMDIV].reg.clobbers |= 1 << 12  // R12
+		opcodeTable[OpARMDIVU].reg.clobbers |= 1 << 12 // R12
+		opcodeTable[OpARMMOD].reg.clobbers |= 1 << 12  // R12
+		opcodeTable[OpARMMODU].reg.clobbers |= 1 << 12 // R12
+	}
+
 	// Assign IDs to preallocated values/blocks.
 	for i := range c.values {
 		c.values[i].ID = ID(i)
@@ -180,6 +247,11 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
 	return c
 }

+func (c *Config) Set387(b bool) {
+	c.NeedsFpScratch = b
+	c.use387 = b
+}
+
 func (c *Config) Frontend() Frontend      { return c.fe }
 func (c *Config) SparsePhiCutoff() uint64 { return c.sparsePhiCutoff }

--- a/src/cmd/compile/internal/ssa/cse.go
+++ b/src/cmd/compile/internal/ssa/cse.go
@@ -163,6 +163,29 @@ func cse(f *Func) {
 		}
 	}

+	// if we rewrite a tuple generator to a new one in a different block,
+	// copy its selectors to the new generator's block, so tuple generator
+	// and selectors stay together.
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			if rewrite[v.ID] != nil {
+				continue
+			}
+			if v.Op != OpSelect0 && v.Op != OpSelect1 {
+				continue
+			}
+			if !v.Args[0].Type.IsTuple() {
+				f.Fatalf("arg of tuple selector %s is not a tuple: %s", v.String(), v.Args[0].LongString())
+			}
+			t := rewrite[v.Args[0].ID]
+			if t != nil && t.Block != b {
+				// v.Args[0] is tuple generator, CSE'd into a different block as t, v is left behind
+				c := v.copyInto(t.Block)
+				rewrite[v.ID] = c
+			}
+		}
+	}
+
 	rewrites := int64(0)

 	// Apply substitutions
--- a/src/cmd/compile/internal/ssa/deadstore.go
+++ b/src/cmd/compile/internal/ssa/deadstore.go
@@ -89,7 +89,7 @@ func dse(f *Func) {
 				} else {
 					// zero addr mem
 					sz := v.Args[0].Type.ElemType().Size()
-					if v.AuxInt != sz {
+					if SizeAndAlign(v.AuxInt).Size() != sz {
 						f.Fatalf("mismatched zero/store sizes: %d and %d [%s]",
 							v.AuxInt, sz, v.LongString())
 					}
--- a/src/cmd/compile/internal/ssa/decompose.go
+++ b/src/cmd/compile/internal/ssa/decompose.go
@@ -25,6 +25,22 @@ func decomposeBuiltIn(f *Func) {
 	for _, name := range f.Names {
 		t := name.Type
 		switch {
+		case t.IsInteger() && t.Size() == 8 && f.Config.IntSize == 4:
+			var elemType Type
+			if t.IsSigned() {
+				elemType = f.Config.fe.TypeInt32()
+			} else {
+				elemType = f.Config.fe.TypeUInt32()
+			}
+			hiName, loName := f.Config.fe.SplitInt64(name)
+			newNames = append(newNames, hiName, loName)
+			for _, v := range f.NamedValues[name] {
+				hi := v.Block.NewValue1(v.Line, OpInt64Hi, elemType, v)
+				lo := v.Block.NewValue1(v.Line, OpInt64Lo, f.Config.fe.TypeUInt32(), v)
+				f.NamedValues[hiName] = append(f.NamedValues[hiName], hi)
+				f.NamedValues[loName] = append(f.NamedValues[loName], lo)
+			}
+			delete(f.NamedValues, name)
 		case t.IsComplex():
 			var elemType Type
 			if t.Size() == 16 {
@@ -78,6 +94,8 @@ func decomposeBuiltIn(f *Func) {
 				f.NamedValues[dataName] = append(f.NamedValues[dataName], data)
 			}
 			delete(f.NamedValues, name)
+		case t.IsFloat():
+			// floats are never decomposed, even ones bigger than IntSize
 		case t.Size() > f.Config.IntSize:
 			f.Unimplementedf("undecomposed named type %s %s", name, t)
 		default:
@@ -88,8 +106,13 @@ func decomposeBuiltIn(f *Func) {
 }

 func decomposeBuiltInPhi(v *Value) {
-	// TODO: decompose 64-bit ops on 32-bit archs?
 	switch {
+	case v.Type.IsInteger() && v.Type.Size() == 8 && v.Block.Func.Config.IntSize == 4:
+		if v.Block.Func.Config.arch == "amd64p32" {
+			// Even though ints are 32 bits, we have 64-bit ops.
+			break
+		}
+		decomposeInt64Phi(v)
 	case v.Type.IsComplex():
 		decomposeComplexPhi(v)
 	case v.Type.IsString():
@@ -98,6 +121,8 @@ func decomposeBuiltInPhi(v *Value) {
 		decomposeSlicePhi(v)
 	case v.Type.IsInterface():
 		decomposeInterfacePhi(v)
+	case v.Type.IsFloat():
+		// floats are never decomposed, even ones bigger than IntSize
 	case v.Type.Size() > v.Block.Func.Config.IntSize:
 		v.Unimplementedf("undecomposed type %s", v.Type)
 	}
@@ -138,6 +163,26 @@ func decomposeSlicePhi(v *Value) {
 	v.AddArg(cap)
 }

+func decomposeInt64Phi(v *Value) {
+	fe := v.Block.Func.Config.fe
+	var partType Type
+	if v.Type.IsSigned() {
+		partType = fe.TypeInt32()
+	} else {
+		partType = fe.TypeUInt32()
+	}
+
+	hi := v.Block.NewValue0(v.Line, OpPhi, partType)
+	lo := v.Block.NewValue0(v.Line, OpPhi, fe.TypeUInt32())
+	for _, a := range v.Args {
+		hi.AddArg(a.Block.NewValue1(v.Line, OpInt64Hi, partType, a))
+		lo.AddArg(a.Block.NewValue1(v.Line, OpInt64Lo, fe.TypeUInt32(), a))
+	}
+	v.reset(OpInt64Make)
+	v.AddArg(hi)
+	v.AddArg(lo)
+}
+
 func decomposeComplexPhi(v *Value) {
 	fe := v.Block.Func.Config.fe
 	var partType Type
--- a/src/cmd/compile/internal/ssa/export_test.go
+++ b/src/cmd/compile/internal/ssa/export_test.go
@@ -49,6 +49,12 @@ func (d DummyFrontend) SplitComplex(s LocalSlot) (LocalSlot, LocalSlot) {
 	}
 	return LocalSlot{s.N, d.TypeFloat32(), s.Off}, LocalSlot{s.N, d.TypeFloat32(), s.Off + 4}
 }
+func (d DummyFrontend) SplitInt64(s LocalSlot) (LocalSlot, LocalSlot) {
+	if s.Type.IsSigned() {
+		return LocalSlot{s.N, d.TypeInt32(), s.Off + 4}, LocalSlot{s.N, d.TypeUInt32(), s.Off}
+	}
+	return LocalSlot{s.N, d.TypeUInt32(), s.Off + 4}, LocalSlot{s.N, d.TypeUInt32(), s.Off}
+}
 func (d DummyFrontend) SplitStruct(s LocalSlot, i int) LocalSlot {
 	return LocalSlot{s.N, s.Type.FieldType(i), s.Off + s.Type.FieldOff(i)}
 }
--- a/src/cmd/compile/internal/ssa/flagalloc.go
+++ b/src/cmd/compile/internal/ssa/flagalloc.go
@@ -4,8 +4,6 @@

 package ssa

-const flagRegMask = regMask(1) << 33 // TODO: arch-specific
-
 // flagalloc allocates the flag register among all the flag-generating
 // instructions. Flag values are recomputed if they need to be
 // spilled/restored.
@@ -33,7 +31,7 @@ func flagalloc(f *Func) {
 				if v == flag {
 					flag = nil
 				}
-				if opcodeTable[v.Op].reg.clobbers&flagRegMask != 0 {
+				if opcodeTable[v.Op].clobberFlags {
 					flag = nil
 				}
 				for _, a := range v.Args {
@@ -97,7 +95,7 @@ func flagalloc(f *Func) {
 					continue
 				}
 				// Recalculate a
-				c := a.copyInto(b)
+				c := copyFlags(a, b)
 				// Update v.
 				v.SetArg(i, c)
 				// Remember the most-recently computed flag value.
@@ -105,7 +103,7 @@ func flagalloc(f *Func) {
 			}
 			// Issue v.
 			b.Values = append(b.Values, v)
-			if opcodeTable[v.Op].reg.clobbers&flagRegMask != 0 {
+			if opcodeTable[v.Op].clobberFlags {
 				flag = nil
 			}
 			if v.Type.IsFlags() {
@@ -121,7 +119,7 @@ func flagalloc(f *Func) {
 		if v := end[b.ID]; v != nil && v != flag {
 			// Need to reissue flag generator for use by
 			// subsequent blocks.
-			_ = v.copyInto(b)
+			copyFlags(v, b)
 			// Note: this flag generator is not properly linked up
 			// with the flag users. This breaks the SSA representation.
 			// We could fix up the users with another pass, but for now
@@ -135,3 +133,19 @@ func flagalloc(f *Func) {
 		b.FlagsLiveAtEnd = end[b.ID] != nil
 	}
 }
+
+// copyFlags copies v (flag generator) into b, returns the copy.
+// If v's arg is also flags, copy recursively.
+func copyFlags(v *Value, b *Block) *Value {
+	flagsArgs := make(map[int]*Value)
+	for i, a := range v.Args {
+		if a.Type.IsFlags() || a.Type.IsTuple() {
+			flagsArgs[i] = copyFlags(a, b)
+		}
+	}
+	c := v.copyInto(b)
+	for i, a := range flagsArgs {
+		c.SetArg(i, a)
+	}
+	return c
+}
--- a/src/cmd/compile/internal/ssa/gen/386.rules
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
--- a/src/cmd/compile/internal/ssa/gen/386Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/386Ops.go
@@ -0,0 +1,508 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+//  - Integer types live in the low portion of registers. Upper portions are junk.
+//  - Boolean types use the low-order byte of a register. 0=false, 1=true.
+//    Upper bytes are junk.
+//  - Floating-point types live in the low natural slot of an sse2 register.
+//    Unused portions are junk.
+//  - We do not use AH,BH,CH,DH registers.
+//  - When doing sub-register operations, we try to write the whole
+//    destination register to avoid a partial-register write.
+//  - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+//    filled by sign-extending the used portion.  Users of AuxInt which interpret
+//    AuxInt as unsigned (e.g. shifts) must be careful.
+
+// Suffixes encode the bit width of various instructions.
+// L (long word) = 32 bit
+// W (word)      = 16 bit
+// B (byte)      = 8 bit
+
+// copied from ../../x86/reg.go
+var regNames386 = []string{
+	"AX",
+	"CX",
+	"DX",
+	"BX",
+	"SP",
+	"BP",
+	"SI",
+	"DI",
+	"X0",
+	"X1",
+	"X2",
+	"X3",
+	"X4",
+	"X5",
+	"X6",
+	"X7",
+
+	// pseudo-registers
+	"SB",
+}
+
+// Notes on 387 support.
+//  - The 387 has a weird stack-register setup for floating-point registers.
+//    We use these registers when SSE registers are not available (when GO386=387).
+//  - We use the same register names (X0-X7) but they refer to the 387
+//    floating-point registers. That way, most of the SSA backend is unchanged.
+//  - The instruction generation pass maintains an SSE->387 register mapping.
+//    This mapping is updated whenever the FP stack is pushed or popped so that
+//    we can always find a given SSE register even when the TOS pointer has changed.
+//  - To facilitate the mapping from SSE to 387, we enforce that
+//    every basic block starts and ends with an empty floating-point stack.
+
+func init() {
+	// Make map from reg names to reg integers.
+	if len(regNames386) > 64 {
+		panic("too many registers")
+	}
+	num := map[string]int{}
+	for i, name := range regNames386 {
+		num[name] = i
+	}
+	buildReg := func(s string) regMask {
+		m := regMask(0)
+		for _, r := range strings.Split(s, " ") {
+			if n, ok := num[r]; ok {
+				m |= regMask(1) << uint(n)
+				continue
+			}
+			panic("register " + r + " not found")
+		}
+		return m
+	}
+
+	// Common individual register masks
+	var (
+		ax         = buildReg("AX")
+		cx         = buildReg("CX")
+		dx         = buildReg("DX")
+		gp         = buildReg("AX CX DX BX BP SI DI")
+		fp         = buildReg("X0 X1 X2 X3 X4 X5 X6 X7")
+		x7         = buildReg("X7")
+		gpsp       = gp | buildReg("SP")
+		gpspsb     = gpsp | buildReg("SB")
+		callerSave = gp | fp
+	)
+	// Common slices of register masks
+	var (
+		gponly = []regMask{gp}
+		fponly = []regMask{fp}
+	)
+
+	// Common regInfo
+	var (
+		gp01      = regInfo{inputs: nil, outputs: gponly}
+		gp11      = regInfo{inputs: []regMask{gp}, outputs: gponly}
+		gp11sp    = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+		gp11sb    = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
+		gp21      = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+		gp11carry = regInfo{inputs: []regMask{gp}, outputs: []regMask{0, gp}}
+		gp21carry = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{0, gp}}
+		gp1carry1 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+		gp2carry1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+		gp21sp    = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+		gp21sb    = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
+		gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+		gp11div   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax}, clobbers: dx}
+		gp21hmul  = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
+		gp11mod   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx}, clobbers: ax}
+		gp21mul   = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}
+
+		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+		gp1flags = regInfo{inputs: []regMask{gpsp}}
+		flagsgp  = regInfo{inputs: nil, outputs: gponly}
+
+		readflags = regInfo{inputs: nil, outputs: gponly}
+		flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
+
+		gpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
+		gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
+
+		gpstore         = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+		gpstoreconst    = regInfo{inputs: []regMask{gpspsb, 0}}
+		gpstoreidx      = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
+		gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+
+		fp01   = regInfo{inputs: nil, outputs: fponly}
+		fp21   = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+		fp21x7 = regInfo{inputs: []regMask{fp &^ x7, fp &^ x7},
+			clobbers: x7, outputs: []regMask{fp &^ x7}}
+		fpgp     = regInfo{inputs: fponly, outputs: gponly}
+		gpfp     = regInfo{inputs: gponly, outputs: fponly}
+		fp11     = regInfo{inputs: fponly, outputs: fponly}
+		fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+		fpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
+		fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
+
+		fpstore    = regInfo{inputs: []regMask{gpspsb, fp, 0}}
+		fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
+	)
+
+	var _386ops = []opData{
+		// fp ops
+		{name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, // fp32 add
+		{name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add
+		{name: "SUBSS", argLength: 2, reg: fp21x7, asm: "SUBSS", resultInArg0: true},                  // fp32 sub
+		{name: "SUBSD", argLength: 2, reg: fp21x7, asm: "SUBSD", resultInArg0: true},                  // fp64 sub
+		{name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, // fp32 mul
+		{name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul
+		{name: "DIVSS", argLength: 2, reg: fp21x7, asm: "DIVSS", resultInArg0: true},                  // fp32 div
+		{name: "DIVSD", argLength: 2, reg: fp21x7, asm: "DIVSD", resultInArg0: true},                  // fp64 div
+
+		{name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff"},            // fp32 load
+		{name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff"},            // fp64 load
+		{name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant
+		{name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant
+		{name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff"},     // fp32 load indexed by i
+		{name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff"},     // fp32 load indexed by 4*i
+		{name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff"},     // fp64 load indexed by i
+		{name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff"},     // fp64 load indexed by 8*i
+
+		{name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff"},        // fp32 store
+		{name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff"},        // fp64 store
+		{name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff"}, // fp32 indexed by i store
+		{name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff"}, // fp32 indexed by 4i store
+		{name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by i store
+		{name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by 8i store
+
+		// binary ops
+		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true},                // arg0 + arg1
+		{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32", clobberFlags: true}, // arg0 + auxint
+
+		{name: "ADDLcarry", argLength: 2, reg: gp21carry, asm: "ADDL", commutative: true, resultInArg0: true},                // arg0 + arg1, generates <carry,result> pair
+		{name: "ADDLconstcarry", argLength: 1, reg: gp11carry, asm: "ADDL", aux: "Int32", resultInArg0: true},                // arg0 + auxint, generates <carry,result> pair
+		{name: "ADCL", argLength: 3, reg: gp2carry1, asm: "ADCL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0+arg1+carry(arg2), where arg2 is flags
+		{name: "ADCLconst", argLength: 2, reg: gp1carry1, asm: "ADCL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0+auxint+carry(arg1), where arg1 is flags
+
+		{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true},                    // arg0 - arg1
+		{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+
+		{name: "SUBLcarry", argLength: 2, reg: gp21carry, asm: "SUBL", resultInArg0: true},                                   // arg0-arg1, generates <borrow,result> pair
+		{name: "SUBLconstcarry", argLength: 1, reg: gp11carry, asm: "SUBL", aux: "Int32", resultInArg0: true},                // arg0-auxint, generates <borrow,result> pair
+		{name: "SBBL", argLength: 3, reg: gp2carry1, asm: "SBBL", resultInArg0: true, clobberFlags: true},                    // arg0-arg1-borrow(arg2), where arg2 is flags
+		{name: "SBBLconst", argLength: 2, reg: gp1carry1, asm: "SBBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0-auxint-borrow(arg1), where arg1 is flags
+
+		{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+		{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMULL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+
+		{name: "HMULL", argLength: 2, reg: gp21hmul, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULLU", argLength: 2, reg: gp21hmul, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULW", argLength: 2, reg: gp21hmul, asm: "IMULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULB", argLength: 2, reg: gp21hmul, asm: "IMULB", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULWU", argLength: 2, reg: gp21hmul, asm: "MULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULBU", argLength: 2, reg: gp21hmul, asm: "MULB", clobberFlags: true}, // (arg0 * arg1) >> width
+
+		{name: "MULLQU", argLength: 2, reg: gp21mul, asm: "MULL", clobberFlags: true}, // arg0 * arg1, high 32 in result[0], low 32 in result[1]
+
+		{name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL", clobberFlags: true}, // arg0 / arg1
+		{name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW", clobberFlags: true}, // arg0 / arg1
+		{name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL", clobberFlags: true}, // arg0 / arg1
+		{name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW", clobberFlags: true}, // arg0 / arg1
+
+		{name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL", clobberFlags: true}, // arg0 % arg1
+		{name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW", clobberFlags: true}, // arg0 % arg1
+		{name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL", clobberFlags: true}, // arg0 % arg1
+		{name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW", clobberFlags: true}, // arg0 % arg1
+
+		{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+		{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+
+		{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+		{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+
+		{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+		{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+
+		{name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+		{name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint
+		{name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"},  // arg0 compare to auxint
+
+		{name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, // arg0 compare to arg1, f32
+		{name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, // arg0 compare to arg1, f64
+
+		{name: "TESTL", argLength: 2, reg: gp2flags, asm: "TESTL", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTW", argLength: 2, reg: gp2flags, asm: "TESTW", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTB", argLength: 2, reg: gp2flags, asm: "TESTB", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
+		{name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
+		{name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"},  // (arg0 & auxint) compare to 0
+
+		{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true},               // arg0 << arg1, shift amount is mod 32
+		{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
+		// Note: x86 is weird, the 16 and 8 byte shifts still use all 5 bits of shift amount!
+
+		{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // unsigned arg0 >> auxint, shift amount 0-31
+
+		{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // signed arg0 >> auxint, shift amount 0-31
+
+		{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
+		{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
+		{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // arg0 rotate left auxint, rotate amount 0-7
+
+		// unary ops
+		{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
+
+		{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true, clobberFlags: true}, // ^arg0
+
+		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+
+		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+
+		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+
+		{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
+
+		{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
+		// Note: SBBW and SBBB are subsumed by SBBL
+
+		{name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
+		{name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
+		{name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"},  // extract signed < condition from arg0
+		{name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
+		{name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"},  // extract signed > condition from arg0
+		{name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
+		{name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"},  // extract unsigned < condition from arg0
+		{name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
+		{name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"},  // extract unsigned > condition from arg0
+		{name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
+		// Need different opcodes for floating point conditions because
+		// any comparison involving a NaN is always FALSE and thus
+		// the patterns for inverting conditions cannot be used.
+		{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
+		{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
+		{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"},                       // extract "ordered" (No Nan present) condition from arg0
+		{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"},                       // extract "unordered" (Nan present) condition from arg0
+
+		{name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"},  // extract floating > condition from arg0
+		{name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
+
+		{name: "MOVBLSX", argLength: 1, reg: gp11, asm: "MOVBLSX"}, // sign extend arg0 from int8 to int32
+		{name: "MOVBLZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int32
+		{name: "MOVWLSX", argLength: 1, reg: gp11, asm: "MOVWLSX"}, // sign extend arg0 from int16 to int32
+		{name: "MOVWLZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int32
+
+		{name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+
+		{name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32
+		{name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32
+		{name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"},   // convert int32 to float32
+		{name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"},   // convert int32 to float64
+		{name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"},   // convert float64 to float32
+		{name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"},   // convert float32 to float64
+
+		{name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
+
+		{name: "LEAL", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
+		{name: "LEAL1", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + arg1 + auxint + aux
+		{name: "LEAL2", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 2*arg1 + auxint + aux
+		{name: "LEAL4", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 4*arg1 + auxint + aux
+		{name: "LEAL8", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 8*arg1 + auxint + aux
+		// Note: LEAL{1,2,4,8} must not have OpSB as either argument.
+
+		// auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+		{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8"},  // load byte from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVBLSXload", argLength: 2, reg: gpload, asm: "MOVBLSX", aux: "SymOff"},             // ditto, sign extend to int32
+		{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16"}, // load 2 bytes from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVWLSXload", argLength: 2, reg: gpload, asm: "MOVWLSX", aux: "SymOff"},             // ditto, sign extend to int32
+		{name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32"},    // load 4 bytes from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"},     // store byte in arg1 to arg0+auxint+aux. arg2=mem
+		{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"},     // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+		{name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"},     // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+
+		// indexed loads/stores
+		{name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVWloadidx1", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem
+		{name: "MOVLloadidx1", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff"},    // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff"},    // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem
+		// TODO: sign-extending indexed loads
+		{name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVB", aux: "SymOff"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+		{name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+		// TODO: add size-mismatched indexed loads, like MOVBstoreidx4.
+
+		// For storeconst ops, the AuxInt field encodes both
+		// the value to store and an address offset of the store.
+		// Cast AuxInt to a ValAndOff to extract Val and Off fields.
+		{name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux.  arg1=mem
+		{name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ...
+		{name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ...
+
+		{name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVB", aux: "SymValAndOff", typ: "Mem"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux.  arg2=mem
+		{name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ... arg1 ...
+		{name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ... 2*arg1 ...
+		{name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ...
+		{name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ...
+
+		// arg0 = pointer to start of memory to zero
+		// arg1 = value to store (will always be zero)
+		// arg2 = mem
+		// auxint = offset into duffzero code to start executing
+		// returns mem
+		{
+			name:      "DUFFZERO",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("AX")},
+				clobbers: buildReg("DI CX"),
+				// Note: CX is only clobbered when dynamic linking.
+			},
+		},
+
+		// arg0 = address of memory to zero
+		// arg1 = # of 4-byte words to zero
+		// arg2 = value to store (will always be zero)
+		// arg3 = mem
+		// returns mem
+		{
+			name:      "REPSTOSL",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
+				clobbers: buildReg("DI CX"),
+			},
+		},
+
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff", clobberFlags: true},                                             // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "Int64", clobberFlags: true}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                               // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                                  // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64", clobberFlags: true},                        // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+		// arg0 = destination pointer
+		// arg1 = source pointer
+		// arg2 = mem
+		// auxint = offset from duffcopy symbol to call
+		// returns memory
+		{
+			name:      "DUFFCOPY",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("SI")},
+				clobbers: buildReg("DI SI CX"), // uses CX as a temporary
+			},
+			clobberFlags: true,
+		},
+
+		// arg0 = destination pointer
+		// arg1 = source pointer
+		// arg2 = # of 8-byte words to copy
+		// arg3 = mem
+		// returns memory
+		{
+			name:      "REPMOVSL",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
+				clobbers: buildReg("DI SI CX"),
+			},
+		},
+
+		// (InvertFlags (CMPL a b)) == (CMPL b a)
+		// So if we want (SETL (CMPL a b)) but we can't do that because a is a constant,
+		// then we do (SETL (InvertFlags (CMPL b a))) instead.
+		// Rewrites will convert this to (SETG (CMPL b a)).
+		// InvertFlags is a pseudo-op which can't appear in assembly output.
+		{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+		// Pseudo-ops
+		{name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+		// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+		// and sorts it to the very beginning of the block to prevent other
+		// use of DX (the closure pointer)
+		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}},
+		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true},
+
+		// MOVLconvert converts between pointers and integers.
+		// We have a special op for this so as to not confuse GC
+		// (particularly stack maps).  It takes a memory arg so it
+		// gets correctly ordered with respect to GC safepoints.
+		// arg0=ptr/int arg1=mem, output=int/ptr
+		{name: "MOVLconvert", argLength: 2, reg: gp11, asm: "MOVL"},
+
+		// Constant flag values. For any comparison, there are 5 possible
+		// outcomes: the three from the signed total order (<,==,>) and the
+		// three from the unsigned total order. The == cases overlap.
+		// Note: there's a sixth "unordered" outcome for floating-point
+		// comparisons, but we don't use such a beast yet.
+		// These ops are for temporary use by rewrite rules. They
+		// cannot appear in the generated assembly.
+		{name: "FlagEQ"},     // equal
+		{name: "FlagLT_ULT"}, // signed < and unsigned <
+		{name: "FlagLT_UGT"}, // signed < and unsigned >
+		{name: "FlagGT_UGT"}, // signed > and unsigned <
+		{name: "FlagGT_ULT"}, // signed > and unsigned >
+
+		// Special op for -x on 387
+		{name: "FCHS", argLength: 1, reg: fp11},
+
+		// Special ops for PIC floating-point constants.
+		// MOVSXconst1 loads the address of the constant-pool entry into a register.
+		// MOVSXconst2 loads the constant from that address.
+		// MOVSXconst1 returns a pointer, but we type it as uint32 because it can never point to the Go heap.
+		{name: "MOVSSconst1", reg: gp01, typ: "UInt32", aux: "Float32"},
+		{name: "MOVSDconst1", reg: gp01, typ: "UInt32", aux: "Float64"},
+		{name: "MOVSSconst2", argLength: 1, reg: gpfp, asm: "MOVSS"},
+		{name: "MOVSDconst2", argLength: 1, reg: gpfp, asm: "MOVSD"},
+	}
+
+	var _386blocks = []blockData{
+		{name: "EQ"},
+		{name: "NE"},
+		{name: "LT"},
+		{name: "LE"},
+		{name: "GT"},
+		{name: "GE"},
+		{name: "ULT"},
+		{name: "ULE"},
+		{name: "UGT"},
+		{name: "UGE"},
+		{name: "EQF"},
+		{name: "NEF"},
+		{name: "ORD"}, // FP, ordered comparison (parity zero)
+		{name: "NAN"}, // FP, unordered comparison (parity one)
+	}
+
+	archs = append(archs, arch{
+		name:            "386",
+		pkg:             "cmd/internal/obj/x86",
+		genfile:         "../../x86/ssa.go",
+		ops:             _386ops,
+		blocks:          _386blocks,
+		regnames:        regNames386,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		framepointerreg: int8(num["BP"]),
+	})
+}
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -4,7 +4,8 @@

 // Lowering arithmetic
 (Add64  x y) -> (ADDQ  x y)
-(AddPtr x y) -> (ADDQ  x y)
+(AddPtr x y) && config.PtrSize == 8 -> (ADDQ x y)
+(AddPtr x y) && config.PtrSize == 4 -> (ADDL x y)
 (Add32  x y) -> (ADDL  x y)
 (Add16  x y) -> (ADDL  x y)
 (Add8   x y) -> (ADDL  x y)
@@ -12,7 +13,8 @@
 (Add64F x y) -> (ADDSD x y)

 (Sub64  x y) -> (SUBQ  x y)
-(SubPtr x y) -> (SUBQ  x y)
+(SubPtr x y) && config.PtrSize == 8 -> (SUBQ x y)
+(SubPtr x y) && config.PtrSize == 4 -> (SUBL x y)
 (Sub32  x y) -> (SUBL  x y)
 (Sub16  x y) -> (SUBL  x y)
 (Sub8   x y) -> (SUBL  x y)
@@ -29,14 +31,14 @@
 (Div32F x y) -> (DIVSS x y)
 (Div64F x y) -> (DIVSD x y)

-(Div64  x y) -> (DIVQ  x y)
-(Div64u x y) -> (DIVQU x y)
-(Div32  x y) -> (DIVL  x y)
-(Div32u x y) -> (DIVLU x y)
-(Div16  x y) -> (DIVW  x y)
-(Div16u x y) -> (DIVWU x y)
-(Div8   x y) -> (DIVW  (SignExt8to16 x) (SignExt8to16 y))
-(Div8u  x y) -> (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+(Div64  x y) -> (Select0 (DIVQ  x y))
+(Div64u x y) -> (Select0 (DIVQU x y))
+(Div32  x y) -> (Select0 (DIVL  x y))
+(Div32u x y) -> (Select0 (DIVLU x y))
+(Div16  x y) -> (Select0 (DIVW  x y))
+(Div16u x y) -> (Select0 (DIVWU x y))
+(Div8   x y) -> (Select0 (DIVW  (SignExt8to16 x) (SignExt8to16 y)))
+(Div8u  x y) -> (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))

 (Hmul64  x y) -> (HMULQ  x y)
 (Hmul64u x y) -> (HMULQU x y)
@@ -49,14 +51,14 @@

 (Avg64u x y) -> (AVGQU x y)

-(Mod64  x y) -> (MODQ  x y)
-(Mod64u x y) -> (MODQU x y)
-(Mod32  x y) -> (MODL  x y)
-(Mod32u x y) -> (MODLU x y)
-(Mod16  x y) -> (MODW  x y)
-(Mod16u x y) -> (MODWU x y)
-(Mod8   x y) -> (MODW  (SignExt8to16 x) (SignExt8to16 y))
-(Mod8u  x y) -> (MODWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+(Mod64  x y) -> (Select1 (DIVQ  x y))
+(Mod64u x y) -> (Select1 (DIVQU x y))
+(Mod32  x y) -> (Select1 (DIVL  x y))
+(Mod32u x y) -> (Select1 (DIVLU x y))
+(Mod16  x y) -> (Select1 (DIVW  x y))
+(Mod16u x y) -> (Select1 (DIVWU x y))
+(Mod8   x y) -> (Select1 (DIVW  (SignExt8to16 x) (SignExt8to16 y)))
+(Mod8u  x y) -> (Select1 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))

 (And64 x y) -> (ANDQ x y)
 (And32 x y) -> (ANDL x y)
@@ -91,8 +93,9 @@
 (Not x) -> (XORLconst [1] x)

 // Lowering pointer arithmetic
-(OffPtr [off] ptr) && is32Bit(off) -> (ADDQconst [off] ptr)
-(OffPtr [off] ptr) -> (ADDQ (MOVQconst [off]) ptr)
+(OffPtr [off] ptr) && config.PtrSize == 8 && is32Bit(off) -> (ADDQconst [off] ptr)
+(OffPtr [off] ptr) && config.PtrSize == 8 -> (ADDQ (MOVQconst [off]) ptr)
+(OffPtr [off] ptr) && config.PtrSize == 4 -> (ADDLconst [off] ptr)

 // Lowering other arithmetic
 // TODO: CMPQconst 0 below is redundant because BSF sets Z but how to remove?
@@ -270,7 +273,8 @@
 (Eq16  x y) -> (SETEQ (CMPW x y))
 (Eq8   x y) -> (SETEQ (CMPB x y))
 (EqB   x y) -> (SETEQ (CMPB x y))
-(EqPtr x y) -> (SETEQ (CMPQ x y))
+(EqPtr x y) && config.PtrSize == 8 -> (SETEQ (CMPQ x y))
+(EqPtr x y) && config.PtrSize == 4 -> (SETEQ (CMPL x y))
 (Eq64F x y) -> (SETEQF (UCOMISD x y))
 (Eq32F x y) -> (SETEQF (UCOMISS x y))

@@ -279,13 +283,16 @@
 (Neq16  x y) -> (SETNE (CMPW x y))
 (Neq8   x y) -> (SETNE (CMPB x y))
 (NeqB   x y) -> (SETNE (CMPB x y))
-(NeqPtr x y) -> (SETNE (CMPQ x y))
+(NeqPtr x y) && config.PtrSize == 8 -> (SETNE (CMPQ x y))
+(NeqPtr x y) && config.PtrSize == 4 -> (SETNE (CMPL x y))
 (Neq64F x y) -> (SETNEF (UCOMISD x y))
 (Neq32F x y) -> (SETNEF (UCOMISS x y))

+(Int64Hi x) -> (SHRQconst [32] x) // needed for amd64p32
+
 // Lowering loads
-(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVQload ptr mem)
-(Load <t> ptr mem) && is32BitInt(t) -> (MOVLload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t) && config.PtrSize == 8) -> (MOVQload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t) && config.PtrSize == 4) -> (MOVLload ptr mem)
 (Load <t> ptr mem) && is16BitInt(t) -> (MOVWload ptr mem)
 (Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) -> (MOVBload ptr mem)
 (Load <t> ptr mem) && is32BitFloat(t) -> (MOVSSload ptr mem)
@@ -302,39 +309,47 @@
 (Store [1] ptr val mem) -> (MOVBstore ptr val mem)

 // Lowering moves
-(Move [0] _ _ mem) -> mem
-(Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
-(Move [2] dst src mem) -> (MOVWstore dst (MOVWload src mem) mem)
-(Move [4] dst src mem) -> (MOVLstore dst (MOVLload src mem) mem)
-(Move [8] dst src mem) -> (MOVQstore dst (MOVQload src mem) mem)
-(Move [16] dst src mem) -> (MOVOstore dst (MOVOload src mem) mem)
-(Move [3] dst src mem) ->
+(Move [s] _ _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstore dst (MOVWload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstore dst (MOVLload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 -> (MOVQstore dst (MOVQload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 16 -> (MOVOstore dst (MOVOload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 ->
 	(MOVBstore [2] dst (MOVBload [2] src mem)
 		(MOVWstore dst (MOVWload src mem) mem))
-(Move [5] dst src mem) ->
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 5 ->
 	(MOVBstore [4] dst (MOVBload [4] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
-(Move [6] dst src mem) ->
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 6 ->
 	(MOVWstore [4] dst (MOVWload [4] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
-(Move [7] dst src mem) ->
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 7 ->
 	(MOVLstore [3] dst (MOVLload [3] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
-(Move [size] dst src mem) && size > 8 && size < 16 ->
-	(MOVQstore [size-8] dst (MOVQload [size-8] src mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() < 16 ->
+	(MOVQstore [SizeAndAlign(s).Size()-8] dst (MOVQload [SizeAndAlign(s).Size()-8] src mem)
 		(MOVQstore dst (MOVQload src mem) mem))

 // Adjust moves to be a multiple of 16 bytes.
-(Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 <= 8 ->
-	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
+(Move [s] dst src mem)
+	&& SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size()%16 != 0 && SizeAndAlign(s).Size()%16 <= 8 ->
+	(Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%16]
+		(OffPtr <dst.Type> dst [SizeAndAlign(s).Size()%16])
+		(OffPtr <src.Type> src [SizeAndAlign(s).Size()%16])
 		(MOVQstore dst (MOVQload src mem) mem))
-(Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 > 8 ->
-	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
+(Move [s] dst src mem)
+	&& SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size()%16 != 0 && SizeAndAlign(s).Size()%16 > 8 ->
+	(Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%16]
+		(OffPtr <dst.Type> dst [SizeAndAlign(s).Size()%16])
+		(OffPtr <src.Type> src [SizeAndAlign(s).Size()%16])
 		(MOVOstore dst (MOVOload src mem) mem))

 // Medium copying uses a duff device.
-(Move [size] dst src mem) && size >= 32 && size <= 16*64 && size%16 == 0 && !config.noDuffDevice ->
-	(DUFFCOPY [14*(64-size/16)] dst src mem)
+(Move [s] dst src mem)
+	&& SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0
+	&& !config.noDuffDevice ->
+	(DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem)
 // 14 and 64 are magic constants.  14 is the number of bytes to encode:
 //	MOVUPS	(SI), X0
 //	ADDQ	$16, SI
@@ -343,57 +358,62 @@
 // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.

 // Large copying uses REP MOVSQ.
-(Move [size] dst src mem) && (size > 16*64 || config.noDuffDevice) && size%8 == 0 ->
-	(REPMOVSQ dst src (MOVQconst [size/8]) mem)
+(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 ->
+	(REPMOVSQ dst src (MOVQconst [SizeAndAlign(s).Size()/8]) mem)

 // Lowering Zero instructions
-(Zero [0] _ mem) -> mem
-(Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
-(Zero [2] destptr mem) -> (MOVWstoreconst [0] destptr mem)
-(Zero [4] destptr mem) -> (MOVLstoreconst [0] destptr mem)
-(Zero [8] destptr mem) -> (MOVQstoreconst [0] destptr mem)
+(Zero [s] _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstoreconst [0] destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstoreconst [0] destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstoreconst [0] destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 8 -> (MOVQstoreconst [0] destptr mem)

-(Zero [3] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 3 ->
 	(MOVBstoreconst [makeValAndOff(0,2)] destptr
 		(MOVWstoreconst [0] destptr mem))
-(Zero [5] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 5 ->
 	(MOVBstoreconst [makeValAndOff(0,4)] destptr
 		(MOVLstoreconst [0] destptr mem))
-(Zero [6] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 6 ->
 	(MOVWstoreconst [makeValAndOff(0,4)] destptr
 		(MOVLstoreconst [0] destptr mem))
-(Zero [7] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 7 ->
 	(MOVLstoreconst [makeValAndOff(0,3)] destptr
 		(MOVLstoreconst [0] destptr mem))

 // Strip off any fractional word zeroing.
-(Zero [size] destptr mem) && size%8 != 0 && size > 8 ->
-	(Zero [size-size%8] (ADDQconst destptr [size%8])
+(Zero [s] destptr mem) && SizeAndAlign(s).Size()%8 != 0 && SizeAndAlign(s).Size() > 8 ->
+	(Zero [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%8] (OffPtr <destptr.Type> destptr [SizeAndAlign(s).Size()%8])
 		(MOVQstoreconst [0] destptr mem))

 // Zero small numbers of words directly.
-(Zero [16] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 16 ->
 	(MOVQstoreconst [makeValAndOff(0,8)] destptr
 		(MOVQstoreconst [0] destptr mem))
-(Zero [24] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 24 ->
 	(MOVQstoreconst [makeValAndOff(0,16)] destptr
 		(MOVQstoreconst [makeValAndOff(0,8)] destptr
 			(MOVQstoreconst [0] destptr mem)))
-(Zero [32] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 32 ->
 	(MOVQstoreconst [makeValAndOff(0,24)] destptr
 		(MOVQstoreconst [makeValAndOff(0,16)] destptr
 			(MOVQstoreconst [makeValAndOff(0,8)] destptr
 				(MOVQstoreconst [0] destptr mem))))

 // Medium zeroing uses a duff device.
-(Zero [size] destptr mem) && size <= 1024 && size%8 == 0 && size%16 != 0 && !config.noDuffDevice ->
-	(Zero [size-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem))
-(Zero [size] destptr mem) && size <= 1024 && size%16 == 0 && !config.noDuffDevice ->
-	(DUFFZERO [size] destptr (MOVOconst [0]) mem)
+(Zero [s] destptr mem)
+	&& SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%8 == 0 && SizeAndAlign(s).Size()%16 != 0
+	&& !config.noDuffDevice ->
+	(Zero [SizeAndAlign(s).Size()-8] (OffPtr <destptr.Type> [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem))
+(Zero [s] destptr mem)
+	&& SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice ->
+	(DUFFZERO [SizeAndAlign(s).Size()] destptr (MOVOconst [0]) mem)

 // Large zeroing uses REP STOSQ.
-(Zero [size] destptr mem) && (size > 1024 || (config.noDuffDevice && size > 32)) && size%8 == 0 ->
-	(REPSTOSQ destptr (MOVQconst [size/8]) (MOVQconst [0]) mem)
+(Zero [s] destptr mem)
+	&& (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32))
+	&& SizeAndAlign(s).Size()%8 == 0 ->
+	(REPSTOSQ destptr (MOVQconst [SizeAndAlign(s).Size()/8]) (MOVQconst [0]) mem)

 // Lowering constants
 (Const8   [val]) -> (MOVLconst [val])
@@ -402,7 +422,8 @@
 (Const64  [val]) -> (MOVQconst [val])
 (Const32F [val]) -> (MOVSSconst [val])
 (Const64F [val]) -> (MOVSDconst [val])
-(ConstNil) -> (MOVQconst [0])
+(ConstNil) && config.PtrSize == 8 -> (MOVQconst [0])
+(ConstNil) && config.PtrSize == 4 -> (MOVLconst [0])
 (ConstBool [b]) -> (MOVLconst [b])

 // Lowering calls
@@ -413,15 +434,17 @@
 (InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)

 // Miscellaneous
-(Convert <t> x mem) -> (MOVQconvert <t> x mem)
-(IsNonNil p) -> (SETNE (TESTQ p p))
+(Convert <t> x mem) && config.PtrSize == 8 -> (MOVQconvert <t> x mem)
+(Convert <t> x mem) && config.PtrSize == 4 -> (MOVLconvert <t> x mem)
+(IsNonNil p) && config.PtrSize == 8 -> (SETNE (TESTQ p p))
+(IsNonNil p) && config.PtrSize == 4 -> (SETNE (TESTL p p))
 (IsInBounds idx len) -> (SETB (CMPQ idx len))
 (IsSliceInBounds idx len) -> (SETBE (CMPQ idx len))
 (NilCheck ptr mem) -> (LoweredNilCheck ptr mem)
 (GetG mem) -> (LoweredGetG mem)
 (GetClosurePtr) -> (LoweredGetClosurePtr)
-(Addr {sym} base) -> (LEAQ {sym} base)
-(ITab (Load ptr mem)) -> (MOVQload ptr mem)
+(Addr {sym} base) && config.PtrSize == 8 -> (LEAQ {sym} base)
+(Addr {sym} base) && config.PtrSize == 4 -> (LEAL {sym} base)

 // block rewrites
 (If (SETL  cmp) yes no) -> (LT  cmp yes no)
@@ -495,6 +518,12 @@
 (ANDLconst [c] (ANDLconst [d] x)) -> (ANDLconst [c & d] x)
 (ANDQconst [c] (ANDQconst [d] x)) -> (ANDQconst [c & d] x)

+(XORLconst [c] (XORLconst [d] x)) -> (XORLconst [c ^ d] x)
+(XORQconst [c] (XORQconst [d] x)) -> (XORQconst [c ^ d] x)
+
+(MULLconst [c] (MULLconst [d] x)) -> (MULLconst [int64(int32(c * d))] x)
+(MULQconst [c] (MULQconst [d] x)) && is32Bit(c*d) -> (MULQconst [c * d] x)
+
 (ORQ x (MOVQconst [c])) && is32Bit(c) -> (ORQconst [c] x)
 (ORQ (MOVQconst [c]) x) && is32Bit(c) -> (ORQconst [c] x)
 (ORL x (MOVLconst [c])) -> (ORLconst [c] x)
@@ -544,6 +573,16 @@
 (SHRL x (ANDLconst [31] y)) -> (SHRL x y)
 (SHRQ x (ANDQconst [63] y)) -> (SHRQ x y)

+(ROLQconst [c] (ROLQconst [d] x)) -> (ROLQconst [(c+d)&63] x)
+(ROLLconst [c] (ROLLconst [d] x)) -> (ROLLconst [(c+d)&31] x)
+(ROLWconst [c] (ROLWconst [d] x)) -> (ROLWconst [(c+d)&15] x)
+(ROLBconst [c] (ROLBconst [d] x)) -> (ROLBconst [(c+d)& 7] x)
+
+(ROLQconst [0] x) -> x
+(ROLLconst [0] x) -> x
+(ROLWconst [0] x) -> x
+(ROLBconst [0] x) -> x
+
 // Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
 // because the x86 instructions are defined to use all 5 bits of the shift even
 // for the small shifts. I don't think we'll ever generate a weird shift (e.g.
@@ -1564,3 +1603,53 @@
  && x.Uses == 1
  && clobber(x)
  -> (MOVQstoreidx1 [i-4] {s} p (SHLQconst <idx.Type> [2] idx) w0 mem)
+
+// amd64p32 rules
+// same as the rules above, but with 32 instead of 64 bit pointer arithmetic.
+// LEAQ,ADDQ -> LEAL,ADDL
+(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
+(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
+
+(MOVQload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVQload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVLload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVLload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVWload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVBload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVQstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVQstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVLstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVLstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVWstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVWstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVBstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVBstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+(MOVQstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVQstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVLstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVWstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVWstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVBstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVBstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+
+(MOVQload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVQload  [off1+off2] {sym} ptr mem)
+(MOVLload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVLload  [off1+off2] {sym} ptr mem)
+(MOVWload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVWload  [off1+off2] {sym} ptr mem)
+(MOVBload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVBload  [off1+off2] {sym} ptr mem)
+(MOVQstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVQstore  [off1+off2] {sym} ptr val mem)
+(MOVLstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVLstore  [off1+off2] {sym} ptr val mem)
+(MOVWstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVWstore  [off1+off2] {sym} ptr val mem)
+(MOVBstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVBstore  [off1+off2] {sym} ptr val mem)
+(MOVQstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVQstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
+(MOVLstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVLstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
+(MOVWstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVWstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
+(MOVBstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVBstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -64,7 +64,6 @@ var regNamesAMD64 = []string{

 	// pseudo-registers
 	"SB",
-	"FLAGS",
 }

 func init() {
@@ -98,43 +97,36 @@ func init() {
 		fp         = buildReg("X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15")
 		gpsp       = gp | buildReg("SP")
 		gpspsb     = gpsp | buildReg("SB")
-		flags      = buildReg("FLAGS")
-		callerSave = gp | fp | flags
+		callerSave = gp | fp
 	)
 	// Common slices of register masks
 	var (
-		gponly    = []regMask{gp}
-		fponly    = []regMask{fp}
-		flagsonly = []regMask{flags}
+		gponly = []regMask{gp}
+		fponly = []regMask{fp}
 	)

 	// Common regInfo
 	var (
-		gp01      = regInfo{inputs: []regMask{}, outputs: gponly}
-		gp11      = regInfo{inputs: []regMask{gp}, outputs: gponly, clobbers: flags}
-		gp11sp    = regInfo{inputs: []regMask{gpsp}, outputs: gponly, clobbers: flags}
-		gp11nf    = regInfo{inputs: []regMask{gpsp}, outputs: gponly} // nf: no flags clobbered
+		gp01      = regInfo{inputs: nil, outputs: gponly}
+		gp11      = regInfo{inputs: []regMask{gp}, outputs: gponly}
+		gp11sp    = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
 		gp11sb    = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
-		gp21      = regInfo{inputs: []regMask{gp, gp}, outputs: gponly, clobbers: flags}
-		gp21sp    = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly, clobbers: flags}
+		gp21      = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+		gp21sp    = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
 		gp21sb    = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
-		gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}, clobbers: flags}
-		gp11div   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax},
-			clobbers: dx | flags}
-		gp11hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx},
-			clobbers: ax | flags}
-		gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx},
-			clobbers: ax | flags}
+		gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+		gp11div   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
+		gp21hmul  = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}

-		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: flagsonly}
-		gp1flags = regInfo{inputs: []regMask{gpsp}, outputs: flagsonly}
-		flagsgp  = regInfo{inputs: flagsonly, outputs: gponly}
+		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+		gp1flags = regInfo{inputs: []regMask{gpsp}}
+		flagsgp  = regInfo{inputs: nil, outputs: gponly}

 		// for CMOVconst -- uses AX to hold constant temporary.
-		gp1flagsgp = regInfo{inputs: []regMask{gp &^ ax, flags}, clobbers: ax | flags, outputs: []regMask{gp &^ ax}}
+		gp1flagsgp = regInfo{inputs: []regMask{gp &^ ax}, clobbers: ax, outputs: []regMask{gp &^ ax}}

-		readflags = regInfo{inputs: flagsonly, outputs: gponly}
-		flagsgpax = regInfo{inputs: flagsonly, clobbers: ax | flags, outputs: []regMask{gp &^ ax}}
+		readflags = regInfo{inputs: nil, outputs: gponly}
+		flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}

 		gpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
 		gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
@@ -144,14 +136,14 @@ func init() {
 		gpstoreidx      = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
 		gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}

-		fp01    = regInfo{inputs: []regMask{}, outputs: fponly}
+		fp01    = regInfo{inputs: nil, outputs: fponly}
 		fp21    = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
 		fp21x15 = regInfo{inputs: []regMask{fp &^ x15, fp &^ x15},
 			clobbers: x15, outputs: []regMask{fp &^ x15}}
 		fpgp     = regInfo{inputs: fponly, outputs: gponly}
 		gpfp     = regInfo{inputs: gponly, outputs: fponly}
 		fp11     = regInfo{inputs: fponly, outputs: fponly}
-		fp2flags = regInfo{inputs: []regMask{fp, fp}, outputs: flagsonly}
+		fp2flags = regInfo{inputs: []regMask{fp, fp}}

 		fpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
 		fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
@@ -188,60 +180,53 @@ func init() {
 		{name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by 8i store

 		// binary ops
-		{name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true},                // arg0 + arg1
-		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true},                // arg0 + arg1
-		{name: "ADDQconst", argLength: 1, reg: gp11sp, asm: "ADDQ", aux: "Int64", typ: "UInt64"}, // arg0 + auxint
-		{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32"},                // arg0 + auxint
+		{name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true, clobberFlags: true},                // arg0 + arg1
+		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true},                // arg0 + arg1
+		{name: "ADDQconst", argLength: 1, reg: gp11sp, asm: "ADDQ", aux: "Int64", typ: "UInt64", clobberFlags: true}, // arg0 + auxint
+		{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", clobberFlags: true},                // arg0 + auxint

-		{name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true},                    // arg0 - arg1
-		{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true},                    // arg0 - arg1
-		{name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int64", resultInArg0: true}, // arg0 - auxint
-		{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0 - auxint
+		{name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true, clobberFlags: true},                    // arg0 - arg1
+		{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true},                    // arg0 - arg1
+		{name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+		{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint

-		{name: "MULQ", argLength: 2, reg: gp21, asm: "IMULQ", commutative: true, resultInArg0: true}, // arg0 * arg1
-		{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true}, // arg0 * arg1
-		{name: "MULQconst", argLength: 1, reg: gp11, asm: "IMULQ", aux: "Int64", resultInArg0: true}, // arg0 * auxint
-		{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMULL", aux: "Int32", resultInArg0: true}, // arg0 * auxint
+		{name: "MULQ", argLength: 2, reg: gp21, asm: "IMULQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+		{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+		{name: "MULQconst", argLength: 1, reg: gp11, asm: "IMULQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+		{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMULL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 * auxint

-		{name: "HMULQ", argLength: 2, reg: gp11hmul, asm: "IMULQ"}, // (arg0 * arg1) >> width
-		{name: "HMULL", argLength: 2, reg: gp11hmul, asm: "IMULL"}, // (arg0 * arg1) >> width
-		{name: "HMULW", argLength: 2, reg: gp11hmul, asm: "IMULW"}, // (arg0 * arg1) >> width
-		{name: "HMULB", argLength: 2, reg: gp11hmul, asm: "IMULB"}, // (arg0 * arg1) >> width
-		{name: "HMULQU", argLength: 2, reg: gp11hmul, asm: "MULQ"}, // (arg0 * arg1) >> width
-		{name: "HMULLU", argLength: 2, reg: gp11hmul, asm: "MULL"}, // (arg0 * arg1) >> width
-		{name: "HMULWU", argLength: 2, reg: gp11hmul, asm: "MULW"}, // (arg0 * arg1) >> width
-		{name: "HMULBU", argLength: 2, reg: gp11hmul, asm: "MULB"}, // (arg0 * arg1) >> width
+		{name: "HMULQ", argLength: 2, reg: gp21hmul, asm: "IMULQ", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULL", argLength: 2, reg: gp21hmul, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULW", argLength: 2, reg: gp21hmul, asm: "IMULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULB", argLength: 2, reg: gp21hmul, asm: "IMULB", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULQU", argLength: 2, reg: gp21hmul, asm: "MULQ", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULLU", argLength: 2, reg: gp21hmul, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULWU", argLength: 2, reg: gp21hmul, asm: "MULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULBU", argLength: 2, reg: gp21hmul, asm: "MULB", clobberFlags: true}, // (arg0 * arg1) >> width

-		{name: "AVGQU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true}, // (arg0 + arg1) / 2 as unsigned, all 64 result bits
+		{name: "AVGQU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 + arg1) / 2 as unsigned, all 64 result bits

-		{name: "DIVQ", argLength: 2, reg: gp11div, asm: "IDIVQ"}, // arg0 / arg1
-		{name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL"}, // arg0 / arg1
-		{name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW"}, // arg0 / arg1
-		{name: "DIVQU", argLength: 2, reg: gp11div, asm: "DIVQ"}, // arg0 / arg1
-		{name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL"}, // arg0 / arg1
-		{name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW"}, // arg0 / arg1
+		{name: "DIVQ", argLength: 2, reg: gp11div, typ: "(Int64,Int64)", asm: "IDIVQ", clobberFlags: true},   // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVL", argLength: 2, reg: gp11div, typ: "(Int32,Int32)", asm: "IDIVL", clobberFlags: true},   // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVW", argLength: 2, reg: gp11div, typ: "(Int16,Int16)", asm: "IDIVW", clobberFlags: true},   // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVQU", argLength: 2, reg: gp11div, typ: "(UInt64,UInt64)", asm: "DIVQ", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVLU", argLength: 2, reg: gp11div, typ: "(UInt32,UInt32)", asm: "DIVL", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVWU", argLength: 2, reg: gp11div, typ: "(UInt16,UInt16)", asm: "DIVW", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]

-		{name: "MODQ", argLength: 2, reg: gp11mod, asm: "IDIVQ"}, // arg0 % arg1
-		{name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL"}, // arg0 % arg1
-		{name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW"}, // arg0 % arg1
-		{name: "MODQU", argLength: 2, reg: gp11mod, asm: "DIVQ"}, // arg0 % arg1
-		{name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL"}, // arg0 % arg1
-		{name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW"}, // arg0 % arg1
+		{name: "ANDQ", argLength: 2, reg: gp21, asm: "ANDQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+		{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+		{name: "ANDQconst", argLength: 1, reg: gp11, asm: "ANDQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+		{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint

-		{name: "ANDQ", argLength: 2, reg: gp21, asm: "ANDQ", commutative: true, resultInArg0: true}, // arg0 & arg1
-		{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true}, // arg0 & arg1
-		{name: "ANDQconst", argLength: 1, reg: gp11, asm: "ANDQ", aux: "Int64", resultInArg0: true}, // arg0 & auxint
-		{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true}, // arg0 & auxint
+		{name: "ORQ", argLength: 2, reg: gp21, asm: "ORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+		{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+		{name: "ORQconst", argLength: 1, reg: gp11, asm: "ORQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+		{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint

-		{name: "ORQ", argLength: 2, reg: gp21, asm: "ORQ", commutative: true, resultInArg0: true}, // arg0 | arg1
-		{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true}, // arg0 | arg1
-		{name: "ORQconst", argLength: 1, reg: gp11, asm: "ORQ", aux: "Int64", resultInArg0: true}, // arg0 | auxint
-		{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true}, // arg0 | auxint
-
-		{name: "XORQ", argLength: 2, reg: gp21, asm: "XORQ", commutative: true, resultInArg0: true}, // arg0 ^ arg1
-		{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true}, // arg0 ^ arg1
-		{name: "XORQconst", argLength: 1, reg: gp11, asm: "XORQ", aux: "Int64", resultInArg0: true}, // arg0 ^ auxint
-		{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true}, // arg0 ^ auxint
+		{name: "XORQ", argLength: 2, reg: gp21, asm: "XORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+		{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+		{name: "XORQconst", argLength: 1, reg: gp11, asm: "XORQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+		{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint

 		{name: "CMPQ", argLength: 2, reg: gp2flags, asm: "CMPQ", typ: "Flags"},                    // arg0 compare to arg1
 		{name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"},                    // arg0 compare to arg1
@@ -264,60 +249,60 @@ func init() {
 		{name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
 		{name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"},  // (arg0 & auxint) compare to 0

-		{name: "SHLQ", argLength: 2, reg: gp21shift, asm: "SHLQ", resultInArg0: true},               // arg0 << arg1, shift amount is mod 64
-		{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true},               // arg0 << arg1, shift amount is mod 32
-		{name: "SHLQconst", argLength: 1, reg: gp11, asm: "SHLQ", aux: "Int64", resultInArg0: true}, // arg0 << auxint, shift amount 0-63
-		{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true}, // arg0 << auxint, shift amount 0-31
+		{name: "SHLQ", argLength: 2, reg: gp21shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true},               // arg0 << arg1, shift amount is mod 64
+		{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true},               // arg0 << arg1, shift amount is mod 32
+		{name: "SHLQconst", argLength: 1, reg: gp11, asm: "SHLQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-63
+		{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
 		// Note: x86 is weird, the 16 and 8 byte shifts still use all 5 bits of shift amount!

-		{name: "SHRQ", argLength: 2, reg: gp21shift, asm: "SHRQ", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 64
-		{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
-		{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
-		{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
-		{name: "SHRQconst", argLength: 1, reg: gp11, asm: "SHRQ", aux: "Int64", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-63
-		{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-31
-		{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-31
-		{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true},  // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRQ", argLength: 2, reg: gp21shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 64
+		{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRQconst", argLength: 1, reg: gp11, asm: "SHRQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-63
+		{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // unsigned arg0 >> auxint, shift amount 0-31

-		{name: "SARQ", argLength: 2, reg: gp21shift, asm: "SARQ", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 64
-		{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
-		{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
-		{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
-		{name: "SARQconst", argLength: 1, reg: gp11, asm: "SARQ", aux: "Int64", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-63
-		{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-31
-		{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-31
-		{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true},  // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARQ", argLength: 2, reg: gp21shift, asm: "SARQ", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 64
+		{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARQconst", argLength: 1, reg: gp11, asm: "SARQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-63
+		{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // signed arg0 >> auxint, shift amount 0-31

-		{name: "ROLQconst", argLength: 1, reg: gp11, asm: "ROLQ", aux: "Int64", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-63
-		{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-31
-		{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-15
-		{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true},  // arg0 rotate left auxint, rotate amount 0-7
+		{name: "ROLQconst", argLength: 1, reg: gp11, asm: "ROLQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-63
+		{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
+		{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
+		{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // arg0 rotate left auxint, rotate amount 0-7

 		// unary ops
-		{name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true}, // -arg0
-		{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true}, // -arg0
+		{name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true, clobberFlags: true}, // -arg0
+		{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0

-		{name: "NOTQ", argLength: 1, reg: gp11, asm: "NOTQ", resultInArg0: true}, // ^arg0
-		{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
+		{name: "NOTQ", argLength: 1, reg: gp11, asm: "NOTQ", resultInArg0: true, clobberFlags: true}, // ^arg0
+		{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true, clobberFlags: true}, // ^arg0

-		{name: "BSFQ", argLength: 1, reg: gp11, asm: "BSFQ"}, // arg0 # of low-order zeroes ; undef if zero
-		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL"}, // arg0 # of low-order zeroes ; undef if zero
-		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW"}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFQ", argLength: 1, reg: gp11, asm: "BSFQ", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero

-		{name: "BSRQ", argLength: 1, reg: gp11, asm: "BSRQ"}, // arg0 # of high-order zeroes ; undef if zero
-		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL"}, // arg0 # of high-order zeroes ; undef if zero
-		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW"}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRQ", argLength: 1, reg: gp11, asm: "BSRQ", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero

 		// Note ASM for ops moves whole register
-		{name: "CMOVQEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQEQ", typ: "UInt64", aux: "Int64", resultInArg0: true}, // replace arg0 w/ constant if Z set
-		{name: "CMOVLEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt32", aux: "Int32", resultInArg0: true}, // replace arg0 w/ constant if Z set
-		{name: "CMOVWEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt16", aux: "Int16", resultInArg0: true}, // replace arg0 w/ constant if Z set
-		{name: "CMOVQNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQNE", typ: "UInt64", aux: "Int64", resultInArg0: true}, // replace arg0 w/ constant if Z not set
-		{name: "CMOVLNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt32", aux: "Int32", resultInArg0: true}, // replace arg0 w/ constant if Z not set
-		{name: "CMOVWNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt16", aux: "Int16", resultInArg0: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVQEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQEQ", typ: "UInt64", aux: "Int64", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVLEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt32", aux: "Int32", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVWEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt16", aux: "Int16", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVQNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQNE", typ: "UInt64", aux: "Int64", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVLNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt32", aux: "Int32", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVWNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt16", aux: "Int16", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z not set

-		{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true}, // arg0 swap bytes
-		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, // arg0 swap bytes
+		{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes

 		{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)

@@ -338,20 +323,20 @@ func init() {
 		// Need different opcodes for floating point conditions because
 		// any comparison involving a NaN is always FALSE and thus
 		// the patterns for inverting conditions cannot be used.
-		{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ"}, // extract == condition from arg0
-		{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE"}, // extract != condition from arg0
-		{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"},   // extract "ordered" (No Nan present) condition from arg0
-		{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"},   // extract "unordered" (Nan present) condition from arg0
+		{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
+		{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
+		{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"},                       // extract "ordered" (No Nan present) condition from arg0
+		{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"},                       // extract "unordered" (Nan present) condition from arg0

 		{name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"},  // extract floating > condition from arg0
 		{name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0

-		{name: "MOVBQSX", argLength: 1, reg: gp11nf, asm: "MOVBQSX"}, // sign extend arg0 from int8 to int64
-		{name: "MOVBQZX", argLength: 1, reg: gp11nf, asm: "MOVBQZX"}, // zero extend arg0 from int8 to int64
-		{name: "MOVWQSX", argLength: 1, reg: gp11nf, asm: "MOVWQSX"}, // sign extend arg0 from int16 to int64
-		{name: "MOVWQZX", argLength: 1, reg: gp11nf, asm: "MOVWQZX"}, // zero extend arg0 from int16 to int64
-		{name: "MOVLQSX", argLength: 1, reg: gp11nf, asm: "MOVLQSX"}, // sign extend arg0 from int32 to int64
-		{name: "MOVLQZX", argLength: 1, reg: gp11nf, asm: "MOVLQZX"}, // zero extend arg0 from int32 to int64
+		{name: "MOVBQSX", argLength: 1, reg: gp11, asm: "MOVBQSX"}, // sign extend arg0 from int8 to int64
+		{name: "MOVBQZX", argLength: 1, reg: gp11, asm: "MOVBQZX"}, // zero extend arg0 from int8 to int64
+		{name: "MOVWQSX", argLength: 1, reg: gp11, asm: "MOVWQSX"}, // sign extend arg0 from int16 to int64
+		{name: "MOVWQZX", argLength: 1, reg: gp11, asm: "MOVWQZX"}, // zero extend arg0 from int16 to int64
+		{name: "MOVLQSX", argLength: 1, reg: gp11, asm: "MOVLQSX"}, // sign extend arg0 from int32 to int64
+		{name: "MOVLQZX", argLength: 1, reg: gp11, asm: "MOVLQZX"}, // zero extend arg0 from int32 to int64

 		{name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
 		{name: "MOVQconst", reg: gp01, asm: "MOVQ", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
@@ -369,13 +354,15 @@ func init() {

 		{name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.

-		{name: "LEAQ", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
-		{name: "LEAQ1", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + arg1 + auxint + aux
-		{name: "LEAQ2", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 2*arg1 + auxint + aux
-		{name: "LEAQ4", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 4*arg1 + auxint + aux
-		{name: "LEAQ8", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 8*arg1 + auxint + aux
+		{name: "LEAQ", argLength: 1, reg: gp11sb, asm: "LEAQ", aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
+		{name: "LEAQ1", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + arg1 + auxint + aux
+		{name: "LEAQ2", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + 2*arg1 + auxint + aux
+		{name: "LEAQ4", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + 4*arg1 + auxint + aux
+		{name: "LEAQ8", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + 8*arg1 + auxint + aux
 		// Note: LEAQ{1,2,4,8} must not have OpSB as either argument.

+		{name: "LEAL", argLength: 1, reg: gp11sb, asm: "LEAL", aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
+
 		// auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
 		{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8"},  // load byte from arg0+auxint+aux. arg1=mem.  Zero extend.
 		{name: "MOVBQSXload", argLength: 2, reg: gpload, asm: "MOVBQSX", aux: "SymOff"},             // ditto, sign extend to int64
@@ -436,8 +423,9 @@ func init() {
 			argLength: 3,
 			reg: regInfo{
 				inputs:   []regMask{buildReg("DI"), buildReg("X0")},
-				clobbers: buildReg("DI FLAGS"),
+				clobbers: buildReg("DI"),
 			},
+			clobberFlags: true,
 		},
 		{name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true},

@@ -455,11 +443,11 @@ func init() {
 			},
 		},

-		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff"},                                // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
-		{name: "CALLclosure", argLength: 3, reg: regInfo{[]regMask{gpsp, buildReg("DX"), 0}, callerSave, nil}, aux: "Int64"}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
-		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64"},                                  // call deferproc.  arg0=mem, auxint=argsize, returns mem
-		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64"},                                     // call newproc.  arg0=mem, auxint=argsize, returns mem
-		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64"},           // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff", clobberFlags: true},                                             // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "Int64", clobberFlags: true}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                               // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                                  // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64", clobberFlags: true},                        // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem

 		// arg0 = destination pointer
 		// arg1 = source pointer
@@ -472,8 +460,9 @@ func init() {
 			argLength: 3,
 			reg: regInfo{
 				inputs:   []regMask{buildReg("DI"), buildReg("SI")},
-				clobbers: buildReg("DI SI X0 FLAGS"), // uses X0 as a temporary
+				clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
 			},
+			clobberFlags: true,
 		},

 		// arg0 = destination pointer
@@ -504,14 +493,15 @@ func init() {
 		// use of DX (the closure pointer)
 		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}},
 		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
-		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}, clobbers: flags}},
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true},

 		// MOVQconvert converts between pointers and integers.
 		// We have a special op for this so as to not confuse GC
 		// (particularly stack maps).  It takes a memory arg so it
 		// gets correctly ordered with respect to GC safepoints.
 		// arg0=ptr/int arg1=mem, output=int/ptr
-		{name: "MOVQconvert", argLength: 2, reg: gp11nf, asm: "MOVQ"},
+		{name: "MOVQconvert", argLength: 2, reg: gp11, asm: "MOVQ"},
+		{name: "MOVLconvert", argLength: 2, reg: gp11, asm: "MOVL"}, // amd64p32 equivalent

 		// Constant flag values. For any comparison, there are 5 possible
 		// outcomes: the three from the signed total order (<,==,>) and the
@@ -545,11 +535,14 @@ func init() {
 	}

 	archs = append(archs, arch{
-		name:     "AMD64",
-		pkg:      "cmd/internal/obj/x86",
-		genfile:  "../../amd64/ssa.go",
-		ops:      AMD64ops,
-		blocks:   AMD64blocks,
-		regnames: regNamesAMD64,
+		name:            "AMD64",
+		pkg:             "cmd/internal/obj/x86",
+		genfile:         "../../amd64/ssa.go",
+		ops:             AMD64ops,
+		blocks:          AMD64blocks,
+		regnames:        regNamesAMD64,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		framepointerreg: int8(num["BP"]),
 	})
 }
--- a/src/cmd/compile/internal/ssa/gen/ARM.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM.rules
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
--- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
@@ -0,0 +1,455 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+//  - Integer types live in the low portion of registers. Upper portions are junk.
+//  - Boolean types use the low-order byte of a register. 0=false, 1=true.
+//    Upper bytes are junk.
+//  - *const instructions may use a constant larger than the instuction can encode.
+//    In this case the assembler expands to multiple instructions and uses tmp
+//    register (R27).
+
+// Suffixes encode the bit width of various instructions.
+// D (double word) = 64 bit
+// W (word)        = 32 bit
+// H (half word)   = 16 bit
+// HU              = 16 bit unsigned
+// B (byte)        = 8 bit
+// BU              = 8 bit unsigned
+// S (single)      = 32 bit float
+// D (double)      = 64 bit float
+
+// Note: registers not used in regalloc are not included in this list,
+// so that regmask stays within int64
+// Be careful when hand coding regmasks.
+var regNamesARM64 = []string{
+	"R0",
+	"R1",
+	"R2",
+	"R3",
+	"R4",
+	"R5",
+	"R6",
+	"R7",
+	"R8",
+	"R9",
+	"R10",
+	"R11",
+	"R12",
+	"R13",
+	"R14",
+	"R15",
+	"R16",
+	"R17",
+	"R18", // platform register, not used
+	"R19",
+	"R20",
+	"R21",
+	"R22",
+	"R23",
+	"R24",
+	"R25",
+	"R26",
+	// R27 = REGTMP not used in regalloc
+	"g",   // aka R28
+	"R29", // frame pointer, not used
+	// R30 = REGLINK not used in regalloc
+	"SP", // aka R31
+
+	"F0",
+	"F1",
+	"F2",
+	"F3",
+	"F4",
+	"F5",
+	"F6",
+	"F7",
+	"F8",
+	"F9",
+	"F10",
+	"F11",
+	"F12",
+	"F13",
+	"F14",
+	"F15",
+	"F16",
+	"F17",
+	"F18",
+	"F19",
+	"F20",
+	"F21",
+	"F22",
+	"F23",
+	"F24",
+	"F25",
+	"F26",
+	"F27",
+	"F28", // 0.0
+	"F29", // 0.5
+	"F30", // 1.0
+	"F31", // 2.0
+
+	// pseudo-registers
+	"SB",
+}
+
+func init() {
+	// Make map from reg names to reg integers.
+	if len(regNamesARM64) > 64 {
+		panic("too many registers")
+	}
+	num := map[string]int{}
+	for i, name := range regNamesARM64 {
+		num[name] = i
+	}
+	buildReg := func(s string) regMask {
+		m := regMask(0)
+		for _, r := range strings.Split(s, " ") {
+			if n, ok := num[r]; ok {
+				m |= regMask(1) << uint(n)
+				continue
+			}
+			panic("register " + r + " not found")
+		}
+		return m
+	}
+
+	// Common individual register masks
+	var (
+		gp         = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26")
+		gpg        = gp | buildReg("g")
+		gpsp       = gp | buildReg("SP")
+		gpspg      = gpg | buildReg("SP")
+		gpspsbg    = gpspg | buildReg("SB")
+		fp         = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27")
+		callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+	)
+	// Common regInfo
+	var (
+		gp01      = regInfo{inputs: nil, outputs: []regMask{gp}}
+		gp11      = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+		gp11sp    = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+		gp1flags  = regInfo{inputs: []regMask{gpg}}
+		gp1flags1 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+		gp21      = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+		gp2flags  = regInfo{inputs: []regMask{gpg, gpg}}
+		gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+		//gp22      = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}}
+		//gp31      = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+		//gp3flags  = regInfo{inputs: []regMask{gp, gp, gp}}
+		//gp3flags1 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+		gpload   = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+		gpstore  = regInfo{inputs: []regMask{gpspsbg, gpg}}
+		gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+		//gp2load   = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+		//gp2store  = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
+		fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+		fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+		//fp1flags  = regInfo{inputs: []regMask{fp}}
+		fpgp      = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+		gpfp      = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+		fp21      = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+		fp2flags  = regInfo{inputs: []regMask{fp, fp}}
+		fpload    = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+		fpstore   = regInfo{inputs: []regMask{gpspsbg, fp}}
+		readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+	)
+	ops := []opData{
+		// binary ops
+		{name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true},     // arg0 + arg1
+		{name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int64"},   // arg0 + auxInt
+		{name: "SUB", argLength: 2, reg: gp21, asm: "SUB"},                        // arg0 - arg1
+		{name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int64"},     // arg0 - auxInt
+		{name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true},     // arg0 * arg1
+		{name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true},   // arg0 * arg1, 32-bit
+		{name: "MULH", argLength: 2, reg: gp21, asm: "SMULH", commutative: true},  // (arg0 * arg1) >> 64, signed
+		{name: "UMULH", argLength: 2, reg: gp21, asm: "UMULH", commutative: true}, // (arg0 * arg1) >> 64, unsigned
+		{name: "MULL", argLength: 2, reg: gp21, asm: "SMULL", commutative: true},  // arg0 * arg1, signed, 32-bit mult results in 64-bit
+		{name: "UMULL", argLength: 2, reg: gp21, asm: "UMULL", commutative: true}, // arg0 * arg1, unsigned, 32-bit mult results in 64-bit
+		{name: "DIV", argLength: 2, reg: gp21, asm: "SDIV"},                       // arg0 / arg1, signed
+		{name: "UDIV", argLength: 2, reg: gp21, asm: "UDIV"},                      // arg0 / arg1, unsighed
+		{name: "DIVW", argLength: 2, reg: gp21, asm: "SDIVW"},                     // arg0 / arg1, signed, 32 bit
+		{name: "UDIVW", argLength: 2, reg: gp21, asm: "UDIVW"},                    // arg0 / arg1, unsighed, 32 bit
+		{name: "MOD", argLength: 2, reg: gp21, asm: "REM"},                        // arg0 % arg1, signed
+		{name: "UMOD", argLength: 2, reg: gp21, asm: "UREM"},                      // arg0 % arg1, unsigned
+		{name: "MODW", argLength: 2, reg: gp21, asm: "REMW"},                      // arg0 % arg1, signed, 32 bit
+		{name: "UMODW", argLength: 2, reg: gp21, asm: "UREMW"},                    // arg0 % arg1, unsigned, 32 bit
+
+		{name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1
+		{name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1
+		{name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"},                    // arg0 - arg1
+		{name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"},                    // arg0 - arg1
+		{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1
+		{name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1
+		{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"},                    // arg0 / arg1
+		{name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"},                    // arg0 / arg1
+
+		{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+		{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
+		{name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true},  // arg0 | arg1
+		{name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int64"},  // arg0 | auxInt
+		{name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1
+		{name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int64"}, // arg0 ^ auxInt
+		{name: "BIC", argLength: 2, reg: gp21, asm: "BIC"},                    // arg0 &^ arg1
+		{name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int64"}, // arg0 &^ auxInt
+
+		// unary ops
+		{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"},       // ^arg0
+		{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"},       // -arg0
+		{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"},   // -arg0, float32
+		{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"},   // -arg0, float64
+		{name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
+
+		// shifts
+		{name: "SLL", argLength: 2, reg: gp21, asm: "LSL"},                      // arg0 << arg1, shift amount is mod 64
+		{name: "SLLconst", argLength: 1, reg: gp11, asm: "LSL", aux: "Int64"},   // arg0 << auxInt
+		{name: "SRL", argLength: 2, reg: gp21, asm: "LSR"},                      // arg0 >> arg1, unsigned, shift amount is mod 64
+		{name: "SRLconst", argLength: 1, reg: gp11, asm: "LSR", aux: "Int64"},   // arg0 >> auxInt, unsigned
+		{name: "SRA", argLength: 2, reg: gp21, asm: "ASR"},                      // arg0 >> arg1, signed, shift amount is mod 64
+		{name: "SRAconst", argLength: 1, reg: gp11, asm: "ASR", aux: "Int64"},   // arg0 >> auxInt, signed
+		{name: "RORconst", argLength: 1, reg: gp11, asm: "ROR", aux: "Int64"},   // arg0 right rotate by auxInt bits
+		{name: "RORWconst", argLength: 1, reg: gp11, asm: "RORW", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits
+
+		// comparisons
+		{name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"},                      // arg0 compare to arg1
+		{name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int64", typ: "Flags"},   // arg0 compare to auxInt
+		{name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"},                    // arg0 compare to arg1, 32 bit
+		{name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt, 32 bit
+		{name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags"},                      // arg0 compare to -arg1
+		{name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int64", typ: "Flags"},   // arg0 compare to -auxInt
+		{name: "CMNW", argLength: 2, reg: gp2flags, asm: "CMNW", typ: "Flags"},                    // arg0 compare to -arg1, 32 bit
+		{name: "CMNWconst", argLength: 1, reg: gp1flags, asm: "CMNW", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt, 32 bit
+		{name: "FCMPS", argLength: 2, reg: fp2flags, asm: "FCMPS", typ: "Flags"},                  // arg0 compare to arg1, float32
+		{name: "FCMPD", argLength: 2, reg: fp2flags, asm: "FCMPD", typ: "Flags"},                  // arg0 compare to arg1, float64
+
+		// shifted ops
+		{name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"},                   // arg0 + arg1<<auxInt
+		{name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"},                   // arg0 + arg1>>auxInt, unsigned shift
+		{name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"},                   // arg0 + arg1>>auxInt, signed shift
+		{name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"},                   // arg0 - arg1<<auxInt
+		{name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"},                   // arg0 - arg1>>auxInt, unsigned shift
+		{name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"},                   // arg0 - arg1>>auxInt, signed shift
+		{name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"},                   // arg0 & (arg1<<auxInt)
+		{name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"},                   // arg0 & (arg1>>auxInt), unsigned shift
+		{name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"},                   // arg0 & (arg1>>auxInt), signed shift
+		{name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"},                    // arg0 | arg1<<auxInt
+		{name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"},                    // arg0 | arg1>>auxInt, unsigned shift
+		{name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"},                    // arg0 | arg1>>auxInt, signed shift
+		{name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"},                   // arg0 ^ arg1<<auxInt
+		{name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"},                   // arg0 ^ arg1>>auxInt, unsigned shift
+		{name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"},                   // arg0 ^ arg1>>auxInt, signed shift
+		{name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"},                   // arg0 &^ (arg1<<auxInt)
+		{name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"},                   // arg0 &^ (arg1>>auxInt), unsigned shift
+		{name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"},                   // arg0 &^ (arg1>>auxInt), signed shift
+		{name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1<<auxInt
+		{name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift
+		{name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift
+
+		// moves
+		{name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "UInt64", rematerializeable: true},      // 32 low bits of auxint
+		{name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVS", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+		{name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+		{name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+		{name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8"},      // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8"},   // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16"},     // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16"},  // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32"},     // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32"},  // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVDload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVD", typ: "UInt64"},    // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "FMOVSload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVS", typ: "Float32"}, // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "FMOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVD", typ: "Float64"}, // load from arg0 + auxInt + aux.  arg1=mem.
+
+		{name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem"},   // store 1 byte of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem"},   // store 2 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem"},   // store 4 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "MOVDstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVD", typ: "Mem"},   // store 8 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "FMOVSstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVS", typ: "Mem"}, // store 4 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "FMOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVD", typ: "Mem"}, // store 8 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+
+		{name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem"}, // store 1 byte of zero to arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem"}, // store 8 bytes of zero to arg0 + auxInt + aux.  ar12=mem.
+
+		// conversions
+		{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"},   // move from arg0, sign-extended from byte
+		{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte
+		{name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"},   // move from arg0, sign-extended from half
+		{name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half
+		{name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"},   // move from arg0, sign-extended from word
+		{name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, unsign-extended from word
+		{name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOVD"},   // move from arg0
+
+		{name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+		{name: "SCVTFWS", argLength: 1, reg: gpfp, asm: "SCVTFWS"},   // int32 -> float32
+		{name: "SCVTFWD", argLength: 1, reg: gpfp, asm: "SCVTFWD"},   // int32 -> float64
+		{name: "UCVTFWS", argLength: 1, reg: gpfp, asm: "UCVTFWS"},   // uint32 -> float32
+		{name: "UCVTFWD", argLength: 1, reg: gpfp, asm: "UCVTFWD"},   // uint32 -> float64
+		{name: "SCVTFS", argLength: 1, reg: gpfp, asm: "SCVTFS"},     // int64 -> float32
+		{name: "SCVTFD", argLength: 1, reg: gpfp, asm: "SCVTFD"},     // int64 -> float64
+		{name: "UCVTFS", argLength: 1, reg: gpfp, asm: "UCVTFS"},     // uint64 -> float32
+		{name: "UCVTFD", argLength: 1, reg: gpfp, asm: "UCVTFD"},     // uint64 -> float64
+		{name: "FCVTZSSW", argLength: 1, reg: fpgp, asm: "FCVTZSSW"}, // float32 -> int32
+		{name: "FCVTZSDW", argLength: 1, reg: fpgp, asm: "FCVTZSDW"}, // float64 -> int32
+		{name: "FCVTZUSW", argLength: 1, reg: fpgp, asm: "FCVTZUSW"}, // float32 -> uint32
+		{name: "FCVTZUDW", argLength: 1, reg: fpgp, asm: "FCVTZUDW"}, // float64 -> uint32
+		{name: "FCVTZSS", argLength: 1, reg: fpgp, asm: "FCVTZSS"},   // float32 -> int64
+		{name: "FCVTZSD", argLength: 1, reg: fpgp, asm: "FCVTZSD"},   // float64 -> int64
+		{name: "FCVTZUS", argLength: 1, reg: fpgp, asm: "FCVTZUS"},   // float32 -> uint64
+		{name: "FCVTZUD", argLength: 1, reg: fpgp, asm: "FCVTZUD"},   // float64 -> uint64
+		{name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD"},     // float32 -> float64
+		{name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS"},     // float64 -> float32
+
+		// conditional instructions
+		{name: "CSELULT", argLength: 3, reg: gp2flags1, asm: "CSEL"},  // returns arg0 if flags indicates unsigned LT, arg1 otherwise, arg2=flags
+		{name: "CSELULT0", argLength: 2, reg: gp1flags1, asm: "CSEL"}, // returns arg0 if flags indicates unsigned LT, 0 otherwise, arg1=flags
+
+		// function calls
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff", clobberFlags: true},                                              // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R26"), 0}, clobbers: callerSave}, aux: "Int64", clobberFlags: true}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                                // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                                   // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64", clobberFlags: true},                         // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+		// pseudo-ops
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}}, // panic if arg0 is nil.  arg1=mem.
+
+		{name: "Equal", argLength: 1, reg: readflags},         // bool, true flags encode x==y false otherwise.
+		{name: "NotEqual", argLength: 1, reg: readflags},      // bool, true flags encode x!=y false otherwise.
+		{name: "LessThan", argLength: 1, reg: readflags},      // bool, true flags encode signed x<y false otherwise.
+		{name: "LessEqual", argLength: 1, reg: readflags},     // bool, true flags encode signed x<=y false otherwise.
+		{name: "GreaterThan", argLength: 1, reg: readflags},   // bool, true flags encode signed x>y false otherwise.
+		{name: "GreaterEqual", argLength: 1, reg: readflags},  // bool, true flags encode signed x>=y false otherwise.
+		{name: "LessThanU", argLength: 1, reg: readflags},     // bool, true flags encode unsigned x<y false otherwise.
+		{name: "LessEqualU", argLength: 1, reg: readflags},    // bool, true flags encode unsigned x<=y false otherwise.
+		{name: "GreaterThanU", argLength: 1, reg: readflags},  // bool, true flags encode unsigned x>y false otherwise.
+		{name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>=y false otherwise.
+
+		// duffzero
+		// arg0 = address of memory to zero
+		// arg1 = mem
+		// auxint = offset into duffzero code to start executing
+		// returns mem
+		// R16 aka arm64.REGRT1 changed as side effect
+		{
+			name:      "DUFFZERO",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs:   []regMask{gp},
+				clobbers: buildReg("R16"),
+			},
+		},
+
+		// large zeroing
+		// arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
+		// arg1 = address of the last element to zero
+		// arg2 = mem
+		// auxint = alignment
+		// returns mem
+		//	MOVD.P	ZR, 8(R16)
+		//	CMP	Rarg1, R16
+		//	BLE	-2(PC)
+		// Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled.
+		// the-end-of-the-memory - 8 is with the area to zero, ok to spill.
+		{
+			name:      "LoweredZero",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R16"), gp},
+				clobbers: buildReg("R16"),
+			},
+			clobberFlags: true,
+		},
+
+		// large move
+		// arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
+		// arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
+		// arg2 = address of the last element of src
+		// arg3 = mem
+		// auxint = alignment
+		// returns mem
+		//	MOVD.P	8(R16), Rtmp
+		//	MOVD.P	Rtmp, 8(R17)
+		//	CMP	Rarg2, R16
+		//	BLE	-3(PC)
+		// Note: the-end-of-src may be not a valid pointer. it's a problem if it is spilled.
+		// the-end-of-src - 8 is within the area to copy, ok to spill.
+		{
+			name:      "LoweredMove",
+			aux:       "Int64",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R17"), buildReg("R16"), gp},
+				clobbers: buildReg("R16 R17"),
+			},
+			clobberFlags: true,
+		},
+
+		// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+		// and sorts it to the very beginning of the block to prevent other
+		// use of R26 (arm64.REGCTXT, the closure pointer)
+		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R26")}}},
+
+		// MOVDconvert converts between pointers and integers.
+		// We have a special op for this so as to not confuse GC
+		// (particularly stack maps).  It takes a memory arg so it
+		// gets correctly ordered with respect to GC safepoints.
+		// arg0=ptr/int arg1=mem, output=int/ptr
+		{name: "MOVDconvert", argLength: 2, reg: gp11, asm: "MOVD"},
+
+		// Constant flag values. For any comparison, there are 5 possible
+		// outcomes: the three from the signed total order (<,==,>) and the
+		// three from the unsigned total order. The == cases overlap.
+		// Note: there's a sixth "unordered" outcome for floating-point
+		// comparisons, but we don't use such a beast yet.
+		// These ops are for temporary use by rewrite rules. They
+		// cannot appear in the generated assembly.
+		{name: "FlagEQ"},     // equal
+		{name: "FlagLT_ULT"}, // signed < and unsigned <
+		{name: "FlagLT_UGT"}, // signed < and unsigned >
+		{name: "FlagGT_UGT"}, // signed > and unsigned <
+		{name: "FlagGT_ULT"}, // signed > and unsigned >
+
+		// (InvertFlags (CMP a b)) == (CMP b a)
+		// InvertFlags is a pseudo-op which can't appear in assembly output.
+		{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+	}
+
+	blocks := []blockData{
+		{name: "EQ"},
+		{name: "NE"},
+		{name: "LT"},
+		{name: "LE"},
+		{name: "GT"},
+		{name: "GE"},
+		{name: "ULT"},
+		{name: "ULE"},
+		{name: "UGT"},
+		{name: "UGE"},
+	}
+
+	archs = append(archs, arch{
+		name:            "ARM64",
+		pkg:             "cmd/internal/obj/arm64",
+		genfile:         "../../arm64/ssa.go",
+		ops:             ops,
+		blocks:          blocks,
+		regnames:        regNamesARM64,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		framepointerreg: -1, // not used
+	})
+}
--- a/src/cmd/compile/internal/ssa/gen/ARMOps.go
+++ b/src/cmd/compile/internal/ssa/gen/ARMOps.go
@@ -6,32 +6,481 @@

 package main

+import "strings"
+
+// Notes:
+//  - Integer types live in the low portion of registers. Upper portions are junk.
+//  - Boolean types use the low-order byte of a register. 0=false, 1=true.
+//    Upper bytes are junk.
+//  - *const instructions may use a constant larger than the instuction can encode.
+//    In this case the assembler expands to multiple instructions and uses tmp
+//    register (R11).
+
+// Suffixes encode the bit width of various instructions.
+// W (word)      = 32 bit
+// H (half word) = 16 bit
+// HU            = 16 bit unsigned
+// B (byte)      = 8 bit
+// BU            = 8 bit unsigned
+// F (float)     = 32 bit float
+// D (double)    = 64 bit float
+
+var regNamesARM = []string{
+	"R0",
+	"R1",
+	"R2",
+	"R3",
+	"R4",
+	"R5",
+	"R6",
+	"R7",
+	"R8",
+	"R9",
+	"g",   // aka R10
+	"R11", // tmp
+	"R12",
+	"SP",  // aka R13
+	"R14", // link
+	"R15", // pc
+
+	"F0",
+	"F1",
+	"F2",
+	"F3",
+	"F4",
+	"F5",
+	"F6",
+	"F7",
+	"F8",
+	"F9",
+	"F10",
+	"F11",
+	"F12",
+	"F13",
+	"F14",
+	"F15", // tmp
+
+	// pseudo-registers
+	"SB",
+}
+
 func init() {
+	// Make map from reg names to reg integers.
+	if len(regNamesARM) > 64 {
+		panic("too many registers")
+	}
+	num := map[string]int{}
+	for i, name := range regNamesARM {
+		num[name] = i
+	}
+	buildReg := func(s string) regMask {
+		m := regMask(0)
+		for _, r := range strings.Split(s, " ") {
+			if n, ok := num[r]; ok {
+				m |= regMask(1) << uint(n)
+				continue
+			}
+			panic("register " + r + " not found")
+		}
+		return m
+	}
+
+	// Common individual register masks
 	var (
-		gp01       = regInfo{inputs: []regMask{}, outputs: []regMask{31}}
-		gp11       = regInfo{inputs: []regMask{31}, outputs: []regMask{31}}
-		gp21       = regInfo{inputs: []regMask{31, 31}, outputs: []regMask{31}}
-		gp2flags   = regInfo{inputs: []regMask{31, 31}, outputs: []regMask{32}}
-		gpload     = regInfo{inputs: []regMask{31}, outputs: []regMask{31}}
-		gpstore    = regInfo{inputs: []regMask{31, 31}, outputs: []regMask{}}
-		flagsgp    = regInfo{inputs: []regMask{32}, outputs: []regMask{31}}
-		callerSave = regMask(15)
+		gp         = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12")
+		gpg        = gp | buildReg("g")
+		gpsp       = gp | buildReg("SP")
+		gpspg      = gpg | buildReg("SP")
+		gpspsbg    = gpspg | buildReg("SB")
+		fp         = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15")
+		callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+	)
+	// Common regInfo
+	var (
+		gp01      = regInfo{inputs: nil, outputs: []regMask{gp}}
+		gp11      = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+		gp11carry = regInfo{inputs: []regMask{gpg}, outputs: []regMask{0, gp}}
+		gp11sp    = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+		gp1flags  = regInfo{inputs: []regMask{gpg}}
+		gp1flags1 = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}
+		gp21      = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+		gp21carry = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{0, gp}}
+		gp2flags  = regInfo{inputs: []regMask{gpg, gpg}}
+		gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+		gp22      = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}}
+		gp31      = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+		gp31carry = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{0, gp}}
+		gp3flags  = regInfo{inputs: []regMask{gp, gp, gp}}
+		gp3flags1 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+		gpload    = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+		gpstore   = regInfo{inputs: []regMask{gpspsbg, gpg}}
+		gp2load   = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+		gp2store  = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
+		fp01      = regInfo{inputs: nil, outputs: []regMask{fp}}
+		fp11      = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+		fp1flags  = regInfo{inputs: []regMask{fp}}
+		fpgp      = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+		gpfp      = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+		fp21      = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+		fp2flags  = regInfo{inputs: []regMask{fp, fp}}
+		fpload    = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+		fpstore   = regInfo{inputs: []regMask{gpspsbg, fp}}
+		readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
 	)
 	ops := []opData{
-		{name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true},  // arg0 + arg1
-		{name: "ADDconst", argLength: 1, reg: gp11, asm: "ADD", aux: "SymOff"}, // arg0 + auxInt + aux.(*gc.Sym)
+		// binary ops
+		{name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true},     // arg0 + arg1
+		{name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int32"},   // arg0 + auxInt
+		{name: "SUB", argLength: 2, reg: gp21, asm: "SUB"},                        // arg0 - arg1
+		{name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int32"},     // arg0 - auxInt
+		{name: "RSB", argLength: 2, reg: gp21, asm: "RSB"},                        // arg1 - arg0
+		{name: "RSBconst", argLength: 1, reg: gp11, asm: "RSB", aux: "Int32"},     // auxInt - arg0
+		{name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true},     // arg0 * arg1
+		{name: "HMUL", argLength: 2, reg: gp21, asm: "MULL", commutative: true},   // (arg0 * arg1) >> 32, signed
+		{name: "HMULU", argLength: 2, reg: gp21, asm: "MULLU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
+		{name: "DIV", argLength: 2, reg: gp21, asm: "DIV", clobberFlags: true},    // arg0 / arg1, signed, soft div clobbers flags
+		{name: "DIVU", argLength: 2, reg: gp21, asm: "DIVU", clobberFlags: true},  // arg0 / arg1, unsighed
+		{name: "MOD", argLength: 2, reg: gp21, asm: "MOD", clobberFlags: true},    // arg0 % arg1, signed
+		{name: "MODU", argLength: 2, reg: gp21, asm: "MODU", clobberFlags: true},  // arg0 % arg1, unsigned

-		{name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", rematerializeable: true}, // 32 low bits of auxint
+		{name: "ADDS", argLength: 2, reg: gp21carry, asm: "ADD", commutative: true}, // arg0 + arg1, set carry flag
+		{name: "ADDSconst", argLength: 1, reg: gp11carry, asm: "ADD", aux: "Int32"}, // arg0 + auxInt, set carry flag
+		{name: "ADC", argLength: 3, reg: gp2flags1, asm: "ADC", commutative: true},  // arg0 + arg1 + carry, arg2=flags
+		{name: "ADCconst", argLength: 2, reg: gp1flags1, asm: "ADC", aux: "Int32"},  // arg0 + auxInt + carry, arg1=flags
+		{name: "SUBS", argLength: 2, reg: gp21carry, asm: "SUB"},                    // arg0 - arg1, set carry flag
+		{name: "SUBSconst", argLength: 1, reg: gp11carry, asm: "SUB", aux: "Int32"}, // arg0 - auxInt, set carry flag
+		{name: "RSBSconst", argLength: 1, reg: gp11carry, asm: "RSB", aux: "Int32"}, // auxInt - arg0, set carry flag
+		{name: "SBC", argLength: 3, reg: gp2flags1, asm: "SBC"},                     // arg0 - arg1 - carry, arg2=flags
+		{name: "SBCconst", argLength: 2, reg: gp1flags1, asm: "SBC", aux: "Int32"},  // arg0 - auxInt - carry, arg1=flags
+		{name: "RSCconst", argLength: 2, reg: gp1flags1, asm: "RSC", aux: "Int32"},  // auxInt - arg0 - carry, arg1=flags

-		{name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+		{name: "MULLU", argLength: 2, reg: gp22, asm: "MULLU", commutative: true}, // arg0 * arg1, high 32 bits in out0, low 32 bits in out1
+		{name: "MULA", argLength: 3, reg: gp31, asm: "MULA"},                      // arg0 * arg1 + arg2

-		{name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW"},   // load from arg0 + auxInt + aux.  arg1=mem.
-		{name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW"}, // store 4 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+		{name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+		{name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"},                    // arg0 - arg1
+		{name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"},                    // arg0 - arg1
+		{name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+		{name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+		{name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"},                    // arg0 / arg1
+		{name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"},                    // arg0 / arg1

-		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff"}, // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+		{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt
+		{name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true},  // arg0 | arg1
+		{name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int32"},  // arg0 | auxInt
+		{name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1
+		{name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int32"}, // arg0 ^ auxInt
+		{name: "BIC", argLength: 2, reg: gp21, asm: "BIC"},                    // arg0 &^ arg1
+		{name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt
+
+		// unary ops
+		{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
+
+		{name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"},   // -arg0, float32
+		{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"},   // -arg0, float64
+		{name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+
+		// shifts
+		{name: "SLL", argLength: 2, reg: gp21, asm: "SLL"},                    // arg0 << arg1, shift amount is mod 256
+		{name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int32"}, // arg0 << auxInt
+		{name: "SRL", argLength: 2, reg: gp21, asm: "SRL"},                    // arg0 >> arg1, unsigned, shift amount is mod 256
+		{name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int32"}, // arg0 >> auxInt, unsigned
+		{name: "SRA", argLength: 2, reg: gp21, asm: "SRA"},                    // arg0 >> arg1, signed, shift amount is mod 256
+		{name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int32"}, // arg0 >> auxInt, signed
+		{name: "SRRconst", argLength: 1, reg: gp11, aux: "Int32"},             // arg0 right rotate by auxInt bits
+
+		{name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt
+		{name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift
+		{name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift
+		{name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt
+		{name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift
+		{name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift
+		{name: "RSBshiftLL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0
+		{name: "RSBshiftRL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift
+		{name: "RSBshiftRA", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift
+		{name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1<<auxInt)
+		{name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), unsigned shift
+		{name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), signed shift
+		{name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"},  // arg0 | arg1<<auxInt
+		{name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"},  // arg0 | arg1>>auxInt, unsigned shift
+		{name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"},  // arg0 | arg1>>auxInt, signed shift
+		{name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1<<auxInt
+		{name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, unsigned shift
+		{name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, signed shift
+		{name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1<<auxInt)
+		{name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), unsigned shift
+		{name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), signed shift
+		{name: "MVNshiftLL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0<<auxInt)
+		{name: "MVNshiftRL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), unsigned shift
+		{name: "MVNshiftRA", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), signed shift
+
+		{name: "ADCshiftLL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1<<auxInt + carry, arg2=flags
+		{name: "ADCshiftRL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, unsigned shift, arg2=flags
+		{name: "ADCshiftRA", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, signed shift, arg2=flags
+		{name: "SBCshiftLL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1<<auxInt - carry, arg2=flags
+		{name: "SBCshiftRL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, unsigned shift, arg2=flags
+		{name: "SBCshiftRA", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, signed shift, arg2=flags
+		{name: "RSCshiftLL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1<<auxInt - arg0 - carry, arg2=flags
+		{name: "RSCshiftRL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, unsigned shift, arg2=flags
+		{name: "RSCshiftRA", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, signed shift, arg2=flags
+
+		{name: "ADDSshiftLL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt, set carry flag
+		{name: "ADDSshiftRL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift, set carry flag
+		{name: "ADDSshiftRA", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift, set carry flag
+		{name: "SUBSshiftLL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt, set carry flag
+		{name: "SUBSshiftRL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift, set carry flag
+		{name: "SUBSshiftRA", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift, set carry flag
+		{name: "RSBSshiftLL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0, set carry flag
+		{name: "RSBSshiftRL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift, set carry flag
+		{name: "RSBSshiftRA", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift, set carry flag
+
+		{name: "ADDshiftLLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1<<arg2
+		{name: "ADDshiftRLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift
+		{name: "ADDshiftRAreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift
+		{name: "SUBshiftLLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1<<arg2
+		{name: "SUBshiftRLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift
+		{name: "SUBshiftRAreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift
+		{name: "RSBshiftLLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1<<arg2 - arg0
+		{name: "RSBshiftRLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift
+		{name: "RSBshiftRAreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift
+		{name: "ANDshiftLLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1<<arg2)
+		{name: "ANDshiftRLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), unsigned shift
+		{name: "ANDshiftRAreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), signed shift
+		{name: "ORshiftLLreg", argLength: 3, reg: gp31, asm: "ORR"},  // arg0 | arg1<<arg2
+		{name: "ORshiftRLreg", argLength: 3, reg: gp31, asm: "ORR"},  // arg0 | arg1>>arg2, unsigned shift
+		{name: "ORshiftRAreg", argLength: 3, reg: gp31, asm: "ORR"},  // arg0 | arg1>>arg2, signed shift
+		{name: "XORshiftLLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1<<arg2
+		{name: "XORshiftRLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, unsigned shift
+		{name: "XORshiftRAreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, signed shift
+		{name: "BICshiftLLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1<<arg2)
+		{name: "BICshiftRLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), unsigned shift
+		{name: "BICshiftRAreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), signed shift
+		{name: "MVNshiftLLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0<<arg1)
+		{name: "MVNshiftRLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), unsigned shift
+		{name: "MVNshiftRAreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), signed shift
+
+		{name: "ADCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1<<arg2 + carry, arg3=flags
+		{name: "ADCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, unsigned shift, arg3=flags
+		{name: "ADCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, signed shift, arg3=flags
+		{name: "SBCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1<<arg2 - carry, arg3=flags
+		{name: "SBCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, unsigned shift, arg3=flags
+		{name: "SBCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, signed shift, arg3=flags
+		{name: "RSCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1<<arg2 - arg0 - carry, arg3=flags
+		{name: "RSCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, unsigned shift, arg3=flags
+		{name: "RSCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, signed shift, arg3=flags
+
+		{name: "ADDSshiftLLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1<<arg2, set carry flag
+		{name: "ADDSshiftRLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift, set carry flag
+		{name: "ADDSshiftRAreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift, set carry flag
+		{name: "SUBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1<<arg2, set carry flag
+		{name: "SUBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift, set carry flag
+		{name: "SUBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift, set carry flag
+		{name: "RSBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1<<arg2 - arg0, set carry flag
+		{name: "RSBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift, set carry flag
+		{name: "RSBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift, set carry flag
+
+		// comparisons
+		{name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt
+		{name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags"},                    // arg0 compare to -arg1
+		{name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt
+		{name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0
+		{name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0
+		{name: "TEQ", argLength: 2, reg: gp2flags, asm: "TEQ", typ: "Flags", commutative: true}, // arg0 ^ arg1 compare to 0
+		{name: "TEQconst", argLength: 1, reg: gp1flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ auxInt compare to 0
+		{name: "CMPF", argLength: 2, reg: fp2flags, asm: "CMPF", typ: "Flags"},                  // arg0 compare to arg1, float32
+		{name: "CMPD", argLength: 2, reg: fp2flags, asm: "CMPD", typ: "Flags"},                  // arg0 compare to arg1, float64
+
+		{name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1<<auxInt
+		{name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift
+		{name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift
+
+		{name: "CMPshiftLLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1<<arg2
+		{name: "CMPshiftRLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, unsigned shift
+		{name: "CMPshiftRAreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, signed shift
+
+		{name: "CMPF0", argLength: 1, reg: fp1flags, asm: "CMPF", typ: "Flags"}, // arg0 compare to 0, float32
+		{name: "CMPD0", argLength: 1, reg: fp1flags, asm: "CMPD", typ: "Flags"}, // arg0 compare to 0, float64
+
+		// moves
+		{name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", typ: "UInt32", rematerializeable: true},    // 32 low bits of auxint
+		{name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+		{name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+		{name: "MOVWaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVW", rematerializeable: true}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+		{name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8"},     // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8"},  // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16"},    // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16"}, // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "UInt32"},   // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32"},  // load from arg0 + auxInt + aux.  arg1=mem.
+		{name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64"},  // load from arg0 + auxInt + aux.  arg1=mem.
+
+		{name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem"}, // store 1 byte of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem"}, // store 4 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+		{name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem"}, // store 8 bytes of arg1 to arg0 + auxInt + aux.  arg2=mem.
+
+		{name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW"},                   // load from arg0 + arg1. arg2=mem
+		{name: "MOVWloadshiftLL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32"}, // load from arg0 + arg1<<auxInt. arg2=mem
+		{name: "MOVWloadshiftRL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32"}, // load from arg0 + arg1>>auxInt, unsigned shift. arg2=mem
+		{name: "MOVWloadshiftRA", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32"}, // load from arg0 + arg1>>auxInt, signed shift. arg2=mem
+
+		{name: "MOVWstoreidx", argLength: 4, reg: gp2store, asm: "MOVW"},                   // store arg2 to arg0 + arg1. arg3=mem
+		{name: "MOVWstoreshiftLL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32"}, // store arg2 to arg0 + arg1<<auxInt. arg3=mem
+		{name: "MOVWstoreshiftRL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32"}, // store arg2 to arg0 + arg1>>auxInt, unsigned shift. arg3=mem
+		{name: "MOVWstoreshiftRA", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32"}, // store arg2 to arg0 + arg1>>auxInt, signed shift. arg3=mem
+
+		{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVBS"},  // move from arg0, sign-extended from byte
+		{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte
+		{name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVHS"},  // move from arg0, sign-extended from half
+		{name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half
+		{name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"},   // move from arg0
+
+		{name: "MOVWnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+		{name: "MOVWF", argLength: 1, reg: gpfp, asm: "MOVWF"},  // int32 -> float32
+		{name: "MOVWD", argLength: 1, reg: gpfp, asm: "MOVWD"},  // int32 -> float64
+		{name: "MOVWUF", argLength: 1, reg: gpfp, asm: "MOVWF"}, // uint32 -> float32, set U bit in the instruction
+		{name: "MOVWUD", argLength: 1, reg: gpfp, asm: "MOVWD"}, // uint32 -> float64, set U bit in the instruction
+		{name: "MOVFW", argLength: 1, reg: fpgp, asm: "MOVFW"},  // float32 -> int32
+		{name: "MOVDW", argLength: 1, reg: fpgp, asm: "MOVDW"},  // float64 -> int32
+		{name: "MOVFWU", argLength: 1, reg: fpgp, asm: "MOVFW"}, // float32 -> uint32, set U bit in the instruction
+		{name: "MOVDWU", argLength: 1, reg: fpgp, asm: "MOVDW"}, // float64 -> uint32, set U bit in the instruction
+		{name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"},  // float32 -> float64
+		{name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"},  // float64 -> float32
+
+		// conditional instructions, for lowering shifts
+		{name: "CMOVWHSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates HS, arg1=flags
+		{name: "CMOVWLSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates LS, arg1=flags
+		{name: "SRAcond", argLength: 3, reg: gp2flags1, asm: "SRA"},                                         // arg0 >> 31 if flags indicates HS, arg0 >> arg1 otherwise, signed shift, arg2=flags
+
+		// function calls
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff", clobberFlags: true},                                             // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R7"), 0}, clobbers: callerSave}, aux: "Int64", clobberFlags: true}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                               // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                                  // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64", clobberFlags: true},                        // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem

 		// pseudo-ops
-		{name: "LessThan", argLength: 1, reg: flagsgp}, // bool, 1 flags encode x<y 0 otherwise.
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}}, // panic if arg0 is nil.  arg1=mem.
+
+		{name: "Equal", argLength: 1, reg: readflags},         // bool, true flags encode x==y false otherwise.
+		{name: "NotEqual", argLength: 1, reg: readflags},      // bool, true flags encode x!=y false otherwise.
+		{name: "LessThan", argLength: 1, reg: readflags},      // bool, true flags encode signed x<y false otherwise.
+		{name: "LessEqual", argLength: 1, reg: readflags},     // bool, true flags encode signed x<=y false otherwise.
+		{name: "GreaterThan", argLength: 1, reg: readflags},   // bool, true flags encode signed x>y false otherwise.
+		{name: "GreaterEqual", argLength: 1, reg: readflags},  // bool, true flags encode signed x>=y false otherwise.
+		{name: "LessThanU", argLength: 1, reg: readflags},     // bool, true flags encode unsigned x<y false otherwise.
+		{name: "LessEqualU", argLength: 1, reg: readflags},    // bool, true flags encode unsigned x<=y false otherwise.
+		{name: "GreaterThanU", argLength: 1, reg: readflags},  // bool, true flags encode unsigned x>y false otherwise.
+		{name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>=y false otherwise.
+
+		// duffzero (must be 4-byte aligned)
+		// arg0 = address of memory to zero (in R1, changed as side effect)
+		// arg1 = value to store (always zero)
+		// arg2 = mem
+		// auxint = offset into duffzero code to start executing
+		// returns mem
+		{
+			name:      "DUFFZERO",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R1"), buildReg("R0")},
+				clobbers: buildReg("R1"),
+			},
+		},
+
+		// duffcopy (must be 4-byte aligned)
+		// arg0 = address of dst memory (in R2, changed as side effect)
+		// arg1 = address of src memory (in R1, changed as side effect)
+		// arg2 = mem
+		// auxint = offset into duffcopy code to start executing
+		// returns mem
+		{
+			name:      "DUFFCOPY",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R2"), buildReg("R1")},
+				clobbers: buildReg("R0 R1 R2"),
+			},
+		},
+
+		// large or unaligned zeroing
+		// arg0 = address of memory to zero (in R1, changed as side effect)
+		// arg1 = address of the last element to zero
+		// arg2 = value to store (always zero)
+		// arg3 = mem
+		// returns mem
+		//	MOVW.P	Rarg2, 4(R1)
+		//	CMP	R1, Rarg1
+		//	BLE	-2(PC)
+		{
+			name:      "LoweredZero",
+			aux:       "Int64",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R1"), gp, gp},
+				clobbers: buildReg("R1"),
+			},
+			clobberFlags: true,
+		},
+
+		// large or unaligned move
+		// arg0 = address of dst memory (in R2, changed as side effect)
+		// arg1 = address of src memory (in R1, changed as side effect)
+		// arg2 = address of the last element of src
+		// arg3 = mem
+		// returns mem
+		//	MOVW.P	4(R1), Rtmp
+		//	MOVW.P	Rtmp, 4(R2)
+		//	CMP	R1, Rarg2
+		//	BLE	-3(PC)
+		{
+			name:      "LoweredMove",
+			aux:       "Int64",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R2"), buildReg("R1"), gp},
+				clobbers: buildReg("R1 R2"),
+			},
+			clobberFlags: true,
+		},
+
+		// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+		// and sorts it to the very beginning of the block to prevent other
+		// use of R7 (arm.REGCTXT, the closure pointer)
+		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R7")}}},
+
+		// MOVWconvert converts between pointers and integers.
+		// We have a special op for this so as to not confuse GC
+		// (particularly stack maps).  It takes a memory arg so it
+		// gets correctly ordered with respect to GC safepoints.
+		// arg0=ptr/int arg1=mem, output=int/ptr
+		{name: "MOVWconvert", argLength: 2, reg: gp11, asm: "MOVW"},
+
+		// Constant flag values. For any comparison, there are 5 possible
+		// outcomes: the three from the signed total order (<,==,>) and the
+		// three from the unsigned total order. The == cases overlap.
+		// Note: there's a sixth "unordered" outcome for floating-point
+		// comparisons, but we don't use such a beast yet.
+		// These ops are for temporary use by rewrite rules. They
+		// cannot appear in the generated assembly.
+		{name: "FlagEQ"},     // equal
+		{name: "FlagLT_ULT"}, // signed < and unsigned <
+		{name: "FlagLT_UGT"}, // signed < and unsigned >
+		{name: "FlagGT_UGT"}, // signed > and unsigned <
+		{name: "FlagGT_ULT"}, // signed > and unsigned >
+
+		// (InvertFlags (CMP a b)) == (CMP b a)
+		// InvertFlags is a pseudo-op which can't appear in assembly output.
+		{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
 	}

 	blocks := []blockData{
@@ -47,22 +496,15 @@ func init() {
 		{name: "UGE"},
 	}

-	regNames := []string{
-		"R0",
-		"R1",
-		"R2",
-		"R3",
-		"SP",
-		"FLAGS",
-		"SB",
-	}
-
 	archs = append(archs, arch{
-		name:     "ARM",
-		pkg:      "cmd/internal/obj/arm",
-		genfile:  "../../arm/ssa.go",
-		ops:      ops,
-		blocks:   blocks,
-		regnames: regNames,
+		name:            "ARM",
+		pkg:             "cmd/internal/obj/arm",
+		genfile:         "../../arm/ssa.go",
+		ops:             ops,
+		blocks:          blocks,
+		regnames:        regNamesARM,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		framepointerreg: -1, // not used
 	})
 }
--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -0,0 +1,573 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add64  x y) -> (ADD  x y)
+(AddPtr x y) -> (ADD  x y)
+(Add32  x y) -> (ADD x y)
+(Add16  x y) -> (ADD x y)
+(Add8   x y) -> (ADD x y)
+(Add64F x y) -> (FADD x y)
+(Add32F x y) -> (FADDS x y)
+
+(Sub64  x y) -> (SUB  x y)
+(SubPtr x y) -> (SUB  x y)
+(Sub32  x y) -> (SUB x y)
+(Sub16  x y) -> (SUB x y)
+(Sub8   x y) -> (SUB x y)
+(Sub32F x y) -> (FSUBS x y)
+(Sub64F x y) -> (FSUB x y)
+
+(Mod16 x y) -> (Mod32 (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) -> (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) -> (Mod32 (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) -> (Mod32u (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Mod64 x y) -> (SUB x (MULLD y (DIVD x y)))
+(Mod64u x y) -> (SUB x (MULLD y (DIVDU x y)))
+(Mod32 x y) -> (SUB x (MULLW y (DIVW x y)))
+(Mod32u x y) -> (SUB x (MULLW y (DIVWU x y)))
+
+(Avg64u <t> x y) -> (ADD (ADD <t> (SRD <t> x (MOVDconst <t> [1])) (SRD <t> y (MOVDconst <t> [1]))) (ANDconst <t> (AND <t> x y) [1]))
+
+(Mul64  x y) -> (MULLD  x y)
+(Mul32  x y) -> (MULLW  x y)
+(Mul16  x y) -> (MULLW x y)
+(Mul8   x y) -> (MULLW x y)
+
+(Div64  x y) -> (DIVD  x y)
+(Div64u x y) -> (DIVDU x y)
+(Div32  x y) -> (DIVW  x y)
+(Div32u x y) -> (DIVWU x y)
+(Div16  x y) -> (DIVW  (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) -> (DIVWU (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8   x y) -> (DIVW  (SignExt8to32 x) (SignExt8to32 y))
+(Div8u  x y) -> (DIVWU (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(Hmul64  x y) -> (MULHD  x y)
+(Hmul64u  x y) -> (MULHDU x y)
+(Hmul32  x y) -> (MULHW  x y)
+(Hmul32u  x y) -> (MULHWU x y)
+(Hmul16 x y) -> (SRAWconst (MULLW <config.fe.TypeInt32()> (SignExt16to32 x) (SignExt16to32 y)) [16])
+(Hmul16u x y) -> (SRWconst (MULLW <config.fe.TypeUInt32()> (ZeroExt16to32 x) (ZeroExt16to32 y)) [16])
+(Hmul8 x y) -> (SRAWconst (MULLW <config.fe.TypeInt16()> (SignExt8to32 x) (SignExt8to32 y)) [8])
+(Hmul8u x y) -> (SRWconst (MULLW <config.fe.TypeUInt16()> (ZeroExt8to32 x) (ZeroExt8to32 y)) [8])
+
+(Mul32F x y) -> (FMULS x y)
+(Mul64F x y) -> (FMUL x y)
+
+(Div32F x y) -> (FDIVS x y)
+(Div64F x y) -> (FDIV x y)
+
+// Lowering float <-> int
+(Cvt32to32F x) -> (FRSP (FCFID (Xi2f64 (SignExt32to64 x))))
+(Cvt32to64F x) -> (FCFID (Xi2f64 (SignExt32to64 x)))
+(Cvt64to32F x) -> (FRSP (FCFID (Xi2f64 x)))
+(Cvt64to64F x) -> (FCFID (Xi2f64 x))
+
+(Cvt32Fto32 x) -> (Xf2i64 (FCTIWZ x))
+(Cvt32Fto64 x) -> (Xf2i64 (FCTIDZ x))
+(Cvt64Fto32 x) -> (Xf2i64 (FCTIWZ x))
+(Cvt64Fto64 x) -> (Xf2i64 (FCTIDZ x))
+
+(Cvt32Fto64F x) -> x // Note x will have the wrong type for patterns dependent on Float32/Float64
+(Cvt64Fto32F x) -> (FRSP x)
+
+(Sqrt x) -> (FSQRT x)
+
+(Rsh64x64 x y)  -> (SRAD x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] y))))
+(Rsh64Ux64 x y) -> (SRD  x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] y))))
+(Lsh64x64 x y)  -> (SLD  x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] y))))
+
+(Rsh32x64 x y)  -> (SRAW x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] y))))
+(Rsh32Ux64 x y) -> (SRW  x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] y))))
+(Lsh32x64 x y)  -> (SLW  x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] y))))
+
+(Rsh16x64 x y)  -> (SRAW (SignExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] y))))
+(Rsh16Ux64 x y) -> (SRW  (ZeroExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] y))))
+(Lsh16x64 x y)  -> (SLW  x                 (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] y))))
+
+(Rsh8x64 x y)  -> (SRAW (SignExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] y))))
+(Rsh8Ux64 x y) -> (SRW  (ZeroExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] y))))
+(Lsh8x64 x y)  -> (SLW  x                (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] y))))
+
+
+(Rsh64x32 x y)  -> (SRAD x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt32to64 y)))))
+(Rsh64Ux32 x y) -> (SRD x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt32to64 y)))))
+(Lsh64x32 x y)  -> (SLD x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt32to64 y)))))
+
+(Rsh32x32 x y)  -> (SRAW x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt32to64 y)))))
+(Rsh32Ux32 x y) -> (SRW x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt32to64 y)))))
+(Lsh32x32 x y)  -> (SLW x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt32to64 y)))))
+
+(Rsh16x32 x y)  -> (SRAW (SignExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt32to64 y)))))
+(Rsh16Ux32 x y) -> (SRW  (ZeroExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt32to64 y)))))
+(Lsh16x32 x y)  -> (SLW  x                 (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt32to64 y)))))
+
+(Rsh8x32 x y)  -> (SRAW (SignExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt32to64 y)))))
+(Rsh8Ux32 x y) -> (SRW  (ZeroExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt32to64 y)))))
+(Lsh8x32 x y)  -> (SLW  x                (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt32to64 y)))))
+
+
+(Rsh64x16 x y)  -> (SRAD x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt16to64 y)))))
+(Rsh64Ux16 x y) -> (SRD x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt16to64 y)))))
+(Lsh64x16 x y)  -> (SLD x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt16to64 y)))))
+
+(Rsh32x16 x y)  -> (SRAW x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt16to64 y)))))
+(Rsh32Ux16 x y) -> (SRW x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt16to64 y)))))
+(Lsh32x16 x y)  -> (SLW x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt16to64 y)))))
+
+(Rsh16x16 x y)  -> (SRAW (SignExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt16to64 y)))))
+(Rsh16Ux16 x y) -> (SRW  (ZeroExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt16to64 y)))))
+(Lsh16x16 x y)  -> (SLW  x                 (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt16to64 y)))))
+
+(Rsh8x16 x y)  -> (SRAW (SignExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt16to64 y)))))
+(Rsh8Ux16 x y) -> (SRW  (ZeroExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt16to64 y)))))
+(Lsh8x16 x y)  -> (SLW  x                (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt16to64 y)))))
+
+
+(Rsh64x8 x y)  -> (SRAD x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt8to64 y)))))
+(Rsh64Ux8 x y) -> (SRD x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt8to64 y)))))
+(Lsh64x8 x y)  -> (SLD x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-64] (ZeroExt8to64 y)))))
+
+(Rsh32x8 x y)  -> (SRAW x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt8to64 y)))))
+(Rsh32Ux8 x y) -> (SRW x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt8to64 y)))))
+(Lsh32x8 x y)  -> (SLW x  (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-32] (ZeroExt8to64 y)))))
+
+(Rsh16x8 x y)  -> (SRAW (SignExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt8to64 y)))))
+(Rsh16Ux8 x y) -> (SRW  (ZeroExt16to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt8to64 y)))))
+(Lsh16x8 x y)  -> (SLW  x                 (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-16] (ZeroExt8to64 y)))))
+
+(Rsh8x8 x y)  -> (SRAW (SignExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt8to64 y)))))
+(Rsh8Ux8 x y) -> (SRW  (ZeroExt8to32 x) (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt8to64 y)))))
+(Lsh8x8 x y)  -> (SLW  x                (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDconstForCarry [-8] (ZeroExt8to64 y)))))
+
+// Potentially useful optimizing rewrites.
+// (ADDconstForCarry [k] c), k < 0 && (c < 0 || k+c >= 0) -> CarrySet
+// (ADDconstForCarry [k] c), K < 0 && (c >= 0 && k+c < 0) -> CarryClear
+// (MaskIfNotCarry CarrySet) -> 0
+// (MaskIfNotCarry CarrySet) -> -1
+
+// Lowering constants
+(Const8   [val]) -> (MOVWconst [val])
+(Const16  [val]) -> (MOVWconst [val])
+(Const32  [val]) -> (MOVWconst [val])
+(Const64  [val]) -> (MOVDconst [val])
+(Const32F [val]) -> (FMOVSconst [val])
+(Const64F [val]) -> (FMOVDconst [val])
+(ConstNil) -> (MOVDconst [0])
+(ConstBool [b]) -> (MOVWconst [b])
+
+(Addr {sym} base) -> (MOVDaddr {sym} base)
+// (Addr {sym} base) -> (ADDconst {sym} base)
+(OffPtr [off] ptr) -> (ADD (MOVDconst <config.Frontend().TypeInt64()> [off]) ptr)
+
+(And64 x y) -> (AND x y)
+(And32 x y) -> (AND x y)
+(And16 x y) -> (AND x y)
+(And8  x y) -> (AND x y)
+
+(Or64 x y) -> (OR x y)
+(Or32 x y) -> (OR x y)
+(Or16 x y) -> (OR x y)
+(Or8  x y) -> (OR x y)
+
+(Xor64 x y) -> (XOR x y)
+(Xor32 x y) -> (XOR x y)
+(Xor16 x y) -> (XOR x y)
+(Xor8  x y) -> (XOR x y)
+
+(Neg64F x) -> (FNEG x)
+(Neg32F x) -> (FNEG x)
+(Neg64  x) -> (NEG x)
+(Neg32  x) -> (NEG x)
+(Neg16  x) -> (NEG x)
+(Neg8   x) -> (NEG x)
+
+(Com64 x) -> (XORconst [-1] x)
+(Com32 x) -> (XORconst [-1] x)
+(Com16 x) -> (XORconst [-1] x)
+(Com8  x) -> (XORconst [-1] x)
+
+// Lowering boolean ops
+(AndB x y) -> (AND x y)
+(OrB x y) -> (OR x y)
+(Not x) -> (XORconst [1] x)
+
+// Lowering comparisons
+(EqB x y)  -> (ANDconst [1] (EQV x y))
+(Eq8 x y)  -> (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) -> (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) -> (Equal (CMPW x y))
+(Eq64 x y) -> (Equal (CMP x y))
+(Eq32F x y) -> (Equal (FCMPU x y))
+(Eq64F x y) -> (Equal (FCMPU x y))
+(EqPtr x y) -> (Equal (CMP x y))
+
+(NeqB x y)  -> (XOR x y)
+(Neq8 x y)  -> (NotEqual (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Neq16 x y) -> (NotEqual (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Neq32 x y) -> (NotEqual (CMPW x y))
+(Neq64 x y) -> (NotEqual (CMP x y))
+(Neq32F x y) -> (NotEqual (FCMPU x y))
+(Neq64F x y) -> (NotEqual (FCMPU x y))
+(NeqPtr x y) -> (NotEqual (CMP x y))
+
+(Less8 x y)  -> (LessThan (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Less16 x y) -> (LessThan (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Less32 x y) -> (LessThan (CMPW x y))
+(Less64 x y) -> (LessThan (CMP x y))
+(Less32F x y) -> (FLessThan (FCMPU x y))
+(Less64F x y) -> (FLessThan (FCMPU x y))
+
+(Less8U x y)  -> (LessThan (CMPWU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Less16U x y) -> (LessThan (CMPWU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Less32U x y) -> (LessThan (CMPWU x y))
+(Less64U x y) -> (LessThan (CMPU x y))
+
+(Leq8 x y)  -> (LessEqual (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) -> (LessEqual (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) -> (LessEqual (CMPW x y))
+(Leq64 x y) -> (LessEqual (CMP x y))
+(Leq32F x y) -> (FLessEqual (FCMPU x y))
+(Leq64F x y) -> (FLessEqual (FCMPU x y))
+
+(Leq8U x y)  -> (LessEqual (CMPWU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) -> (LessEqual (CMPWU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) -> (LessEqual (CMPWU x y))
+(Leq64U x y) -> (LessEqual (CMPU x y))
+
+(Greater8 x y)  -> (GreaterThan (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Greater16 x y) -> (GreaterThan (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Greater32 x y) -> (GreaterThan (CMPW x y))
+(Greater64 x y) -> (GreaterThan (CMP x y))
+(Greater32F x y) -> (FGreaterThan (FCMPU x y))
+(Greater64F x y) -> (FGreaterThan (FCMPU x y))
+
+(Greater8U x y)  -> (GreaterThan (CMPWU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Greater16U x y) -> (GreaterThan (CMPWU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Greater32U x y) -> (GreaterThan (CMPWU x y))
+(Greater64U x y) -> (GreaterThan (CMPU x y))
+
+(Geq8 x y)  -> (GreaterEqual (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Geq16 x y) -> (GreaterEqual (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Geq32 x y) -> (GreaterEqual (CMPW x y))
+(Geq64 x y) -> (GreaterEqual (CMP x y))
+(Geq32F x y) -> (FGreaterEqual (FCMPU x y))
+(Geq64F x y) -> (FGreaterEqual (FCMPU x y))
+
+(Geq8U x y)  -> (GreaterEqual (CMPU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Geq16U x y) -> (GreaterEqual (CMPU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Geq32U x y) -> (GreaterEqual (CMPU x y))
+(Geq64U x y) -> (GreaterEqual (CMPU x y))
+
+// Absorb pseudo-ops into blocks.
+(If (Equal cc) yes no) -> (EQ cc yes no)
+(If (NotEqual cc) yes no) -> (NE cc yes no)
+(If (LessThan cc) yes no) -> (LT cc yes no)
+(If (LessEqual cc) yes no) -> (LE cc yes no)
+(If (GreaterThan cc) yes no) -> (GT cc yes no)
+(If (GreaterEqual cc) yes no) -> (GE cc yes no)
+(If (FLessThan cc) yes no) -> (FLT cc yes no)
+(If (FLessEqual cc) yes no) -> (FLE cc yes no)
+(If (FGreaterThan cc) yes no) -> (FGT cc yes no)
+(If (FGreaterEqual cc) yes no) -> (FGE cc yes no)
+
+(If cond yes no) -> (NE (CMPWconst [0] cond) yes no)
+
+// Absorb boolean tests into block
+(NE (CMPWconst [0] (Equal cc)) yes no) -> (EQ cc yes no)
+(NE (CMPWconst [0] (NotEqual cc)) yes no) -> (NE cc yes no)
+(NE (CMPWconst [0] (LessThan cc)) yes no) -> (LT cc yes no)
+(NE (CMPWconst [0] (LessEqual cc)) yes no) -> (LE cc yes no)
+(NE (CMPWconst [0] (GreaterThan cc)) yes no) -> (GT cc yes no)
+(NE (CMPWconst [0] (GreaterEqual cc)) yes no) -> (GE cc yes no)
+// (NE (CMPWconst [0] (FLessThan cc)) yes no) -> (FLT cc yes no)
+// (NE (CMPWconst [0] (FLessEqual cc)) yes no) -> (FLE cc yes no)
+// (NE (CMPWconst [0] (FGreaterThan cc)) yes no) -> (FGT cc yes no)
+// (NE (CMPWconst [0] (FGreaterEqual cc)) yes no) -> (FGE cc yes no)
+
+// absorb flag constants into branches
+(EQ (FlagEQ) yes no) -> (First nil yes no)
+(EQ (FlagLT) yes no) -> (First nil no yes)
+(EQ (FlagGT) yes no) -> (First nil no yes)
+
+(NE (FlagEQ) yes no) -> (First nil no yes)
+(NE (FlagLT) yes no) -> (First nil yes no)
+(NE (FlagGT) yes no) -> (First nil yes no)
+
+(LT (FlagEQ) yes no) -> (First nil no yes)
+(LT (FlagLT) yes no) -> (First nil yes no)
+(LT (FlagGT) yes no) -> (First nil no yes)
+
+(LE (FlagEQ) yes no) -> (First nil yes no)
+(LE (FlagLT) yes no) -> (First nil yes no)
+(LE (FlagGT) yes no) -> (First nil no yes)
+
+(GT (FlagEQ) yes no) -> (First nil no yes)
+(GT (FlagLT) yes no) -> (First nil no yes)
+(GT (FlagGT) yes no) -> (First nil yes no)
+
+(GE (FlagEQ) yes no) -> (First nil yes no)
+(GE (FlagLT) yes no) -> (First nil no yes)
+(GE (FlagGT) yes no) -> (First nil yes no)
+
+// absorb InvertFlags into branches
+(LT (InvertFlags cmp) yes no) -> (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) -> (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) -> (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) -> (LE cmp yes no)
+(EQ (InvertFlags cmp) yes no) -> (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) -> (NE cmp yes no)
+
+// (FLT (InvertFlags cmp) yes no) -> (FGT cmp yes no)
+// (FGT (InvertFlags cmp) yes no) -> (FLT cmp yes no)
+// (FLE (InvertFlags cmp) yes no) -> (FGE cmp yes no)
+// (FGE (InvertFlags cmp) yes no) -> (FLE cmp yes no)
+
+// constant comparisons
+(CMPWconst (MOVWconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
+(CMPWconst (MOVWconst [x]) [y]) && int32(x)<int32(y)  -> (FlagLT)
+(CMPWconst (MOVWconst [x]) [y]) && int32(x)>int32(y)  -> (FlagGT)
+
+(CMPconst (MOVDconst [x]) [y]) && int64(x)==int64(y) -> (FlagEQ)
+(CMPconst (MOVDconst [x]) [y]) && int64(x)<int64(y)  -> (FlagLT)
+(CMPconst (MOVDconst [x]) [y]) && int64(x)>int64(y)  -> (FlagGT)
+
+(CMPWUconst (MOVWconst [x]) [y]) && int32(x)==int32(y)  -> (FlagEQ)
+(CMPWUconst (MOVWconst [x]) [y]) && uint32(x)<uint32(y) -> (FlagLT)
+(CMPWUconst (MOVWconst [x]) [y]) && uint32(x)>uint32(y) -> (FlagGT)
+
+(CMPUconst (MOVDconst [x]) [y]) && int64(x)==int64(y)  -> (FlagEQ)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) -> (FlagLT)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) -> (FlagGT)
+
+// other known comparisons
+//(CMPconst (MOVBUreg _) [c]) && 0xff < c -> (FlagLT)
+//(CMPconst (MOVHUreg _) [c]) && 0xffff < c -> (FlagLT)
+//(CMPconst (ANDconst _ [m]) [n]) && 0 <= int32(m) && int32(m) < int32(n) -> (FlagLT)
+//(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint32(32-c)) <= uint32(n) -> (FlagLT)
+
+// absorb flag constants into boolean values
+(Equal (FlagEQ)) -> (MOVWconst [1])
+(Equal (FlagLT)) -> (MOVWconst [0])
+(Equal (FlagGT)) -> (MOVWconst [0])
+
+(NotEqual (FlagEQ)) -> (MOVWconst [0])
+(NotEqual (FlagLT)) -> (MOVWconst [1])
+(NotEqual (FlagGT)) -> (MOVWconst [1])
+
+(LessThan (FlagEQ)) -> (MOVWconst [0])
+(LessThan (FlagLT)) -> (MOVWconst [1])
+(LessThan (FlagGT)) -> (MOVWconst [0])
+
+(LessEqual (FlagEQ)) -> (MOVWconst [1])
+(LessEqual (FlagLT)) -> (MOVWconst [1])
+(LessEqual (FlagGT)) -> (MOVWconst [0])
+
+(GreaterThan (FlagEQ)) -> (MOVWconst [0])
+(GreaterThan (FlagLT)) -> (MOVWconst [0])
+(GreaterThan (FlagGT)) -> (MOVWconst [1])
+
+(GreaterEqual (FlagEQ)) -> (MOVWconst [1])
+(GreaterEqual (FlagLT)) -> (MOVWconst [0])
+(GreaterEqual (FlagGT)) -> (MOVWconst [1])
+
+// absorb InvertFlags into boolean values
+(Equal (InvertFlags x)) -> (Equal x)
+(NotEqual (InvertFlags x)) -> (NotEqual x)
+(LessThan (InvertFlags x)) -> (GreaterThan x)
+(GreaterThan (InvertFlags x)) -> (LessThan x)
+(LessEqual (InvertFlags x)) -> (GreaterEqual x)
+(GreaterEqual (InvertFlags x)) -> (LessEqual x)
+(FLessThan (InvertFlags x)) -> (FGreaterThan x)
+(FGreaterThan (InvertFlags x)) -> (FLessThan x)
+(FLessEqual (InvertFlags x)) -> (FGreaterEqual x)
+(FGreaterEqual (InvertFlags x)) -> (FLessEqual x)
+
+
+// Lowering loads
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) -> (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) -> (MOVWZload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) -> (MOVHload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) -> (MOVHZload ptr mem)
+(Load <t> ptr mem) && (t.IsBoolean() || (is8BitInt(t) && isSigned(t))) -> (MOVBload ptr mem)
+(Load <t> ptr mem) && is8BitInt(t) && !isSigned(t) -> (MOVBZload ptr mem)
+
+(Load <t> ptr mem) && is32BitFloat(t) -> (FMOVSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) -> (FMOVDload ptr mem)
+
+(Store [8] ptr val mem) && is64BitFloat(val.Type) -> (FMOVDstore ptr val mem)
+(Store [8] ptr val mem) && is32BitFloat(val.Type) -> (FMOVDstore ptr val mem) // glitch from (Cvt32Fto64F x) -> x -- type is wrong
+(Store [4] ptr val mem) && is32BitFloat(val.Type) -> (FMOVSstore ptr val mem)
+(Store [8] ptr val mem) && (is64BitInt(val.Type) || isPtr(val.Type)) -> (MOVDstore ptr val mem)
+(Store [4] ptr val mem) && is32BitInt(val.Type) -> (MOVWstore ptr val mem)
+(Store [2] ptr val mem) -> (MOVHstore ptr val mem)
+(Store [1] ptr val mem) -> (MOVBstore ptr val mem)
+
+(Zero [s] _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstorezero destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 2 && SizeAndAlign(s).Align()%2 == 0 ->
+	(MOVHstorezero destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 2 ->
+	(MOVBstorezero [1] destptr
+		(MOVBstorezero [0] destptr mem))
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%4 == 0 ->
+	(MOVWstorezero destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%2 == 0 ->
+	(MOVHstorezero [2] destptr
+		(MOVHstorezero [0] destptr mem))
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 4 ->
+	(MOVBstorezero [3] destptr
+		(MOVBstorezero [2] destptr
+			(MOVBstorezero [1] destptr
+				(MOVBstorezero [0] destptr mem))))
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%8 == 0 ->
+	(MOVDstorezero [0] destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%4 == 0 ->
+	(MOVWstorezero [4] destptr
+		(MOVWstorezero [0] destptr mem))
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%2 == 0 ->
+	(MOVHstorezero [6] destptr
+		(MOVHstorezero [4] destptr
+			(MOVHstorezero [2] destptr
+				(MOVHstorezero [0] destptr mem))))
+
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 3 ->
+	(MOVBstorezero [2] destptr
+		(MOVBstorezero [1] destptr
+			(MOVBstorezero [0] destptr mem)))
+
+// Zero small numbers of words directly.
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 16 && SizeAndAlign(s).Align()%8 == 0 ->
+	(MOVDstorezero [8] destptr
+                (MOVDstorezero [0] destptr mem))
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 24 && SizeAndAlign(s).Align()%8 == 0 ->
+	(MOVDstorezero [16] destptr
+		(MOVDstorezero [8] destptr
+			(MOVDstorezero [0] destptr mem)))
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 32 && SizeAndAlign(s).Align()%8 == 0 ->
+	(MOVDstorezero [24] destptr
+		(MOVDstorezero [16] destptr
+			(MOVDstorezero [8] destptr
+				(MOVDstorezero [0] destptr mem))))
+
+// Large zeroing uses a loop
+(Zero [s] ptr mem)
+	&& (SizeAndAlign(s).Size() > 512 || config.noDuffDevice) || SizeAndAlign(s).Align()%8 != 0 ->
+	(LoweredZero [SizeAndAlign(s).Align()]
+		ptr
+		(ADDconst <ptr.Type> ptr [SizeAndAlign(s).Size()-moveSize(SizeAndAlign(s).Align(), config)])
+		mem)
+
+// moves
+(Move [s] _ _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBZload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 && SizeAndAlign(s).Align()%2 == 0 ->
+	(MOVHstore dst (MOVHZload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 ->
+	(MOVBstore [1] dst (MOVBZload [1] src mem)
+		(MOVBstore dst (MOVBZload src mem) mem))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%4 == 0 ->
+	(MOVWstore dst (MOVWload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 && SizeAndAlign(s).Align()%2 == 0 ->
+	(MOVHstore [2] dst (MOVHZload [2] src mem)
+		(MOVHstore dst (MOVHZload src mem) mem))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 ->
+	(MOVBstore [3] dst (MOVBZload [3] src mem)
+		(MOVBstore [2] dst (MOVBZload [2] src mem)
+			(MOVBstore [1] dst (MOVBZload [1] src mem)
+				(MOVBstore dst (MOVBZload src mem) mem))))
+
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%8 == 0 ->
+	(MOVDstore dst (MOVDload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%4 == 0 ->
+	(MOVWstore [4] dst (MOVWZload [4] src mem)
+		(MOVWstore dst (MOVWZload src mem) mem))
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 && SizeAndAlign(s).Align()%2 == 0->
+	(MOVHstore [6] dst (MOVHZload [6] src mem)
+		(MOVHstore [4] dst (MOVHZload [4] src mem)
+			(MOVHstore [2] dst (MOVHZload [2] src mem)
+				(MOVHstore dst (MOVHZload src mem) mem))))
+
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 ->
+	(MOVBstore [2] dst (MOVBZload [2] src mem)
+		(MOVBstore [1] dst (MOVBZload [1] src mem)
+			(MOVBstore dst (MOVBZload src mem) mem)))
+
+// Large move uses a loop
+(Move [s] dst src mem)
+	&& (SizeAndAlign(s).Size() > 512 || config.noDuffDevice) || SizeAndAlign(s).Align()%8 != 0 ->
+	(LoweredMove [SizeAndAlign(s).Align()]
+		dst
+		src
+		(ADDconst <src.Type> src [SizeAndAlign(s).Size()-moveSize(SizeAndAlign(s).Align(), config)])
+		mem)
+
+// Calls
+// Lowering calls
+(StaticCall [argwid] {target} mem) -> (CALLstatic [argwid] {target} mem)
+(ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
+(DeferCall [argwid] mem) -> (CALLdefer [argwid] mem)
+(GoCall [argwid] mem) -> (CALLgo [argwid] mem)
+(InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)
+
+// Miscellaneous
+(Convert <t> x mem) -> (MOVDconvert <t> x mem)
+(GetClosurePtr) -> (LoweredGetClosurePtr)
+(IsNonNil ptr) -> (NotEqual (CMPconst [0] ptr))
+(IsInBounds idx len) -> (LessThan (CMPU idx len))
+(IsSliceInBounds idx len) -> (LessEqual (CMPU idx len))
+(NilCheck ptr mem) -> (LoweredNilCheck ptr mem)
+
+// Optimizations
+
+(ADD (MOVDconst [c]) x) && int64(int32(c)) == c -> (ADDconst [c] x)
+(ADD x (MOVDconst [c])) && int64(int32(c)) == c -> (ADDconst [c] x)
+
+// Fold offsets for stores.
+(MOVDstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(off1+off2) -> (MOVDstore [off1+off2] {sym} x val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(off1+off2) -> (MOVWstore [off1+off2] {sym} x val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(off1+off2) -> (MOVHstore [off1+off2] {sym} x val mem)
+(MOVBstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(off1+off2) -> (MOVBstore [off1+off2] {sym} x val mem)
+
+// Store of zero -> storezero
+(MOVDstore [off] {sym} ptr (MOVDconst [c]) mem) && c == 0 -> (MOVDstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVDconst [c]) mem) && c == 0 -> (MOVWstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVDconst [c]) mem) && c == 0 -> (MOVHstorezero [off] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOVDconst [c]) mem) && c == 0 -> (MOVBstorezero [off] {sym} ptr mem)
+
+// Fold offsets for storezero
+(MOVDstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) ->
+    (MOVDstorezero [off1+off2] {sym} x mem)
+(MOVWstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) ->
+    (MOVWstorezero [off1+off2] {sym} x mem)
+(MOVHstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) ->
+    (MOVHstorezero [off1+off2] {sym} x mem)
+(MOVBstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) ->
+    (MOVBstorezero [off1+off2] {sym} x mem)
+
+// Lowering extension
+// Note: we always extend to 64 bits even though some ops don't need that many result bits.
+(SignExt8to16  x) -> (MOVBreg x)
+(SignExt8to32  x) -> (MOVBreg x)
+(SignExt8to64  x) -> (MOVBreg x)
+(SignExt16to32 x) -> (MOVHreg x)
+(SignExt16to64 x) -> (MOVHreg x)
+(SignExt32to64 x) -> (MOVWreg x)
+
+(ZeroExt8to16  x) -> (MOVBZreg x)
+(ZeroExt8to32  x) -> (MOVBZreg x)
+(ZeroExt8to64  x) -> (MOVBZreg x)
+(ZeroExt16to32 x) -> (MOVHZreg x)
+(ZeroExt16to64 x) -> (MOVHZreg x)
+(ZeroExt32to64 x) -> (MOVWZreg x)
+
+(Trunc16to8  x) -> (MOVBreg x)
+(Trunc32to8  x) -> (MOVBreg x)
+(Trunc32to16 x) -> (MOVHreg x)
+(Trunc64to8  x) -> (MOVBreg x)
+(Trunc64to16 x) -> (MOVHreg x)
+(Trunc64to32 x) -> (MOVWreg x)
+
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -0,0 +1,395 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+//  - Less-than-64-bit integer types live in the low portion of registers.
+//    For now, the upper portion is junk; sign/zero-extension might be optimized in the future, but not yet.
+//  - Boolean types are zero or 1; stored in a byte, but loaded with AMOVBZ so the upper bytes of a register are zero.
+//  - *const instructions may use a constant larger than the instuction can encode.
+//    In this case the assembler expands to multiple instructions and uses tmp
+//    register (R31).
+
+var regNamesPPC64 = []string{
+	// "R0", // REGZERO
+	"SP", // REGSP
+	"SB", // REGSB
+	"R3",
+	"R4",
+	"R5",
+	"R6",
+	"R7",
+	"R8",
+	"R9",
+	"R10",
+	"R11", // REGCTXT for closures
+	"R12",
+	"R13", // REGTLS
+	"R14",
+	"R15",
+	"R16",
+	"R17",
+	"R18",
+	"R19",
+	"R20",
+	"R21",
+	"R22",
+	"R23",
+	"R24",
+	"R25",
+	"R26",
+	"R27",
+	"R28",
+	"R29",
+	"g",   // REGG.  Using name "g" and setting Config.hasGReg makes it "just happen".
+	"R31", // REGTMP
+
+	"F0",
+	"F1",
+	"F2",
+	"F3",
+	"F4",
+	"F5",
+	"F6",
+	"F7",
+	"F8",
+	"F9",
+	"F10",
+	"F11",
+	"F12",
+	"F13",
+	"F14",
+	"F15",
+	"F16",
+	"F17",
+	"F18",
+	"F19",
+	"F20",
+	"F21",
+	"F22",
+	"F23",
+	"F24",
+	"F25",
+	"F26",
+	// "F27", // reserved for "floating conversion constant"
+	// "F28", // 0.0
+	// "F29", // 0.5
+	// "F30", // 1.0
+	// "F31", // 2.0
+
+	// "CR0",
+	// "CR1",
+	// "CR2",
+	// "CR3",
+	// "CR4",
+	// "CR5",
+	// "CR6",
+	// "CR7",
+
+	// "CR",
+	// "XER",
+	// "LR",
+	// "CTR",
+}
+
+func init() {
+	// Make map from reg names to reg integers.
+	if len(regNamesPPC64) > 64 {
+		panic("too many registers")
+	}
+	num := map[string]int{}
+	for i, name := range regNamesPPC64 {
+		num[name] = i
+	}
+	buildReg := func(s string) regMask {
+		m := regMask(0)
+		for _, r := range strings.Split(s, " ") {
+			if n, ok := num[r]; ok {
+				m |= regMask(1) << uint(n)
+				continue
+			}
+			panic("register " + r + " not found")
+		}
+		return m
+	}
+
+	var (
+		gp = buildReg("R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29")
+		fp = buildReg("F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26")
+		sp = buildReg("SP")
+		sb = buildReg("SB")
+		// gr  = buildReg("g")
+		// cr  = buildReg("CR")
+		// ctr = buildReg("CTR")
+		// lr  = buildReg("LR")
+		tmp  = buildReg("R31")
+		ctxt = buildReg("R11")
+		// tls = buildReg("R13")
+		gp01        = regInfo{inputs: nil, outputs: []regMask{gp}}
+		gp11        = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
+		gp21        = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
+		gp1cr       = regInfo{inputs: []regMask{gp | sp | sb}}
+		gp2cr       = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
+		crgp        = regInfo{inputs: nil, outputs: []regMask{gp}}
+		gpload      = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
+		gpstore     = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
+		gpstorezero = regInfo{inputs: []regMask{gp | sp | sb}} // ppc64.REGZERO is reserved zero value
+		fp01        = regInfo{inputs: nil, outputs: []regMask{fp}}
+		fp11        = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+		fpgp        = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+		gpfp        = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+		fp21        = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+		fp2cr       = regInfo{inputs: []regMask{fp, fp}}
+		fpload      = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{fp}}
+		fpstore     = regInfo{inputs: []regMask{gp | sp | sb, fp}}
+		callerSave  = regMask(gp | fp)
+	)
+	ops := []opData{
+		{name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true},     // arg0 + arg1
+		{name: "ADDconst", argLength: 1, reg: gp11, asm: "ADD", aux: "SymOff"},    // arg0 + auxInt + aux.(*gc.Sym)
+		{name: "FADD", argLength: 2, reg: fp21, asm: "FADD", commutative: true},   // arg0+arg1
+		{name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0+arg1
+		{name: "SUB", argLength: 2, reg: gp21, asm: "SUB"},                        // arg0-arg1
+		{name: "FSUB", argLength: 2, reg: fp21, asm: "FSUB"},                      // arg0-arg1
+		{name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"},                    // arg0-arg1
+
+		{name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit)
+		{name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit)
+
+		{name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true},   // (arg0 * arg1) >> 64, signed
+		{name: "MULHW", argLength: 2, reg: gp21, asm: "MULHW", commutative: true},   // (arg0 * arg1) >> 32, signed
+		{name: "MULHDU", argLength: 2, reg: gp21, asm: "MULHDU", commutative: true}, // (arg0 * arg1) >> 64, unsigned
+		{name: "MULHWU", argLength: 2, reg: gp21, asm: "MULHWU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
+
+		{name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true},   // arg0*arg1
+		{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0*arg1
+
+		{name: "SRAD", argLength: 2, reg: gp21, asm: "SRAD"}, // arg0 >>a arg1, 64 bits (all sign if arg1 & 64 != 0)
+		{name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >>a arg1, 32 bits (all sign if arg1 & 32 != 0)
+		{name: "SRD", argLength: 2, reg: gp21, asm: "SRD"},   // arg0 >> arg1, 64 bits  (0 if arg1 & 64 != 0)
+		{name: "SRW", argLength: 2, reg: gp21, asm: "SRW"},   // arg0 >> arg1, 32 bits  (0 if arg1 & 32 != 0)
+		{name: "SLD", argLength: 2, reg: gp21, asm: "SLD"},   // arg0 << arg1, 64 bits  (0 if arg1 & 64 != 0)
+		{name: "SLW", argLength: 2, reg: gp21, asm: "SLW"},   // arg0 << arg1, 32 bits  (0 if arg1 & 32 != 0)
+
+		{name: "ADDconstForCarry", argLength: 1, reg: regInfo{inputs: []regMask{gp | sp | sb}, clobbers: tmp}, aux: "Int16", asm: "ADDC", typ: "Flags"}, // _, carry := arg0 + aux
+		{name: "MaskIfNotCarry", argLength: 1, reg: crgp, asm: "ADDME", typ: "Int64"},                                                                   // carry - 1 (if carry then 0 else -1)
+
+		{name: "SRADconst", argLength: 1, reg: gp11, asm: "SRAD", aux: "Int64"}, // arg0 >>a aux, 64 bits
+		{name: "SRAWconst", argLength: 1, reg: gp11, asm: "SRAW", aux: "Int64"}, // arg0 >>a aux, 32 bits
+		{name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "Int64"},   // arg0 >> aux, 64 bits
+		{name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "Int64"},   // arg0 >> aux, 32 bits
+		{name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "Int64"},   // arg0 << aux, 64 bits
+		{name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "Int64"},   // arg0 << aux, 32 bits
+
+		{name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"},   // arg0/arg1
+		{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1
+
+		{name: "DIVD", argLength: 2, reg: gp21, asm: "DIVD", typ: "Int64"},   // arg0/arg1 (signed 64-bit)
+		{name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"},   // arg0/arg1 (signed 32-bit)
+		{name: "DIVDU", argLength: 2, reg: gp21, asm: "DIVDU", typ: "Int64"}, // arg0/arg1 (unsigned 64-bit)
+		{name: "DIVWU", argLength: 2, reg: gp21, asm: "DIVWU", typ: "Int32"}, // arg0/arg1 (unsigned 32-bit)
+
+		// MOD is implemented as rem := arg0 - (arg0/arg1) * arg1
+
+		// Conversions are all float-to-float register operations.  "Integer" refers to encoding in the FP register.
+		{name: "FCTIDZ", argLength: 1, reg: fp11, asm: "FCTIDZ", typ: "Float64"}, // convert float to 64-bit int round towards zero
+		{name: "FCTIWZ", argLength: 1, reg: fp11, asm: "FCTIWZ", typ: "Float64"}, // convert float to 32-bit int round towards zero
+		{name: "FCFID", argLength: 1, reg: fp11, asm: "FCFID", typ: "Float64"},   // convert 64-bit integer to float
+		{name: "FRSP", argLength: 1, reg: fp11, asm: "FRSP", typ: "Float64"},     // round float to 32-bit value
+
+		// Movement between float and integer registers with no change in bits; accomplished with stores+loads on PPC.
+		// Because the 32-bit load-literal-bits instructions have impoverished addressability, always widen the
+		// data instead and use FMOVDload and FMOVDstore instead (this will also dodge endianess issues).
+		// There are optimizations that should apply -- (Xi2f64 (MOVWload (not-ADD-ptr+offset) ) ) could use
+		// the word-load instructions.  (Xi2f64 (MOVDload ptr )) can be (FMOVDload ptr)
+
+		{name: "Xf2i64", argLength: 1, reg: fpgp, typ: "Int64"},   // move 64 bits of F register into G register
+		{name: "Xi2f64", argLength: 1, reg: gpfp, typ: "Float64"}, // move 64 bits of G register into F register
+
+		{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true},               // arg0&arg1
+		{name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"},                                // arg0&^arg1
+		{name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true},                 // arg0|arg1
+		{name: "ORN", argLength: 2, reg: gp21, asm: "ORN"},                                  // arg0|^arg1
+		{name: "XOR", argLength: 2, reg: gp21, asm: "XOR", typ: "Int64", commutative: true}, // arg0^arg1
+		{name: "EQV", argLength: 2, reg: gp21, asm: "EQV", typ: "Int64", commutative: true}, // arg0^^arg1
+		{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"},                                  // -arg0 (integer)
+		{name: "FNEG", argLength: 1, reg: fp11, asm: "FNEG"},                                // -arg0 (floating point)
+		{name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"},                              // sqrt(arg0) (floating point)
+		{name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"},                            // sqrt(arg0) (floating point, single precision)
+
+		{name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"},                                                                                     // arg0|aux
+		{name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64"},                                                                                   // arg0^aux
+		{name: "ANDconst", argLength: 1, reg: regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}, asm: "ANDCC", aux: "Int64", clobberFlags: true}, // arg0&aux // and-immediate sets CC on PPC, always.
+
+		{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB", typ: "Int64"},                      // sign extend int8 to int64
+		{name: "MOVBZreg", argLength: 1, reg: gp11, asm: "MOVBZ", typ: "Int64"},                    // zero extend uint8 to uint64
+		{name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH", typ: "Int64"},                      // sign extend int16 to int64
+		{name: "MOVHZreg", argLength: 1, reg: gp11, asm: "MOVHZ", typ: "Int64"},                    // zero extend uint16 to uint64
+		{name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW", typ: "Int64"},                      // sign extend int32 to int64
+		{name: "MOVWZreg", argLength: 1, reg: gp11, asm: "MOVWZ", typ: "Int64"},                    // zero extend uint32 to uint64
+		{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", typ: "Int8"},     // sign extend int8 to int64
+		{name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8"},  // zero extend uint8 to uint64
+		{name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16"},    // sign extend int16 to int64
+		{name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16"}, // zero extend uint16 to uint64
+		{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32"},    // sign extend int32 to int64
+		{name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32"}, // zero extend uint32 to uint64
+		{name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "Int64"},
+
+		{name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", typ: "Float64"},
+		{name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", typ: "Float32"},
+		{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"},
+		{name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem"},
+		{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"},
+		{name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem"},
+		{name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", typ: "Mem"},
+		{name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", typ: "Mem"},
+
+		{name: "MOVBstorezero", argLength: 2, reg: gpstorezero, asm: "MOVB", aux: "SymOff", typ: "Mem"}, // store zero byte to arg0+aux.  arg1=mem
+		{name: "MOVHstorezero", argLength: 2, reg: gpstorezero, asm: "MOVH", aux: "SymOff", typ: "Mem"}, // store zero 2 bytes to ...
+		{name: "MOVWstorezero", argLength: 2, reg: gpstorezero, asm: "MOVW", aux: "SymOff", typ: "Mem"}, // store zero 4 bytes to ...
+		{name: "MOVDstorezero", argLength: 2, reg: gpstorezero, asm: "MOVD", aux: "SymOff", typ: "Mem"}, // store zero 8 bytes to ...
+
+		{name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{sp | sb}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+		{name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", rematerializeable: true},     //
+		{name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", rematerializeable: true},     // 32 low bits of auxint
+		{name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", rematerializeable: true}, //
+		{name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float32", asm: "FMOVS", rematerializeable: true}, //
+		{name: "FCMPU", argLength: 2, reg: fp2cr, asm: "FCMPU", typ: "Flags"},
+
+		{name: "CMP", argLength: 2, reg: gp2cr, asm: "CMP", typ: "Flags"},     // arg0 compare to arg1
+		{name: "CMPU", argLength: 2, reg: gp2cr, asm: "CMPU", typ: "Flags"},   // arg0 compare to arg1
+		{name: "CMPW", argLength: 2, reg: gp2cr, asm: "CMPW", typ: "Flags"},   // arg0 compare to arg1
+		{name: "CMPWU", argLength: 2, reg: gp2cr, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1
+		{name: "CMPconst", argLength: 1, reg: gp1cr, asm: "CMP", aux: "Int64", typ: "Flags"},
+		{name: "CMPUconst", argLength: 1, reg: gp1cr, asm: "CMPU", aux: "Int64", typ: "Flags"},
+		{name: "CMPWconst", argLength: 1, reg: gp1cr, asm: "CMPW", aux: "Int32", typ: "Flags"},
+		{name: "CMPWUconst", argLength: 1, reg: gp1cr, asm: "CMPWU", aux: "Int32", typ: "Flags"},
+
+		// pseudo-ops
+		{name: "Equal", argLength: 1, reg: crgp},         // bool, true flags encode x==y false otherwise.
+		{name: "NotEqual", argLength: 1, reg: crgp},      // bool, true flags encode x!=y false otherwise.
+		{name: "LessThan", argLength: 1, reg: crgp},      // bool, true flags encode  x<y false otherwise.
+		{name: "FLessThan", argLength: 1, reg: crgp},     // bool, true flags encode  x<y false otherwise.
+		{name: "LessEqual", argLength: 1, reg: crgp},     // bool, true flags encode  x<=y false otherwise.
+		{name: "FLessEqual", argLength: 1, reg: crgp},    // bool, true flags encode  x<=y false otherwise; PPC <= === !> which is wrong for NaN
+		{name: "GreaterThan", argLength: 1, reg: crgp},   // bool, true flags encode  x>y false otherwise.
+		{name: "FGreaterThan", argLength: 1, reg: crgp},  // bool, true flags encode  x>y false otherwise.
+		{name: "GreaterEqual", argLength: 1, reg: crgp},  // bool, true flags encode  x>=y false otherwise.
+		{name: "FGreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode  x>=y false otherwise.; PPC >= === !< which is wrong for NaN
+
+		// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+		// and sorts it to the very beginning of the block to prevent other
+		// use of the closure pointer.
+		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{ctxt}}},
+
+		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp | sp | sb}, clobbers: tmp}, clobberFlags: true},
+
+		// Convert pointer to integer, takes a memory operand for ordering.
+		{name: "MOVDconvert", argLength: 2, reg: gp11, asm: "MOVD"},
+
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff", clobberFlags: true},                                      // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gp | sp, ctxt, 0}, clobbers: callerSave}, aux: "Int64", clobberFlags: true}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                        // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                           // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64", clobberFlags: true},                 // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+		// large or unaligned zeroing
+		// arg0 = address of memory to zero (in R3, changed as side effect)
+		// arg1 = address of the last element to zero
+		// arg2 = mem
+		// returns mem
+		//  ADD -8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
+		//	MOVDU	R0, 8(R3)
+		//	CMP	R3, Rarg1
+		//	BLE	-2(PC)
+		{
+			name:      "LoweredZero",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R3"), gp},
+				clobbers: buildReg("R3"),
+			},
+			clobberFlags: true,
+			typ:          "Mem",
+		},
+
+		// large or unaligned move
+		// arg0 = address of dst memory (in R3, changed as side effect)
+		// arg1 = address of src memory (in R4, changed as side effect)
+		// arg2 = address of the last element of src
+		// arg3 = mem
+		// returns mem
+		//  ADD -8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
+		//  ADD -8,R4,R4 // intermediate value not valid GC ptr, cannot expose to opt+GC
+		//	MOVDU	8(R4), Rtmp
+		//	MOVDU	Rtmp, 8(R3)
+		//	CMP	R4, Rarg2
+		//	BLT	-3(PC)
+		{
+			name:      "LoweredMove",
+			aux:       "Int64",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R3"), buildReg("R4"), gp},
+				clobbers: buildReg("R3 R4"),
+			},
+			clobberFlags: true,
+			typ:          "Mem",
+		},
+
+		// (InvertFlags (CMP a b)) == (CMP b a)
+		// So if we want (LessThan (CMP a b)) but we can't do that because a is a constant,
+		// then we do (LessThan (InvertFlags (CMP b a))) instead.
+		// Rewrites will convert this to (GreaterThan (CMP b a)).
+		// InvertFlags is a pseudo-op which can't appear in assembly output.
+		{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+		// Constant flag values. For any comparison, there are 3 possible
+		// outcomes: either the three from the signed total order (<,==,>)
+		// or the three from the unsigned total order, depending on which
+		// comparison operation was used (CMP or CMPU -- PPC is different from
+		// the other architectures, which have a single comparison producing
+		// both signed and unsigned comparison results.)
+
+		// These ops are for temporary use by rewrite rules. They
+		// cannot appear in the generated assembly.
+		{name: "FlagEQ"}, // equal
+		{name: "FlagLT"}, // signed < or unsigned <
+		{name: "FlagGT"}, // signed > or unsigned >
+
+	}
+
+	blocks := []blockData{
+		{name: "EQ"},
+		{name: "NE"},
+		{name: "LT"},
+		{name: "LE"},
+		{name: "GT"},
+		{name: "GE"},
+		{name: "FLT"},
+		{name: "FLE"},
+		{name: "FGT"},
+		{name: "FGE"},
+	}
+
+	archs = append(archs, arch{
+		name:            "PPC64",
+		pkg:             "cmd/internal/obj/ppc64",
+		genfile:         "../../ppc64/ssa.go",
+		ops:             ops,
+		blocks:          blocks,
+		regnames:        regNamesPPC64,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		framepointerreg: int8(num["SP"]),
+	})
+}
--- a/src/cmd/compile/internal/ssa/gen/dec64.rules
+++ b/src/cmd/compile/internal/ssa/gen/dec64.rules
@@ -0,0 +1,407 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules to decompose [u]int64 types on 32-bit
+// architectures. These rules work together with the decomposeBuiltIn
+// pass which handles phis of these types.
+
+(Int64Hi (Int64Make hi _)) -> hi
+(Int64Lo (Int64Make _ lo)) -> lo
+
+// Assuming little endian (we don't support big endian 32-bit architecture yet)
+(Load <t> ptr mem) && is64BitInt(t) && t.IsSigned() ->
+	(Int64Make
+		(Load <config.fe.TypeInt32()> (OffPtr <config.fe.TypeInt32().PtrTo()> [4] ptr) mem)
+		(Load <config.fe.TypeUInt32()> ptr mem))
+(Load <t> ptr mem) && is64BitInt(t) && !t.IsSigned() ->
+	(Int64Make
+		(Load <config.fe.TypeUInt32()> (OffPtr <config.fe.TypeUInt32().PtrTo()> [4] ptr) mem)
+		(Load <config.fe.TypeUInt32()> ptr mem))
+
+(Store [8] dst (Int64Make hi lo) mem) ->
+	(Store [4]
+		(OffPtr <hi.Type.PtrTo()> [4] dst)
+		hi
+		(Store [4] dst lo mem))
+
+(Arg {n} [off]) && is64BitInt(v.Type) && v.Type.IsSigned() ->
+  (Int64Make
+    (Arg <config.fe.TypeInt32()> {n} [off+4])
+    (Arg <config.fe.TypeUInt32()> {n} [off]))
+(Arg {n} [off]) && is64BitInt(v.Type) && !v.Type.IsSigned() ->
+  (Int64Make
+    (Arg <config.fe.TypeUInt32()> {n} [off+4])
+    (Arg <config.fe.TypeUInt32()> {n} [off]))
+
+(Add64 x y) ->
+	(Int64Make
+		(Add32withcarry <config.fe.TypeInt32()>
+			(Int64Hi x)
+			(Int64Hi y)
+			(Select0 <TypeFlags> (Add32carry (Int64Lo x) (Int64Lo y))))
+		(Select1 <config.fe.TypeUInt32()> (Add32carry (Int64Lo x) (Int64Lo y))))
+
+(Sub64 x y) ->
+	(Int64Make
+		(Sub32withcarry <config.fe.TypeInt32()>
+			(Int64Hi x)
+			(Int64Hi y)
+			(Select0 <TypeFlags> (Sub32carry (Int64Lo x) (Int64Lo y))))
+		(Select1 <config.fe.TypeUInt32()> (Sub32carry (Int64Lo x) (Int64Lo y))))
+
+(Mul64 x y) ->
+	(Int64Make
+		(Add32 <config.fe.TypeUInt32()>
+			(Mul32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Hi y))
+			(Add32 <config.fe.TypeUInt32()>
+				(Mul32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Lo y))
+				(Select0 <config.fe.TypeUInt32()> (Mul32uhilo (Int64Lo x) (Int64Lo y)))))
+		(Select1 <config.fe.TypeUInt32()> (Mul32uhilo (Int64Lo x) (Int64Lo y))))
+
+(And64 x y) ->
+	(Int64Make
+		(And32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Hi y))
+		(And32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Lo y)))
+
+(Or64 x y) ->
+	(Int64Make
+		(Or32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Hi y))
+		(Or32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Lo y)))
+
+(Xor64 x y) ->
+	(Int64Make
+		(Xor32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Hi y))
+		(Xor32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Lo y)))
+
+(Neg64 <t> x) -> (Sub64 (Const64 <t> [0]) x)
+
+(Com64 x) ->
+	(Int64Make
+		(Com32 <config.fe.TypeUInt32()> (Int64Hi x))
+		(Com32 <config.fe.TypeUInt32()> (Int64Lo x)))
+
+(SignExt32to64 x) -> (Int64Make (Signmask x) x)
+(SignExt16to64 x) -> (SignExt32to64 (SignExt16to32 x))
+(SignExt8to64 x) -> (SignExt32to64 (SignExt8to32 x))
+
+(ZeroExt32to64 x) -> (Int64Make (Const32 <config.fe.TypeUInt32()> [0]) x)
+(ZeroExt16to64 x) -> (ZeroExt32to64 (ZeroExt16to32 x))
+(ZeroExt8to64 x) -> (ZeroExt32to64 (ZeroExt8to32 x))
+
+(Trunc64to32 (Int64Make _ lo)) -> lo
+(Trunc64to16 (Int64Make _ lo)) -> (Trunc32to16 lo)
+(Trunc64to8 (Int64Make _ lo)) -> (Trunc32to8 lo)
+
+(Lsh32x64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
+(Rsh32x64 x (Int64Make (Const32 [c]) _)) && c != 0 -> (Signmask x)
+(Rsh32Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
+(Lsh16x64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
+(Rsh16x64 x (Int64Make (Const32 [c]) _)) && c != 0 -> (Signmask (SignExt16to32 x))
+(Rsh16Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
+(Lsh8x64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
+(Rsh8x64 x (Int64Make (Const32 [c]) _)) && c != 0 -> (Signmask (SignExt8to32 x))
+(Rsh8Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
+
+(Lsh32x64 x (Int64Make (Const32 [0]) lo)) -> (Lsh32x32 x lo)
+(Rsh32x64 x (Int64Make (Const32 [0]) lo)) -> (Rsh32x32 x lo)
+(Rsh32Ux64 x (Int64Make (Const32 [0]) lo)) -> (Rsh32Ux32 x lo)
+(Lsh16x64 x (Int64Make (Const32 [0]) lo)) -> (Lsh16x32 x lo)
+(Rsh16x64 x (Int64Make (Const32 [0]) lo)) -> (Rsh16x32 x lo)
+(Rsh16Ux64 x (Int64Make (Const32 [0]) lo)) -> (Rsh16Ux32 x lo)
+(Lsh8x64 x (Int64Make (Const32 [0]) lo)) -> (Lsh8x32 x lo)
+(Rsh8x64 x (Int64Make (Const32 [0]) lo)) -> (Rsh8x32 x lo)
+(Rsh8Ux64 x (Int64Make (Const32 [0]) lo)) -> (Rsh8Ux32 x lo)
+
+(Lsh64x64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const64 [0])
+(Rsh64x64 x (Int64Make (Const32 [c]) _)) && c != 0 -> (Int64Make (Signmask (Int64Hi x)) (Signmask (Int64Hi x)))
+(Rsh64Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const64 [0])
+
+(Lsh64x64 x (Int64Make (Const32 [0]) lo)) -> (Lsh64x32 x lo)
+(Rsh64x64 x (Int64Make (Const32 [0]) lo)) -> (Rsh64x32 x lo)
+(Rsh64Ux64 x (Int64Make (Const32 [0]) lo)) -> (Rsh64Ux32 x lo)
+
+// turn x64 non-constant shifts to x32 shifts
+// if high 32-bit of the shift is nonzero, make a huge shift
+(Lsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Lsh64x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh64x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh64Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh64Ux32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Lsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Lsh32x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh32x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh32Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh32Ux32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Lsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Lsh16x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh16x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh16Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh16Ux32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Lsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Lsh8x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh8x32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+(Rsh8Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 ->
+	(Rsh8Ux32 x (Or32 <config.fe.TypeUInt32()> (Zeromask hi) lo))
+
+// 64x left shift
+// result.hi = hi<<s | lo>>(32-s) | lo<<(s-32) // >> is unsigned, large shifts result 0
+// result.lo = lo<<s
+(Lsh64x32 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Lsh32x32 <config.fe.TypeUInt32()> hi s)
+				(Rsh32Ux32 <config.fe.TypeUInt32()>
+					lo
+					(Sub32 <config.fe.TypeUInt32()> (Const32 <config.fe.TypeUInt32()> [32]) s)))
+			(Lsh32x32 <config.fe.TypeUInt32()>
+				lo
+				(Sub32 <config.fe.TypeUInt32()> s (Const32 <config.fe.TypeUInt32()> [32]))))
+		(Lsh32x32 <config.fe.TypeUInt32()> lo s))
+(Lsh64x16 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Lsh32x16 <config.fe.TypeUInt32()> hi s)
+				(Rsh32Ux16 <config.fe.TypeUInt32()>
+					lo
+					(Sub16 <config.fe.TypeUInt16()> (Const16 <config.fe.TypeUInt16()> [32]) s)))
+			(Lsh32x16 <config.fe.TypeUInt32()>
+				lo
+				(Sub16 <config.fe.TypeUInt16()> s (Const16 <config.fe.TypeUInt16()> [32]))))
+		(Lsh32x16 <config.fe.TypeUInt32()> lo s))
+(Lsh64x8 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Lsh32x8 <config.fe.TypeUInt32()> hi s)
+				(Rsh32Ux8 <config.fe.TypeUInt32()>
+					lo
+					(Sub8 <config.fe.TypeUInt8()> (Const8 <config.fe.TypeUInt8()> [32]) s)))
+			(Lsh32x8 <config.fe.TypeUInt32()>
+				lo
+				(Sub8 <config.fe.TypeUInt8()> s (Const8 <config.fe.TypeUInt8()> [32]))))
+		(Lsh32x8 <config.fe.TypeUInt32()> lo s))
+
+// 64x unsigned right shift
+// result.hi = hi>>s
+// result.lo = lo>>s | hi<<(32-s) | hi>>(s-32) // >> is unsigned, large shifts result 0
+(Rsh64Ux32 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Rsh32Ux32 <config.fe.TypeUInt32()> hi s)
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Rsh32Ux32 <config.fe.TypeUInt32()> lo s)
+				(Lsh32x32 <config.fe.TypeUInt32()>
+					hi
+					(Sub32 <config.fe.TypeUInt32()> (Const32 <config.fe.TypeUInt32()> [32]) s)))
+			(Rsh32Ux32 <config.fe.TypeUInt32()>
+				hi
+				(Sub32 <config.fe.TypeUInt32()> s (Const32 <config.fe.TypeUInt32()> [32])))))
+(Rsh64Ux16 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Rsh32Ux16 <config.fe.TypeUInt32()> hi s)
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Rsh32Ux16 <config.fe.TypeUInt32()> lo s)
+				(Lsh32x16 <config.fe.TypeUInt32()>
+					hi
+					(Sub16 <config.fe.TypeUInt16()> (Const16 <config.fe.TypeUInt16()> [32]) s)))
+			(Rsh32Ux16 <config.fe.TypeUInt32()>
+				hi
+				(Sub16 <config.fe.TypeUInt16()> s (Const16 <config.fe.TypeUInt16()> [32])))))
+(Rsh64Ux8 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Rsh32Ux8 <config.fe.TypeUInt32()> hi s)
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Rsh32Ux8 <config.fe.TypeUInt32()> lo s)
+				(Lsh32x8 <config.fe.TypeUInt32()>
+					hi
+					(Sub8 <config.fe.TypeUInt8()> (Const8 <config.fe.TypeUInt8()> [32]) s)))
+			(Rsh32Ux8 <config.fe.TypeUInt32()>
+				hi
+				(Sub8 <config.fe.TypeUInt8()> s (Const8 <config.fe.TypeUInt8()> [32])))))
+
+// 64x signed right shift
+// result.hi = hi>>s
+// result.lo = lo>>s | hi<<(32-s) | (hi>>(s-32))&zeromask(s>>5) // hi>>(s-32) is signed, large shifts result 0/-1
+(Rsh64x32 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Rsh32x32 <config.fe.TypeUInt32()> hi s)
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Rsh32Ux32 <config.fe.TypeUInt32()> lo s)
+				(Lsh32x32 <config.fe.TypeUInt32()>
+					hi
+					(Sub32 <config.fe.TypeUInt32()> (Const32 <config.fe.TypeUInt32()> [32]) s)))
+			(And32 <config.fe.TypeUInt32()>
+				(Rsh32x32 <config.fe.TypeUInt32()>
+					hi
+					(Sub32 <config.fe.TypeUInt32()> s (Const32 <config.fe.TypeUInt32()> [32])))
+				(Zeromask
+					(Rsh32Ux32 <config.fe.TypeUInt32()> s (Const32 <config.fe.TypeUInt32()> [5]))))))
+(Rsh64x16 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Rsh32x16 <config.fe.TypeUInt32()> hi s)
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Rsh32Ux16 <config.fe.TypeUInt32()> lo s)
+				(Lsh32x16 <config.fe.TypeUInt32()>
+					hi
+					(Sub16 <config.fe.TypeUInt16()> (Const16 <config.fe.TypeUInt16()> [32]) s)))
+			(And32 <config.fe.TypeUInt32()>
+				(Rsh32x16 <config.fe.TypeUInt32()>
+					hi
+					(Sub16 <config.fe.TypeUInt16()> s (Const16 <config.fe.TypeUInt16()> [32])))
+				(Zeromask
+					(ZeroExt16to32
+						(Rsh16Ux32 <config.fe.TypeUInt16()> s (Const32 <config.fe.TypeUInt32()> [5])))))))
+(Rsh64x8 (Int64Make hi lo) s) ->
+	(Int64Make
+		(Rsh32x8 <config.fe.TypeUInt32()> hi s)
+		(Or32 <config.fe.TypeUInt32()>
+			(Or32 <config.fe.TypeUInt32()>
+				(Rsh32Ux8 <config.fe.TypeUInt32()> lo s)
+				(Lsh32x8 <config.fe.TypeUInt32()>
+					hi
+					(Sub8 <config.fe.TypeUInt8()> (Const8 <config.fe.TypeUInt8()> [32]) s)))
+			(And32 <config.fe.TypeUInt32()>
+				(Rsh32x8 <config.fe.TypeUInt32()>
+					hi
+					(Sub8 <config.fe.TypeUInt8()> s (Const8 <config.fe.TypeUInt8()> [32])))
+				(Zeromask
+					(ZeroExt8to32
+						(Rsh8Ux32 <config.fe.TypeUInt8()> s (Const32 <config.fe.TypeUInt32()> [5])))))))
+
+// 64xConst32 shifts
+// we probably do not need them -- lateopt may take care of them just fine
+//(Lsh64x32 _ (Const32 [c])) && uint32(c) >= 64 -> (Const64 [0])
+//(Rsh64x32 x (Const32 [c])) && uint32(c) >= 64 -> (Int64Make (Signmask (Int64Hi x)) (Signmask (Int64Hi x)))
+//(Rsh64Ux32 _ (Const32 [c])) && uint32(c) >= 64 -> (Const64 [0])
+//
+//(Lsh64x32 x (Const32 [c])) && c < 64 && c > 32 ->
+//	(Int64Make
+//		(Lsh32x32 <config.fe.TypeUInt32()> (Int64Lo x) (Const32 <config.fe.TypeUInt32()> [c-32]))
+//		(Const32 <config.fe.TypeUInt32()> [0]))
+//(Rsh64x32 x (Const32 [c])) && c < 64 && c > 32 ->
+//	(Int64Make
+//		(Signmask (Int64Hi x))
+//		(Rsh32x32 <config.fe.TypeInt32()> (Int64Hi x) (Const32 <config.fe.TypeUInt32()> [c-32])))
+//(Rsh64Ux32 x (Const32 [c])) && c < 64 && c > 32 ->
+//	(Int64Make
+//		(Const32 <config.fe.TypeUInt32()> [0])
+//		(Rsh32Ux32 <config.fe.TypeUInt32()> (Int64Hi x) (Const32 <config.fe.TypeUInt32()> [c-32])))
+//
+//(Lsh64x32 x (Const32 [32])) -> (Int64Make (Int64Lo x) (Const32 <config.fe.TypeUInt32()> [0]))
+//(Rsh64x32 x (Const32 [32])) -> (Int64Make (Signmask (Int64Hi x)) (Int64Hi x))
+//(Rsh64Ux32 x (Const32 [32])) -> (Int64Make (Const32 <config.fe.TypeUInt32()> [0]) (Int64Hi x))
+//
+//(Lsh64x32 x (Const32 [c])) && c < 32 && c > 0 ->
+//	(Int64Make
+//		(Or32 <config.fe.TypeUInt32()>
+//			(Lsh32x32 <config.fe.TypeUInt32()> (Int64Hi x) (Const32 <config.fe.TypeUInt32()> [c]))
+//			(Rsh32Ux32 <config.fe.TypeUInt32()> (Int64Lo x) (Const32 <config.fe.TypeUInt32()> [32-c])))
+//		(Lsh32x32 <config.fe.TypeUInt32()> (Int64Lo x) (Const32 <config.fe.TypeUInt32()> [c])))
+//(Rsh64x32 x (Const32 [c])) && c < 32 && c > 0 ->
+//	(Int64Make
+//		(Rsh32x32 <config.fe.TypeInt32()> (Int64Hi x) (Const32 <config.fe.TypeUInt32()> [c]))
+//		(Or32 <config.fe.TypeUInt32()>
+//			(Rsh32Ux32 <config.fe.TypeUInt32()> (Int64Lo x) (Const32 <config.fe.TypeUInt32()> [c]))
+//			(Lsh32x32 <config.fe.TypeUInt32()> (Int64Hi x) (Const32 <config.fe.TypeUInt32()> [32-c]))))
+//(Rsh64Ux32 x (Const32 [c])) && c < 32 && c > 0 ->
+//	(Int64Make
+//		(Rsh32Ux32 <config.fe.TypeUInt32()> (Int64Hi x) (Const32 <config.fe.TypeUInt32()> [c]))
+//		(Or32 <config.fe.TypeUInt32()>
+//			(Rsh32Ux32 <config.fe.TypeUInt32()> (Int64Lo x) (Const32 <config.fe.TypeUInt32()> [c]))
+//			(Lsh32x32 <config.fe.TypeUInt32()> (Int64Hi x) (Const32 <config.fe.TypeUInt32()> [32-c]))))
+//
+//(Lsh64x32 x (Const32 [0])) -> x
+//(Rsh64x32 x (Const32 [0])) -> x
+//(Rsh64Ux32 x (Const32 [0])) -> x
+
+(Lrot64 (Int64Make hi lo) [c]) && c <= 32 ->
+	(Int64Make
+		(Or32 <config.fe.TypeUInt32()>
+			(Lsh32x32 <config.fe.TypeUInt32()> hi (Const32 <config.fe.TypeUInt32()> [c]))
+			(Rsh32Ux32 <config.fe.TypeUInt32()> lo (Const32 <config.fe.TypeUInt32()> [32-c])))
+		(Or32 <config.fe.TypeUInt32()>
+			(Lsh32x32 <config.fe.TypeUInt32()> lo (Const32 <config.fe.TypeUInt32()> [c]))
+			(Rsh32Ux32 <config.fe.TypeUInt32()> hi (Const32 <config.fe.TypeUInt32()> [32-c]))))
+(Lrot64 (Int64Make hi lo) [c]) && c > 32 -> (Lrot64 (Int64Make lo hi) [c-32])
+
+(Const64 <t> [c]) && t.IsSigned() ->
+	(Int64Make (Const32 <config.fe.TypeInt32()> [c>>32]) (Const32 <config.fe.TypeUInt32()> [int64(int32(c))]))
+(Const64 <t> [c]) && !t.IsSigned() ->
+	(Int64Make (Const32 <config.fe.TypeUInt32()> [c>>32]) (Const32 <config.fe.TypeUInt32()> [int64(int32(c))]))
+
+(Eq64 x y) ->
+	(AndB
+		(Eq32 (Int64Hi x) (Int64Hi y))
+		(Eq32 (Int64Lo x) (Int64Lo y)))
+
+(Neq64 x y) ->
+	(OrB
+		(Neq32 (Int64Hi x) (Int64Hi y))
+		(Neq32 (Int64Lo x) (Int64Lo y)))
+
+(Less64U x y) ->
+	(OrB
+		(Less32U (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Less32U (Int64Lo x) (Int64Lo y))))
+
+(Leq64U x y) ->
+	(OrB
+		(Less32U (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Leq32U (Int64Lo x) (Int64Lo y))))
+
+(Greater64U x y) ->
+	(OrB
+		(Greater32U (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Greater32U (Int64Lo x) (Int64Lo y))))
+
+(Geq64U x y) ->
+	(OrB
+		(Greater32U (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Geq32U (Int64Lo x) (Int64Lo y))))
+
+(Less64 x y) ->
+	(OrB
+		(Less32 (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Less32U (Int64Lo x) (Int64Lo y))))
+
+(Leq64 x y) ->
+	(OrB
+		(Less32 (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Leq32U (Int64Lo x) (Int64Lo y))))
+
+(Greater64 x y) ->
+	(OrB
+		(Greater32 (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Greater32U (Int64Lo x) (Int64Lo y))))
+
+(Geq64 x y) ->
+	(OrB
+		(Greater32 (Int64Hi x) (Int64Hi y))
+		(AndB
+			(Eq32 (Int64Hi x) (Int64Hi y))
+			(Geq32U (Int64Lo x) (Int64Lo y))))
--- a/src/cmd/compile/internal/ssa/gen/dec64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/dec64Ops.go
@@ -0,0 +1,20 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+var dec64Ops = []opData{}
+
+var dec64Blocks = []blockData{}
+
+func init() {
+	archs = append(archs, arch{
+		name:    "dec64",
+		ops:     dec64Ops,
+		blocks:  dec64Blocks,
+		generic: true,
+	})
+}
--- a/src/cmd/compile/internal/ssa/gen/generic.rules
+++ b/src/cmd/compile/internal/ssa/gen/generic.rules
@@ -67,6 +67,12 @@
        (Const32F [f2i(float64(i2f32(c) * i2f32(d)))])
 (Mul64F (Const64F [c]) (Const64F [d])) -> (Const64F [f2i(i2f(c) * i2f(d))])

+// Convert x * -1 to -x. The front-end catches some but not all of these.
+(Mul8  (Const8  [-1]) x) -> (Neg8  x)
+(Mul16 (Const16 [-1]) x) -> (Neg16 x)
+(Mul32 (Const32 [-1]) x) -> (Neg32 x)
+(Mul64 (Const64 [-1]) x) -> (Neg64 x)
+
 (Mod8  (Const8  [c]) (Const8  [d])) && d != 0 -> (Const8  [int64(int8(c % d))])
 (Mod16 (Const16 [c]) (Const16 [d])) && d != 0 -> (Const16 [int64(int16(c % d))])
 (Mod32 (Const32 [c]) (Const32 [d])) && d != 0 -> (Const32 [int64(int32(c % d))])
@@ -625,8 +631,10 @@
        (Store [t.FieldType(0).Size()] dst f0 mem))))

 // un-SSAable values use mem->mem copies
-(Store [size] dst (Load <t> src mem) mem) && !config.fe.CanSSA(t) -> (Move [size] dst src mem)
-(Store [size] dst (Load <t> src mem) (VarDef {x} mem)) && !config.fe.CanSSA(t) -> (Move [size] dst src (VarDef {x} mem))
+(Store [size] dst (Load <t> src mem) mem) && !config.fe.CanSSA(t) ->
+	(Move [MakeSizeAndAlign(size, t.Alignment()).Int64()] dst src mem)
+(Store [size] dst (Load <t> src mem) (VarDef {x} mem)) && !config.fe.CanSSA(t) ->
+	(Move [MakeSizeAndAlign(size, t.Alignment()).Int64()] dst src (VarDef {x} mem))

 // string ops
 // Decomposing StringMake and lowering of StringPtr and StringLen
@@ -832,3 +840,23 @@
  -> (Sub64 x (Mul64 <t> (Div64  <t> x (Const64 <t> [c])) (Const64 <t> [c])))
 (Mod64u <t> x (Const64 [c])) && x.Op != OpConst64 && umagic64ok(c)
  -> (Sub64 x (Mul64 <t> (Div64u <t> x (Const64 <t> [c])) (Const64 <t> [c])))
+
+// floating point optimizations
+(Add32F x (Const32F [0])) -> x
+(Add32F (Const32F [0]) x) -> x
+(Add64F x (Const64F [0])) -> x
+(Add64F (Const64F [0]) x) -> x
+(Sub32F x (Const32F [0])) -> x
+(Sub64F x (Const64F [0])) -> x
+(Mul32F x (Const32F [f2i(1)])) -> x
+(Mul32F (Const32F [f2i(1)]) x) -> x
+(Mul64F x (Const64F [f2i(1)])) -> x
+(Mul64F (Const64F [f2i(1)]) x) -> x
+(Mul32F x (Const32F [f2i(-1)])) -> (Neg32F x)
+(Mul32F (Const32F [f2i(-1)]) x) -> (Neg32F x)
+(Mul64F x (Const64F [f2i(-1)])) -> (Neg64F x)
+(Mul64F (Const64F [f2i(-1)]) x) -> (Neg64F x)
+(Div32F x (Const32F [f2i(1)])) -> x
+(Div64F x (Const64F [f2i(1)])) -> x
+(Div32F x (Const32F [f2i(-1)])) -> (Neg32F x)
+(Div64F x (Const64F [f2i(-1)])) -> (Neg32F x)
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -173,76 +173,76 @@ var genericOps = []opData{
 	{name: "Lrot64", argLength: 1, aux: "Int64"},

 	// 2-input comparisons
-	{name: "Eq8", argLength: 2, commutative: true}, // arg0 == arg1
-	{name: "Eq16", argLength: 2, commutative: true},
-	{name: "Eq32", argLength: 2, commutative: true},
-	{name: "Eq64", argLength: 2, commutative: true},
-	{name: "EqPtr", argLength: 2, commutative: true},
-	{name: "EqInter", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
-	{name: "EqSlice", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
-	{name: "Eq32F", argLength: 2},
-	{name: "Eq64F", argLength: 2},
+	{name: "Eq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1
+	{name: "Eq16", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "Eq32", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "Eq64", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "EqPtr", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "EqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+	{name: "EqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+	{name: "Eq32F", argLength: 2, typ: "Bool"},
+	{name: "Eq64F", argLength: 2, typ: "Bool"},

-	{name: "Neq8", argLength: 2, commutative: true}, // arg0 != arg1
-	{name: "Neq16", argLength: 2, commutative: true},
-	{name: "Neq32", argLength: 2, commutative: true},
-	{name: "Neq64", argLength: 2, commutative: true},
-	{name: "NeqPtr", argLength: 2, commutative: true},
-	{name: "NeqInter", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
-	{name: "NeqSlice", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
-	{name: "Neq32F", argLength: 2},
+	{name: "Neq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1
+	{name: "Neq16", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "Neq32", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "Neq64", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "NeqPtr", argLength: 2, commutative: true, typ: "Bool"},
+	{name: "NeqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+	{name: "NeqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+	{name: "Neq32F", argLength: 2, typ: "Bool"},
 	{name: "Neq64F", argLength: 2},

-	{name: "Less8", argLength: 2},  // arg0 < arg1, signed
-	{name: "Less8U", argLength: 2}, // arg0 < arg1, unsigned
-	{name: "Less16", argLength: 2},
-	{name: "Less16U", argLength: 2},
-	{name: "Less32", argLength: 2},
-	{name: "Less32U", argLength: 2},
-	{name: "Less64", argLength: 2},
-	{name: "Less64U", argLength: 2},
-	{name: "Less32F", argLength: 2},
-	{name: "Less64F", argLength: 2},
+	{name: "Less8", argLength: 2, typ: "Bool"},  // arg0 < arg1, signed
+	{name: "Less8U", argLength: 2, typ: "Bool"}, // arg0 < arg1, unsigned
+	{name: "Less16", argLength: 2, typ: "Bool"},
+	{name: "Less16U", argLength: 2, typ: "Bool"},
+	{name: "Less32", argLength: 2, typ: "Bool"},
+	{name: "Less32U", argLength: 2, typ: "Bool"},
+	{name: "Less64", argLength: 2, typ: "Bool"},
+	{name: "Less64U", argLength: 2, typ: "Bool"},
+	{name: "Less32F", argLength: 2, typ: "Bool"},
+	{name: "Less64F", argLength: 2, typ: "Bool"},

-	{name: "Leq8", argLength: 2},  // arg0 <= arg1, signed
-	{name: "Leq8U", argLength: 2}, // arg0 <= arg1, unsigned
-	{name: "Leq16", argLength: 2},
-	{name: "Leq16U", argLength: 2},
-	{name: "Leq32", argLength: 2},
-	{name: "Leq32U", argLength: 2},
-	{name: "Leq64", argLength: 2},
-	{name: "Leq64U", argLength: 2},
-	{name: "Leq32F", argLength: 2},
-	{name: "Leq64F", argLength: 2},
+	{name: "Leq8", argLength: 2, typ: "Bool"},  // arg0 <= arg1, signed
+	{name: "Leq8U", argLength: 2, typ: "Bool"}, // arg0 <= arg1, unsigned
+	{name: "Leq16", argLength: 2, typ: "Bool"},
+	{name: "Leq16U", argLength: 2, typ: "Bool"},
+	{name: "Leq32", argLength: 2, typ: "Bool"},
+	{name: "Leq32U", argLength: 2, typ: "Bool"},
+	{name: "Leq64", argLength: 2, typ: "Bool"},
+	{name: "Leq64U", argLength: 2, typ: "Bool"},
+	{name: "Leq32F", argLength: 2, typ: "Bool"},
+	{name: "Leq64F", argLength: 2, typ: "Bool"},

-	{name: "Greater8", argLength: 2},  // arg0 > arg1, signed
-	{name: "Greater8U", argLength: 2}, // arg0 > arg1, unsigned
-	{name: "Greater16", argLength: 2},
-	{name: "Greater16U", argLength: 2},
-	{name: "Greater32", argLength: 2},
-	{name: "Greater32U", argLength: 2},
-	{name: "Greater64", argLength: 2},
-	{name: "Greater64U", argLength: 2},
-	{name: "Greater32F", argLength: 2},
-	{name: "Greater64F", argLength: 2},
+	{name: "Greater8", argLength: 2, typ: "Bool"},  // arg0 > arg1, signed
+	{name: "Greater8U", argLength: 2, typ: "Bool"}, // arg0 > arg1, unsigned
+	{name: "Greater16", argLength: 2, typ: "Bool"},
+	{name: "Greater16U", argLength: 2, typ: "Bool"},
+	{name: "Greater32", argLength: 2, typ: "Bool"},
+	{name: "Greater32U", argLength: 2, typ: "Bool"},
+	{name: "Greater64", argLength: 2, typ: "Bool"},
+	{name: "Greater64U", argLength: 2, typ: "Bool"},
+	{name: "Greater32F", argLength: 2, typ: "Bool"},
+	{name: "Greater64F", argLength: 2, typ: "Bool"},

-	{name: "Geq8", argLength: 2},  // arg0 <= arg1, signed
-	{name: "Geq8U", argLength: 2}, // arg0 <= arg1, unsigned
-	{name: "Geq16", argLength: 2},
-	{name: "Geq16U", argLength: 2},
-	{name: "Geq32", argLength: 2},
-	{name: "Geq32U", argLength: 2},
-	{name: "Geq64", argLength: 2},
-	{name: "Geq64U", argLength: 2},
-	{name: "Geq32F", argLength: 2},
-	{name: "Geq64F", argLength: 2},
+	{name: "Geq8", argLength: 2, typ: "Bool"},  // arg0 <= arg1, signed
+	{name: "Geq8U", argLength: 2, typ: "Bool"}, // arg0 <= arg1, unsigned
+	{name: "Geq16", argLength: 2, typ: "Bool"},
+	{name: "Geq16U", argLength: 2, typ: "Bool"},
+	{name: "Geq32", argLength: 2, typ: "Bool"},
+	{name: "Geq32U", argLength: 2, typ: "Bool"},
+	{name: "Geq64", argLength: 2, typ: "Bool"},
+	{name: "Geq64U", argLength: 2, typ: "Bool"},
+	{name: "Geq32F", argLength: 2, typ: "Bool"},
+	{name: "Geq64F", argLength: 2, typ: "Bool"},

 	// boolean ops
-	{name: "AndB", argLength: 2}, // arg0 && arg1 (not shortcircuited)
-	{name: "OrB", argLength: 2},  // arg0 || arg1 (not shortcircuited)
-	{name: "EqB", argLength: 2},  // arg0 == arg1
-	{name: "NeqB", argLength: 2}, // arg0 != arg1
-	{name: "Not", argLength: 1},  // !arg0, boolean
+	{name: "AndB", argLength: 2, typ: "Bool"}, // arg0 && arg1 (not shortcircuited)
+	{name: "OrB", argLength: 2, typ: "Bool"},  // arg0 || arg1 (not shortcircuited)
+	{name: "EqB", argLength: 2, typ: "Bool"},  // arg0 == arg1
+	{name: "NeqB", argLength: 2, typ: "Bool"}, // arg0 != arg1
+	{name: "Not", argLength: 1, typ: "Bool"},  // !arg0, boolean

 	// 1-input ops
 	{name: "Neg8", argLength: 1}, // -arg0
@@ -312,8 +312,8 @@ var genericOps = []opData{
 	// Memory operations
 	{name: "Load", argLength: 2},                            // Load from arg0.  arg1=memory
 	{name: "Store", argLength: 3, typ: "Mem", aux: "Int64"}, // Store arg1 to arg0.  arg2=memory, auxint=size.  Returns memory.
-	{name: "Move", argLength: 3, aux: "Int64"},              // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size.  Returns memory.
-	{name: "Zero", argLength: 2, aux: "Int64"},              // arg0=destptr, arg1=mem, auxint=size. Returns memory.
+	{name: "Move", argLength: 3, typ: "Mem", aux: "Int64"},  // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size.  Returns memory.
+	{name: "Zero", argLength: 2, typ: "Mem", aux: "Int64"},  // arg0=destptr, arg1=mem, auxint=size. Returns memory.

 	// Function calls. Arguments to the call have already been written to the stack.
 	// Return values appear on the stack. The method receiver, if any, is treated
@@ -326,17 +326,17 @@ var genericOps = []opData{

 	// Conversions: signed extensions, zero (unsigned) extensions, truncations
 	{name: "SignExt8to16", argLength: 1, typ: "Int16"},
-	{name: "SignExt8to32", argLength: 1},
-	{name: "SignExt8to64", argLength: 1},
-	{name: "SignExt16to32", argLength: 1},
-	{name: "SignExt16to64", argLength: 1},
-	{name: "SignExt32to64", argLength: 1},
+	{name: "SignExt8to32", argLength: 1, typ: "Int32"},
+	{name: "SignExt8to64", argLength: 1, typ: "Int64"},
+	{name: "SignExt16to32", argLength: 1, typ: "Int32"},
+	{name: "SignExt16to64", argLength: 1, typ: "Int64"},
+	{name: "SignExt32to64", argLength: 1, typ: "Int64"},
 	{name: "ZeroExt8to16", argLength: 1, typ: "UInt16"},
-	{name: "ZeroExt8to32", argLength: 1},
-	{name: "ZeroExt8to64", argLength: 1},
-	{name: "ZeroExt16to32", argLength: 1},
-	{name: "ZeroExt16to64", argLength: 1},
-	{name: "ZeroExt32to64", argLength: 1},
+	{name: "ZeroExt8to32", argLength: 1, typ: "UInt32"},
+	{name: "ZeroExt8to64", argLength: 1, typ: "UInt64"},
+	{name: "ZeroExt16to32", argLength: 1, typ: "UInt32"},
+	{name: "ZeroExt16to64", argLength: 1, typ: "UInt64"},
+	{name: "ZeroExt32to64", argLength: 1, typ: "UInt64"},
 	{name: "Trunc16to8", argLength: 1},
 	{name: "Trunc32to8", argLength: 1},
 	{name: "Trunc32to16", argLength: 1},
@@ -416,6 +416,31 @@ var genericOps = []opData{
 	{name: "VarKill", argLength: 1, aux: "Sym"},            // aux is a *gc.Node of a variable that is known to be dead.  arg0=mem, returns mem
 	{name: "VarLive", argLength: 1, aux: "Sym"},            // aux is a *gc.Node of a variable that must be kept live.  arg0=mem, returns mem
 	{name: "KeepAlive", argLength: 2, typ: "Mem"},          // arg[0] is a value that must be kept alive until this mark.  arg[1]=mem, returns mem
+
+	// Ops for breaking 64-bit operations on 32-bit architectures
+	{name: "Int64Make", argLength: 2, typ: "UInt64"}, // arg0=hi, arg1=lo
+	{name: "Int64Hi", argLength: 1, typ: "UInt32"},   // high 32-bit of arg0
+	{name: "Int64Lo", argLength: 1, typ: "UInt32"},   // low 32-bit of arg0
+
+	{name: "Add32carry", argLength: 2, commutative: true, typ: "(Flags,UInt32)"}, // arg0 + arg1, returns (carry, value)
+	{name: "Add32withcarry", argLength: 3, commutative: true},                    // arg0 + arg1 + arg2, arg2=carry (0 or 1)
+
+	{name: "Sub32carry", argLength: 2, typ: "(Flags,UInt32)"}, // arg0 - arg1, returns (carry, value)
+	{name: "Sub32withcarry", argLength: 3},                    // arg0 - arg1 - arg2, arg2=carry (0 or 1)
+
+	{name: "Mul32uhilo", argLength: 2, typ: "(UInt32,UInt32)"}, // arg0 * arg1, returns (hi, lo)
+
+	{name: "Signmask", argLength: 1, typ: "Int32"},  // 0 if arg0 >= 0, -1 if arg0 < 0
+	{name: "Zeromask", argLength: 1, typ: "UInt32"}, // 0 if arg0 == 0, 0xffffffff if arg0 != 0
+
+	{name: "Cvt32Uto32F", argLength: 1}, // uint32 -> float32, only used on 32-bit arch
+	{name: "Cvt32Uto64F", argLength: 1}, // uint32 -> float64, only used on 32-bit arch
+	{name: "Cvt32Fto32U", argLength: 1}, // float32 -> uint32, only used on 32-bit arch
+	{name: "Cvt64Fto32U", argLength: 1}, // float64 -> uint32, only used on 32-bit arch
+
+	// pseudo-ops for breaking Tuple
+	{name: "Select0", argLength: 1}, // the first component of a tuple
+	{name: "Select1", argLength: 1}, // the second component of a tuple
 }

 //     kind           control    successors       implicit exit
--- a/src/cmd/compile/internal/ssa/gen/main.go
+++ b/src/cmd/compile/internal/ssa/gen/main.go
@@ -21,13 +21,16 @@ import (
 )

 type arch struct {
-	name     string
-	pkg      string // obj package to import for this arch.
-	genfile  string // source file containing opcode code generation.
-	ops      []opData
-	blocks   []blockData
-	regnames []string
-	generic  bool
+	name            string
+	pkg             string // obj package to import for this arch.
+	genfile         string // source file containing opcode code generation.
+	ops             []opData
+	blocks          []blockData
+	regnames        []string
+	gpregmask       regMask
+	fpregmask       regMask
+	framepointerreg int8
+	generic         bool
 }

 type opData struct {
@@ -38,8 +41,9 @@ type opData struct {
 	aux               string
 	rematerializeable bool
 	argLength         int32 // number of arguments, if -1, then this operation has a variable number of arguments
-	commutative       bool  // this operation is commutative (e.g. addition)
-	resultInArg0      bool  // v and v.Args[0] must be allocated to the same register
+	commutative       bool  // this operation is commutative on its first 2 arguments (e.g. addition)
+	resultInArg0      bool  // last output of v and v.Args[0] must be allocated to the same register
+	clobberFlags      bool  // this op clobbers flags register
 }

 type blockData struct {
@@ -73,6 +77,7 @@ var archs []arch

 func main() {
 	flag.Parse()
+	sort.Sort(ArchsByName(archs))
 	genOp()
 	genLower()
 }
@@ -155,13 +160,16 @@ func genOp() {
 			}
 			if v.resultInArg0 {
 				fmt.Fprintln(w, "resultInArg0: true,")
-				if v.reg.inputs[0] != v.reg.outputs[0] {
-					log.Fatalf("input[0] and output registers must be equal for %s", v.name)
+				if v.reg.inputs[0] != v.reg.outputs[len(v.reg.outputs)-1] {
+					log.Fatalf("input[0] and last output register must be equal for %s", v.name)
 				}
-				if v.commutative && v.reg.inputs[1] != v.reg.outputs[0] {
-					log.Fatalf("input[1] and output registers must be equal for %s", v.name)
+				if v.commutative && v.reg.inputs[1] != v.reg.outputs[len(v.reg.outputs)-1] {
+					log.Fatalf("input[1] and last output register must be equal for %s", v.name)
 				}
 			}
+			if v.clobberFlags {
+				fmt.Fprintln(w, "clobberFlags: true,")
+			}
 			if a.name == "generic" {
 				fmt.Fprintln(w, "generic:true,")
 				fmt.Fprintln(w, "},") // close op
@@ -191,14 +199,22 @@ func genOp() {
 				}
 				fmt.Fprintln(w, "},")
 			}
+
 			if v.reg.clobbers > 0 {
 				fmt.Fprintf(w, "clobbers: %d,%s\n", v.reg.clobbers, a.regMaskComment(v.reg.clobbers))
 			}
+
 			// reg outputs
-			if len(v.reg.outputs) > 0 {
-				fmt.Fprintln(w, "outputs: []regMask{")
-				for _, r := range v.reg.outputs {
-					fmt.Fprintf(w, "%d,%s\n", r, a.regMaskComment(r))
+			s = s[:0]
+			for i, r := range v.reg.outputs {
+				s = append(s, intPair{countRegs(r), i})
+			}
+			if len(s) > 0 {
+				sort.Sort(byKey(s))
+				fmt.Fprintln(w, "outputs: []outputInfo{")
+				for _, p := range s {
+					r := v.reg.outputs[p.val]
+					fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r))
 				}
 				fmt.Fprintln(w, "},")
 			}
@@ -223,6 +239,9 @@ func genOp() {
 			fmt.Fprintf(w, "  {%d, \"%s\"},\n", i, r)
 		}
 		fmt.Fprintln(w, "}")
+		fmt.Fprintf(w, "var gpRegMask%s = regMask(%d)\n", a.name, a.gpregmask)
+		fmt.Fprintf(w, "var fpRegMask%s = regMask(%d)\n", a.name, a.fpregmask)
+		fmt.Fprintf(w, "var framepointerReg%s = int8(%d)\n", a.name, a.framepointerreg)
 	}

 	// gofmt result
@@ -298,3 +317,9 @@ type byKey []intPair
 func (a byKey) Len() int           { return len(a) }
 func (a byKey) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a byKey) Less(i, j int) bool { return a[i].key < a[j].key }
+
+type ArchsByName []arch
+
+func (x ArchsByName) Len() int           { return len(x) }
+func (x ArchsByName) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
+func (x ArchsByName) Less(i, j int) bool { return x[i].name < x[j].name }
--- a/src/cmd/compile/internal/ssa/gen/rulegen.go
+++ b/src/cmd/compile/internal/ssa/gen/rulegen.go
@@ -117,15 +117,17 @@ func genRules(arch arch) {
 		if unbalanced(rule) {
 			continue
 		}
-		op := strings.Split(rule, " ")[0][1:]
-		if op[len(op)-1] == ')' {
-			op = op[:len(op)-1] // rule has only opcode, e.g. (ConstNil) -> ...
-		}
+
 		loc := fmt.Sprintf("%s.rules:%d", arch.name, ruleLineno)
-		if isBlock(op, arch) {
-			blockrules[op] = append(blockrules[op], Rule{rule: rule, loc: loc})
+		r := Rule{rule: rule, loc: loc}
+		if rawop := strings.Split(rule, " ")[0][1:]; isBlock(rawop, arch) {
+			blockrules[rawop] = append(blockrules[rawop], r)
 		} else {
-			oprules[op] = append(oprules[op], Rule{rule: rule, loc: loc})
+			// Do fancier value op matching.
+			match, _, _ := r.parse()
+			op, oparch, _, _, _, _ := parseValue(match, arch, loc)
+			opname := fmt.Sprintf("Op%s%s", oparch, op.name)
+			oprules[opname] = append(oprules[opname], r)
 		}
 		rule = ""
 		ruleLineno = 0
@@ -157,8 +159,8 @@ func genRules(arch arch) {
 	fmt.Fprintf(w, "func rewriteValue%s(v *Value, config *Config) bool {\n", arch.name)
 	fmt.Fprintf(w, "switch v.Op {\n")
 	for _, op := range ops {
-		fmt.Fprintf(w, "case %s:\n", opName(op, arch))
-		fmt.Fprintf(w, "return rewriteValue%s_%s(v, config)\n", arch.name, opName(op, arch))
+		fmt.Fprintf(w, "case %s:\n", op)
+		fmt.Fprintf(w, "return rewriteValue%s_%s(v, config)\n", arch.name, op)
 	}
 	fmt.Fprintf(w, "}\n")
 	fmt.Fprintf(w, "return false\n")
@@ -167,7 +169,7 @@ func genRules(arch arch) {
 	// Generate a routine per op. Note that we don't make one giant routine
 	// because it is too big for some compilers.
 	for _, op := range ops {
-		fmt.Fprintf(w, "func rewriteValue%s_%s(v *Value, config *Config) bool {\n", arch.name, opName(op, arch))
+		fmt.Fprintf(w, "func rewriteValue%s_%s(v *Value, config *Config) bool {\n", arch.name, op)
 		fmt.Fprintln(w, "b := v.Block")
 		fmt.Fprintln(w, "_ = b")
 		var canFail bool
@@ -334,141 +336,108 @@ func genMatch0(w io.Writer, arch arch, match, v string, m map[string]struct{}, t
 	}
 	canFail := false

-	// split body up into regions. Split by spaces/tabs, except those
-	// contained in () or {}.
-	s := split(match[1 : len(match)-1]) // remove parens, then split
-
-	// Find op record
-	var op opData
-	for _, x := range genericOps {
-		if x.name == s[0] {
-			op = x
-			break
-		}
-	}
-	for _, x := range arch.ops {
-		if x.name == s[0] {
-			op = x
-			break
-		}
-	}
-	if op.name == "" {
-		log.Fatalf("%s: unknown op %s", loc, s[0])
-	}
+	op, oparch, typ, auxint, aux, args := parseValue(match, arch, loc)

 	// check op
 	if !top {
-		fmt.Fprintf(w, "if %s.Op != %s {\nbreak\n}\n", v, opName(s[0], arch))
+		fmt.Fprintf(w, "if %s.Op != Op%s%s {\nbreak\n}\n", v, oparch, op.name)
 		canFail = true
 	}

-	// check type/aux/args
-	argnum := 0
-	for _, a := range s[1:] {
-		if a[0] == '<' {
-			// type restriction
-			t := a[1 : len(a)-1] // remove <>
-			if !isVariable(t) {
-				// code. We must match the results of this code.
-				fmt.Fprintf(w, "if %s.Type != %s {\nbreak\n}\n", v, t)
+	if typ != "" {
+		if !isVariable(typ) {
+			// code. We must match the results of this code.
+			fmt.Fprintf(w, "if %s.Type != %s {\nbreak\n}\n", v, typ)
+			canFail = true
+		} else {
+			// variable
+			if _, ok := m[typ]; ok {
+				// must match previous variable
+				fmt.Fprintf(w, "if %s.Type != %s {\nbreak\n}\n", v, typ)
 				canFail = true
 			} else {
-				// variable
-				if _, ok := m[t]; ok {
-					// must match previous variable
-					fmt.Fprintf(w, "if %s.Type != %s {\nbreak\n}\n", v, t)
-					canFail = true
-				} else {
-					m[t] = struct{}{}
-					fmt.Fprintf(w, "%s := %s.Type\n", t, v)
-				}
+				m[typ] = struct{}{}
+				fmt.Fprintf(w, "%s := %s.Type\n", typ, v)
 			}
-		} else if a[0] == '[' {
-			// auxint restriction
-			switch op.aux {
-			case "Bool", "Int8", "Int16", "Int32", "Int64", "Int128", "Float32", "Float64", "SymOff", "SymValAndOff", "SymInt32":
-			default:
-				log.Fatalf("%s: op %s %s can't have auxint", loc, op.name, op.aux)
-			}
-			x := a[1 : len(a)-1] // remove []
-			if !isVariable(x) {
-				// code
-				fmt.Fprintf(w, "if %s.AuxInt != %s {\nbreak\n}\n", v, x)
+		}
+	}
+
+	if auxint != "" {
+		if !isVariable(auxint) {
+			// code
+			fmt.Fprintf(w, "if %s.AuxInt != %s {\nbreak\n}\n", v, auxint)
+			canFail = true
+		} else {
+			// variable
+			if _, ok := m[auxint]; ok {
+				fmt.Fprintf(w, "if %s.AuxInt != %s {\nbreak\n}\n", v, auxint)
 				canFail = true
 			} else {
-				// variable
-				if _, ok := m[x]; ok {
-					fmt.Fprintf(w, "if %s.AuxInt != %s {\nbreak\n}\n", v, x)
-					canFail = true
-				} else {
-					m[x] = struct{}{}
-					fmt.Fprintf(w, "%s := %s.AuxInt\n", x, v)
-				}
+				m[auxint] = struct{}{}
+				fmt.Fprintf(w, "%s := %s.AuxInt\n", auxint, v)
 			}
-		} else if a[0] == '{' {
-			// aux restriction
-			switch op.aux {
-			case "String", "Sym", "SymOff", "SymValAndOff", "SymInt32":
-			default:
-				log.Fatalf("%s: op %s %s can't have aux", loc, op.name, op.aux)
-			}
-			x := a[1 : len(a)-1] // remove {}
-			if !isVariable(x) {
-				// code
-				fmt.Fprintf(w, "if %s.Aux != %s {\nbreak\n}\n", v, x)
+		}
+	}
+
+	if aux != "" {
+
+		if !isVariable(aux) {
+			// code
+			fmt.Fprintf(w, "if %s.Aux != %s {\nbreak\n}\n", v, aux)
+			canFail = true
+		} else {
+			// variable
+			if _, ok := m[aux]; ok {
+				fmt.Fprintf(w, "if %s.Aux != %s {\nbreak\n}\n", v, aux)
 				canFail = true
 			} else {
-				// variable
-				if _, ok := m[x]; ok {
-					fmt.Fprintf(w, "if %s.Aux != %s {\nbreak\n}\n", v, x)
-					canFail = true
-				} else {
-					m[x] = struct{}{}
-					fmt.Fprintf(w, "%s := %s.Aux\n", x, v)
-				}
+				m[aux] = struct{}{}
+				fmt.Fprintf(w, "%s := %s.Aux\n", aux, v)
 			}
-		} else if a == "_" {
-			argnum++
-		} else if !strings.Contains(a, "(") {
+		}
+	}
+
+	for i, arg := range args {
+		if arg == "_" {
+			continue
+		}
+		if !strings.Contains(arg, "(") {
 			// leaf variable
-			if _, ok := m[a]; ok {
+			if _, ok := m[arg]; ok {
 				// variable already has a definition. Check whether
 				// the old definition and the new definition match.
 				// For example, (add x x).  Equality is just pointer equality
 				// on Values (so cse is important to do before lowering).
-				fmt.Fprintf(w, "if %s != %s.Args[%d] {\nbreak\n}\n", a, v, argnum)
+				fmt.Fprintf(w, "if %s != %s.Args[%d] {\nbreak\n}\n", arg, v, i)
 				canFail = true
 			} else {
 				// remember that this variable references the given value
-				m[a] = struct{}{}
-				fmt.Fprintf(w, "%s := %s.Args[%d]\n", a, v, argnum)
+				m[arg] = struct{}{}
+				fmt.Fprintf(w, "%s := %s.Args[%d]\n", arg, v, i)
 			}
-			argnum++
+			continue
+		}
+		// compound sexpr
+		var argname string
+		colon := strings.Index(arg, ":")
+		openparen := strings.Index(arg, "(")
+		if colon >= 0 && openparen >= 0 && colon < openparen {
+			// rule-specified name
+			argname = arg[:colon]
+			arg = arg[colon+1:]
 		} else {
-			// compound sexpr
-			var argname string
-			colon := strings.Index(a, ":")
-			openparen := strings.Index(a, "(")
-			if colon >= 0 && openparen >= 0 && colon < openparen {
-				// rule-specified name
-				argname = a[:colon]
-				a = a[colon+1:]
-			} else {
-				// autogenerated name
-				argname = fmt.Sprintf("%s_%d", v, argnum)
-			}
-			fmt.Fprintf(w, "%s := %s.Args[%d]\n", argname, v, argnum)
-			if genMatch0(w, arch, a, argname, m, false, loc) {
-				canFail = true
-			}
-			argnum++
+			// autogenerated name
+			argname = fmt.Sprintf("%s_%d", v, i)
+		}
+		fmt.Fprintf(w, "%s := %s.Args[%d]\n", argname, v, i)
+		if genMatch0(w, arch, arg, argname, m, false, loc) {
+			canFail = true
 		}
 	}
+
 	if op.argLength == -1 {
-		fmt.Fprintf(w, "if len(%s.Args) != %d {\nbreak\n}\n", v, argnum)
+		fmt.Fprintf(w, "if len(%s.Args) != %d {\nbreak\n}\n", v, len(args))
 		canFail = true
-	} else if int(op.argLength) != argnum {
-		log.Fatalf("%s: op %s should have %d args, has %d", loc, op.name, op.argLength, argnum)
 	}
 	return canFail
 }
@@ -500,105 +469,44 @@ func genResult0(w io.Writer, arch arch, result string, alloc *int, top, move boo
 		return result
 	}

-	s := split(result[1 : len(result)-1]) // remove parens, then split
-
-	// Find op record
-	var op opData
-	for _, x := range genericOps {
-		if x.name == s[0] {
-			op = x
-			break
-		}
-	}
-	for _, x := range arch.ops {
-		if x.name == s[0] {
-			op = x
-			break
-		}
-	}
-	if op.name == "" {
-		log.Fatalf("%s: unknown op %s", loc, s[0])
-	}
+	op, oparch, typ, auxint, aux, args := parseValue(result, arch, loc)

 	// Find the type of the variable.
-	var opType string
-	var typeOverride bool
-	for _, a := range s[1:] {
-		if a[0] == '<' {
-			// type restriction
-			opType = a[1 : len(a)-1] // remove <>
-			typeOverride = true
-			break
-		}
-	}
-	if opType == "" {
-		// find default type, if any
-		for _, op := range arch.ops {
-			if op.name == s[0] && op.typ != "" {
-				opType = typeName(op.typ)
-				break
-			}
-		}
-	}
-	if opType == "" {
-		for _, op := range genericOps {
-			if op.name == s[0] && op.typ != "" {
-				opType = typeName(op.typ)
-				break
-			}
-		}
+	typeOverride := typ != ""
+	if typ == "" && op.typ != "" {
+		typ = typeName(op.typ)
 	}
+
 	var v string
 	if top && !move {
 		v = "v"
-		fmt.Fprintf(w, "v.reset(%s)\n", opName(s[0], arch))
+		fmt.Fprintf(w, "v.reset(Op%s%s)\n", oparch, op.name)
 		if typeOverride {
-			fmt.Fprintf(w, "v.Type = %s\n", opType)
+			fmt.Fprintf(w, "v.Type = %s\n", typ)
 		}
 	} else {
-		if opType == "" {
-			log.Fatalf("sub-expression %s (op=%s) must have a type", result, s[0])
+		if typ == "" {
+			log.Fatalf("sub-expression %s (op=Op%s%s) must have a type", result, oparch, op.name)
 		}
 		v = fmt.Sprintf("v%d", *alloc)
 		*alloc++
-		fmt.Fprintf(w, "%s := b.NewValue0(v.Line, %s, %s)\n", v, opName(s[0], arch), opType)
+		fmt.Fprintf(w, "%s := b.NewValue0(v.Line, Op%s%s, %s)\n", v, oparch, op.name, typ)
 		if move && top {
 			// Rewrite original into a copy
 			fmt.Fprintf(w, "v.reset(OpCopy)\n")
 			fmt.Fprintf(w, "v.AddArg(%s)\n", v)
 		}
 	}
-	argnum := 0
-	for _, a := range s[1:] {
-		if a[0] == '<' {
-			// type restriction, handled above
-		} else if a[0] == '[' {
-			// auxint restriction
-			switch op.aux {
-			case "Bool", "Int8", "Int16", "Int32", "Int64", "Int128", "Float32", "Float64", "SymOff", "SymValAndOff", "SymInt32":
-			default:
-				log.Fatalf("%s: op %s %s can't have auxint", loc, op.name, op.aux)
-			}
-			x := a[1 : len(a)-1] // remove []
-			fmt.Fprintf(w, "%s.AuxInt = %s\n", v, x)
-		} else if a[0] == '{' {
-			// aux restriction
-			switch op.aux {
-			case "String", "Sym", "SymOff", "SymValAndOff", "SymInt32":
-			default:
-				log.Fatalf("%s: op %s %s can't have aux", loc, op.name, op.aux)
-			}
-			x := a[1 : len(a)-1] // remove {}
-			fmt.Fprintf(w, "%s.Aux = %s\n", v, x)
-		} else {
-			// regular argument (sexpr or variable)
-			x := genResult0(w, arch, a, alloc, false, move, loc)
-			fmt.Fprintf(w, "%s.AddArg(%s)\n", v, x)
-			argnum++
-		}
+
+	if auxint != "" {
+		fmt.Fprintf(w, "%s.AuxInt = %s\n", v, auxint)
 	}
-	if op.argLength != -1 && int(op.argLength) != argnum {
-		log.Fatalf("%s: op %s should have %d args, has %d", loc, op.name, op.argLength, argnum)
+	if aux != "" {
+		fmt.Fprintf(w, "%s.Aux = %s\n", v, aux)
+	}
+	for _, arg := range args {
+		x := genResult0(w, arch, arg, alloc, false, move, loc)
+		fmt.Fprintf(w, "%s.AddArg(%s)\n", v, x)
 	}

 	return v
@@ -666,16 +574,102 @@ func isBlock(name string, arch arch) bool {
 	return false
 }

-// opName converts from an op name specified in a rule file to an Op enum.
-// if the name matches a generic op, returns "Op" plus the specified name.
-// Otherwise, returns "Op" plus arch name plus op name.
-func opName(name string, arch arch) string {
-	for _, op := range genericOps {
-		if op.name == name {
-			return "Op" + name
+// parseValue parses a parenthesized value from a rule.
+// The value can be from the match or the result side.
+// It returns the op and unparsed strings for typ, auxint, and aux restrictions and for all args.
+// oparch is the architecture that op is located in, or "" for generic.
+func parseValue(val string, arch arch, loc string) (op opData, oparch string, typ string, auxint string, aux string, args []string) {
+	val = val[1 : len(val)-1] // remove ()
+
+	// Split val up into regions.
+	// Split by spaces/tabs, except those contained in (), {}, [], or <>.
+	s := split(val)
+
+	// Extract restrictions and args.
+	for _, a := range s[1:] {
+		switch a[0] {
+		case '<':
+			typ = a[1 : len(a)-1] // remove <>
+		case '[':
+			auxint = a[1 : len(a)-1] // remove []
+		case '{':
+			aux = a[1 : len(a)-1] // remove {}
+		default:
+			args = append(args, a)
 		}
 	}
-	return "Op" + arch.name + name
+
+	// Resolve the op.
+
+	// match reports whether x is a good op to select.
+	// If strict is true, rule generation might succeed.
+	// If strict is false, rule generation has failed,
+	// but we're trying to generate a useful error.
+	// Doing strict=true then strict=false allows
+	// precise op matching while retaining good error messages.
+	match := func(x opData, strict bool, archname string) bool {
+		if x.name != s[0] {
+			return false
+		}
+		if x.argLength != -1 && int(x.argLength) != len(args) {
+			if strict {
+				return false
+			} else {
+				log.Printf("%s: op %s (%s) should have %d args, has %d", loc, s[0], archname, op.argLength, len(args))
+			}
+		}
+		return true
+	}
+
+	for _, x := range genericOps {
+		if match(x, true, "generic") {
+			op = x
+			break
+		}
+	}
+	if arch.name != "generic" {
+		for _, x := range arch.ops {
+			if match(x, true, arch.name) {
+				if op.name != "" {
+					log.Fatalf("%s: matches for op %s found in both generic and %s", loc, op.name, arch.name)
+				}
+				op = x
+				oparch = arch.name
+				break
+			}
+		}
+	}
+
+	if op.name == "" {
+		// Failed to find the op.
+		// Run through everything again with strict=false
+		// to generate useful diagnosic messages before failing.
+		for _, x := range genericOps {
+			match(x, false, "generic")
+		}
+		for _, x := range arch.ops {
+			match(x, false, arch.name)
+		}
+		log.Fatalf("%s: unknown op %s", loc, s)
+	}
+
+	// Sanity check aux, auxint.
+	if auxint != "" {
+		switch op.aux {
+		case "Bool", "Int8", "Int16", "Int32", "Int64", "Int128", "Float32", "Float64", "SymOff", "SymValAndOff", "SymInt32":
+		default:
+			log.Fatalf("%s: op %s %s can't have auxint", loc, op.name, op.aux)
+		}
+	}
+	if aux != "" {
+		switch op.aux {
+		case "String", "Sym", "SymOff", "SymValAndOff", "SymInt32":
+		default:
+			log.Fatalf("%s: op %s %s can't have aux", loc, op.name, op.aux)
+		}
+	}
+
+	return
 }

 func blockName(name string, arch arch) string {
@@ -689,6 +683,13 @@ func blockName(name string, arch arch) string {

 // typeName returns the string to use to generate a type.
 func typeName(typ string) string {
+	if typ[0] == '(' {
+		ts := strings.Split(typ[1:len(typ)-1], ",")
+		if len(ts) != 2 {
+			panic("Tuple expect 2 arguments")
+		}
+		return "MakeTuple(" + typeName(ts[0]) + ", " + typeName(ts[1]) + ")"
+	}
 	switch typ {
 	case "Flags", "Mem", "Void", "Int128":
 		return "Type" + typ
--- a/src/cmd/compile/internal/ssa/html.go
+++ b/src/cmd/compile/internal/ssa/html.go
@@ -359,7 +359,7 @@ func (v *Value) LongHTML() string {
 	}
 	r := v.Block.Func.RegAlloc
 	if int(v.ID) < len(r) && r[v.ID] != nil {
-		s += " : " + r[v.ID].Name()
+		s += " : " + html.EscapeString(r[v.ID].Name())
 	}
 	s += "</span>"
 	return s
--- a/src/cmd/compile/internal/ssa/location.go
+++ b/src/cmd/compile/internal/ssa/location.go
@@ -36,3 +36,16 @@ func (s LocalSlot) Name() string {
 	}
 	return fmt.Sprintf("%s+%d[%s]", s.N, s.Off, s.Type)
 }
+
+type LocPair [2]Location
+
+func (t LocPair) Name() string {
+	n0, n1 := "nil", "nil"
+	if t[0] != nil {
+		n0 = t[0].Name()
+	}
+	if t[1] != nil {
+		n1 = t[1].Name()
+	}
+	return fmt.Sprintf("<%s,%s>", n0, n1)
+}
--- a/src/cmd/compile/internal/ssa/lower.go
+++ b/src/cmd/compile/internal/ssa/lower.go
@@ -21,10 +21,15 @@ func checkLower(f *Func) {
 				continue // lowered
 			}
 			switch v.Op {
-			case OpSP, OpSB, OpInitMem, OpArg, OpPhi, OpVarDef, OpVarKill, OpVarLive, OpKeepAlive:
+			case OpSP, OpSB, OpInitMem, OpArg, OpPhi, OpVarDef, OpVarKill, OpVarLive, OpKeepAlive, OpSelect0, OpSelect1:
 				continue // ok not to lower
+			case OpGetG:
+				if f.Config.hasGReg {
+					// has hardware g register, regalloc takes care of it
+					continue // ok not to lower
+				}
 			}
-			s := "not lowered: " + v.Op.String() + " " + v.Type.SimpleString()
+			s := "not lowered: " + v.String() + ", " + v.Op.String() + " " + v.Type.SimpleString()
 			for _, a := range v.Args {
 				s += " " + a.Type.SimpleString()
 			}
--- a/src/cmd/compile/internal/ssa/op.go
+++ b/src/cmd/compile/internal/ssa/op.go
@@ -26,7 +26,8 @@ type opInfo struct {
 	generic           bool // this is a generic (arch-independent) opcode
 	rematerializeable bool // this op is rematerializeable
 	commutative       bool // this operation is commutative (e.g. addition)
-	resultInArg0      bool // v and v.Args[0] must be allocated to the same register
+	resultInArg0      bool // last output of v and v.Args[0] must be allocated to the same register
+	clobberFlags      bool // this op clobbers flags register
 }

 type inputInfo struct {
@@ -34,10 +35,15 @@ type inputInfo struct {
 	regs regMask // allowed input registers
 }

+type outputInfo struct {
+	idx  int     // index in output tuple
+	regs regMask // allowed output registers
+}
+
 type regInfo struct {
 	inputs   []inputInfo // ordered in register allocation order
 	clobbers regMask
-	outputs  []regMask // NOTE: values can only have 1 output for now.
+	outputs  []outputInfo // ordered in register allocation order
 }

 type auxType int8
@@ -124,3 +130,31 @@ func (x ValAndOff) add(off int64) int64 {
 	}
 	return makeValAndOff(x.Val(), x.Off()+off)
 }
+
+// SizeAndAlign holds both the size and the alignment of a type,
+// used in Zero and Move ops.
+// The high 8 bits hold the alignment.
+// The low 56 bits hold the size.
+type SizeAndAlign int64
+
+func (x SizeAndAlign) Size() int64 {
+	return int64(x) & (1<<56 - 1)
+}
+func (x SizeAndAlign) Align() int64 {
+	return int64(uint64(x) >> 56)
+}
+func (x SizeAndAlign) Int64() int64 {
+	return int64(x)
+}
+func (x SizeAndAlign) String() string {
+	return fmt.Sprintf("size=%d,align=%d", x.Size(), x.Align())
+}
+func MakeSizeAndAlign(size, align int64) SizeAndAlign {
+	if size&^(1<<56-1) != 0 {
+		panic("size too big in SizeAndAlign")
+	}
+	if align >= 1<<8 {
+		panic("alignment too big in SizeAndAlign")
+	}
+	return SizeAndAlign(size | align<<56)
+}
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/opt.go
+++ b/src/cmd/compile/internal/ssa/opt.go
@@ -11,4 +11,7 @@ func opt(f *Func) {

 func dec(f *Func) {
 	applyRewrite(f, rewriteBlockdec, rewriteValuedec)
+	if f.Config.IntSize == 4 && f.Config.arch != "amd64p32" {
+		applyRewrite(f, rewriteBlockdec64, rewriteValuedec64)
+	}
 }
--- a/src/cmd/compile/internal/ssa/phiopt.go
+++ b/src/cmd/compile/internal/ssa/phiopt.go
@@ -57,7 +57,16 @@ func phiopt(f *Func) {
 		}

 		for _, v := range b.Values {
-			if v.Op != OpPhi || !v.Type.IsBoolean() {
+			if v.Op != OpPhi {
+				continue
+			}
+
+			// Look for conversions from bool to 0/1.
+			if v.Type.IsInteger() {
+				phioptint(v, b0, reverse)
+			}
+
+			if !v.Type.IsBoolean() {
 				continue
 			}

@@ -110,5 +119,55 @@ func phiopt(f *Func) {
 			}
 		}
 	}
-
+}
+
+func phioptint(v *Value, b0 *Block, reverse int) {
+	a0 := v.Args[0]
+	a1 := v.Args[1]
+	if a0.Op != a1.Op {
+		return
+	}
+
+	switch a0.Op {
+	case OpConst8, OpConst16, OpConst32, OpConst64:
+	default:
+		return
+	}
+
+	negate := false
+	switch {
+	case a0.AuxInt == 0 && a1.AuxInt == 1:
+		negate = true
+	case a0.AuxInt == 1 && a1.AuxInt == 0:
+	default:
+		return
+	}
+
+	if reverse == 1 {
+		negate = !negate
+	}
+
+	switch v.Type.Size() {
+	case 1:
+		v.reset(OpCopy)
+	case 2:
+		v.reset(OpZeroExt8to16)
+	case 4:
+		v.reset(OpZeroExt8to32)
+	case 8:
+		v.reset(OpZeroExt8to64)
+	default:
+		v.Fatalf("bad int size %d", v.Type.Size())
+	}
+
+	a := b0.Control
+	if negate {
+		a = v.Block.NewValue1(v.Line, OpNot, a.Type, a)
+	}
+	v.AddArg(a)
+
+	f := b0.Func
+	if f.pass.debug > 0 {
+		f.Config.Warnl(v.Block.Line, "converted OpPhi bool -> int%d", v.Type.Size()*8)
+	}
 }
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -206,6 +206,7 @@ type regAllocState struct {
 	numRegs     register
 	SPReg       register
 	SBReg       register
+	GReg        register
 	allocatable regMask

 	// for each block, its primary predecessor.
@@ -332,14 +333,14 @@ func (s *regAllocState) assignReg(r register, v *Value, c *Value) {
 	s.f.setHome(c, &s.registers[r])
 }

-// allocReg chooses a register for v from the set of registers in mask.
+// allocReg chooses a register from the set of registers in mask.
 // If there is no unused register, a Value will be kicked out of
 // a register to make room.
-func (s *regAllocState) allocReg(v *Value, mask regMask) register {
+func (s *regAllocState) allocReg(mask regMask, v *Value) register {
 	mask &= s.allocatable
 	mask &^= s.nospill
 	if mask == 0 {
-		s.f.Fatalf("no register available")
+		s.f.Fatalf("no register available for %s", v)
 	}

 	// Pick an unused register if one is available.
@@ -400,7 +401,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, line
 	}

 	// Allocate a register.
-	r := s.allocReg(v, mask)
+	r := s.allocReg(mask, v)

 	// Allocate v to the new register.
 	var c *Value
@@ -438,28 +439,76 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, line
 func (s *regAllocState) init(f *Func) {
 	s.f = f
 	s.registers = f.Config.registers
-	s.numRegs = register(len(s.registers))
-	if s.numRegs > noRegister || s.numRegs > register(unsafe.Sizeof(regMask(0))*8) {
-		panic("too many registers")
+	if nr := len(s.registers); nr == 0 || nr > int(noRegister) || nr > int(unsafe.Sizeof(regMask(0))*8) {
+		s.f.Fatalf("bad number of registers: %d", nr)
+	} else {
+		s.numRegs = register(nr)
 	}
+	// Locate SP, SB, and g registers.
+	s.SPReg = noRegister
+	s.SBReg = noRegister
+	s.GReg = noRegister
 	for r := register(0); r < s.numRegs; r++ {
-		if s.registers[r].Name() == "SP" {
+		switch s.registers[r].Name() {
+		case "SP":
 			s.SPReg = r
-		}
-		if s.registers[r].Name() == "SB" {
+		case "SB":
 			s.SBReg = r
+		case "g":
+			s.GReg = r
+		}
+	}
+	// Make sure we found all required registers.
+	switch noRegister {
+	case s.SPReg:
+		s.f.Fatalf("no SP register found")
+	case s.SBReg:
+		s.f.Fatalf("no SB register found")
+	case s.GReg:
+		if f.Config.hasGReg {
+			s.f.Fatalf("no g register found")
 		}
 	}

 	// Figure out which registers we're allowed to use.
-	s.allocatable = regMask(1)<<s.numRegs - 1
+	s.allocatable = s.f.Config.gpRegMask | s.f.Config.fpRegMask
 	s.allocatable &^= 1 << s.SPReg
 	s.allocatable &^= 1 << s.SBReg
-	if s.f.Config.ctxt.Framepointer_enabled {
-		s.allocatable &^= 1 << 5 // BP
+	if s.f.Config.hasGReg {
+		s.allocatable &^= 1 << s.GReg
+	}
+	if s.f.Config.ctxt.Framepointer_enabled && s.f.Config.FPReg >= 0 {
+		s.allocatable &^= 1 << uint(s.f.Config.FPReg)
 	}
 	if s.f.Config.ctxt.Flag_dynlink {
-		s.allocatable &^= 1 << 15 // R15
+		switch s.f.Config.arch {
+		case "amd64":
+			s.allocatable &^= 1 << 15 // R15
+		case "arm":
+			s.allocatable &^= 1 << 9 // R9
+		case "arm64":
+			// nothing to do?
+		case "386":
+			// nothing to do.
+			// Note that for Flag_shared (position independent code)
+			// we do need to be careful, but that carefulness is hidden
+			// in the rewrite rules so we always have a free register
+			// available for global load/stores. See gen/386.rules (search for Flag_shared).
+		default:
+			s.f.Config.fe.Unimplementedf(0, "arch %s not implemented", s.f.Config.arch)
+		}
+	}
+	if s.f.Config.nacl {
+		switch s.f.Config.arch {
+		case "arm":
+			s.allocatable &^= 1 << 9 // R9 is "thread pointer" on nacl/arm
+		case "amd64p32":
+			s.allocatable &^= 1 << 5  // BP - reserved for nacl
+			s.allocatable &^= 1 << 15 // R15 - reserved for nacl
+		}
+	}
+	if s.f.Config.use387 {
+		s.allocatable &^= 1 << 15 // X7 disallowed (one 387 register is used as scratch space during SSE->387 generation in ../x86/387.go)
 	}

 	s.regs = make([]regState, s.numRegs)
@@ -467,11 +516,13 @@ func (s *regAllocState) init(f *Func) {
 	s.orig = make([]*Value, f.NumValues())
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
-			if !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() {
+			if !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && !v.Type.IsTuple() {
 				s.values[v.ID].needReg = true
 				s.values[v.ID].rematerializeable = v.rematerializeable()
 				s.orig[v.ID] = v
 			}
+			// Note: needReg is false for values returning Tuple types.
+			// Instead, we mark the corresponding Selects as needReg.
 		}
 	}
 	s.computeLive()
@@ -564,9 +615,9 @@ func (s *regAllocState) setState(regs []endReg) {
 func (s *regAllocState) compatRegs(t Type) regMask {
 	var m regMask
 	if t.IsFloat() || t == TypeInt128 {
-		m = 0xffff << 16 // X0-X15
+		m = s.f.Config.fpRegMask
 	} else {
-		m = 0xffff << 0 // AX-R15
+		m = s.f.Config.gpRegMask
 	}
 	return m & s.allocatable
 }
@@ -786,6 +837,9 @@ func (s *regAllocState) regalloc(f *Func) {
 				if phiRegs[i] != noRegister {
 					continue
 				}
+				if s.f.Config.use387 && v.Type.IsFloat() {
+					continue // 387 can't handle floats in registers between blocks
+				}
 				m := s.compatRegs(v.Type) &^ phiUsed &^ s.used
 				if m != 0 {
 					r := pickReg(m)
@@ -915,6 +969,7 @@ func (s *regAllocState) regalloc(f *Func) {
 			if s.f.pass.debug > regDebug {
 				fmt.Printf("  processing %s\n", v.LongString())
 			}
+			regspec := opcodeTable[v.Op].reg
 			if v.Op == OpPhi {
 				f.Fatalf("phi %s not at start of block", v)
 			}
@@ -930,6 +985,28 @@ func (s *regAllocState) regalloc(f *Func) {
 				s.advanceUses(v)
 				continue
 			}
+			if v.Op == OpSelect0 || v.Op == OpSelect1 {
+				if s.values[v.ID].needReg {
+					var i = 0
+					if v.Op == OpSelect1 {
+						i = 1
+					}
+					s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocPair)[i].(*Register).Num), v, v)
+				}
+				b.Values = append(b.Values, v)
+				s.advanceUses(v)
+				goto issueSpill
+			}
+			if v.Op == OpGetG && s.f.Config.hasGReg {
+				// use hardware g register
+				if s.regs[s.GReg].v != nil {
+					s.freeReg(s.GReg) // kick out the old value
+				}
+				s.assignReg(s.GReg, v, v)
+				b.Values = append(b.Values, v)
+				s.advanceUses(v)
+				goto issueSpill
+			}
 			if v.Op == OpArg {
 				// Args are "pre-spilled" values. We don't allocate
 				// any register here. We just set up the spill pointer to
@@ -957,7 +1034,6 @@ func (s *regAllocState) regalloc(f *Func) {
 				b.Values = append(b.Values, v)
 				continue
 			}
-			regspec := opcodeTable[v.Op].reg
 			if len(regspec.inputs) == 0 && len(regspec.outputs) == 0 {
 				// No register allocation required (or none specified yet)
 				s.freeRegs(regspec.clobbers)
@@ -1002,10 +1078,6 @@ func (s *regAllocState) regalloc(f *Func) {
 			args = append(args[:0], v.Args...)
 			for _, i := range regspec.inputs {
 				mask := i.regs
-				if mask == flagRegMask {
-					// TODO: remove flag input from regspec.inputs.
-					continue
-				}
 				if mask&s.values[args[i.idx].ID].regs == 0 {
 					// Need a new register for the input.
 					mask &= s.allocatable
@@ -1115,49 +1187,73 @@ func (s *regAllocState) regalloc(f *Func) {
 			// Dump any registers which will be clobbered
 			s.freeRegs(regspec.clobbers)

-			// Pick register for output.
-			if s.values[v.ID].needReg {
-				mask := regspec.outputs[0] & s.allocatable
-				if opcodeTable[v.Op].resultInArg0 {
-					if !opcodeTable[v.Op].commutative {
-						// Output must use the same register as input 0.
-						r := register(s.f.getHome(args[0].ID).(*Register).Num)
-						mask = regMask(1) << r
-					} else {
-						// Output must use the same register as input 0 or 1.
-						r0 := register(s.f.getHome(args[0].ID).(*Register).Num)
-						r1 := register(s.f.getHome(args[1].ID).(*Register).Num)
-						// Check r0 and r1 for desired output register.
-						found := false
-						for _, r := range dinfo[idx].out {
-							if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 {
-								mask = regMask(1) << r
-								found = true
-								if r == r1 {
-									args[0], args[1] = args[1], args[0]
+			// Pick registers for outputs.
+			{
+				outRegs := [2]register{noRegister, noRegister}
+				var used regMask
+				for _, out := range regspec.outputs {
+					mask := out.regs & s.allocatable &^ used
+					if mask == 0 {
+						continue
+					}
+					if opcodeTable[v.Op].resultInArg0 && out.idx == len(regspec.outputs)-1 {
+						if !opcodeTable[v.Op].commutative {
+							// Output must use the same register as input 0.
+							r := register(s.f.getHome(args[0].ID).(*Register).Num)
+							mask = regMask(1) << r
+						} else {
+							// Output must use the same register as input 0 or 1.
+							r0 := register(s.f.getHome(args[0].ID).(*Register).Num)
+							r1 := register(s.f.getHome(args[1].ID).(*Register).Num)
+							// Check r0 and r1 for desired output register.
+							found := false
+							for _, r := range dinfo[idx].out {
+								if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 {
+									mask = regMask(1) << r
+									found = true
+									if r == r1 {
+										args[0], args[1] = args[1], args[0]
+									}
+									break
 								}
-								break
+							}
+							if !found {
+								// Neither are desired, pick r0.
+								mask = regMask(1) << r0
 							}
 						}
-						if !found {
-							// Neither are desired, pick r0.
-							mask = regMask(1) << r0
+					}
+					for _, r := range dinfo[idx].out {
+						if r != noRegister && (mask&^s.used)>>r&1 != 0 {
+							// Desired register is allowed and unused.
+							mask = regMask(1) << r
+							break
 						}
 					}
+					// Avoid registers we're saving for other values.
+					if mask&^desired.avoid != 0 {
+						mask &^= desired.avoid
+					}
+					r := s.allocReg(mask, v)
+					outRegs[out.idx] = r
+					used |= regMask(1) << r
 				}
-				for _, r := range dinfo[idx].out {
-					if r != noRegister && (mask&^s.used)>>r&1 != 0 {
-						// Desired register is allowed and unused.
-						mask = regMask(1) << r
-						break
+				// Record register choices
+				if v.Type.IsTuple() {
+					var outLocs LocPair
+					if r := outRegs[0]; r != noRegister {
+						outLocs[0] = &s.registers[r]
+					}
+					if r := outRegs[1]; r != noRegister {
+						outLocs[1] = &s.registers[r]
+					}
+					s.f.setHome(v, outLocs)
+					// Note that subsequent SelectX instructions will do the assignReg calls.
+				} else {
+					if r := outRegs[0]; r != noRegister {
+						s.assignReg(r, v, v)
 					}
 				}
-				// Avoid registers we're saving for other values.
-				if mask&^desired.avoid != 0 {
-					mask &^= desired.avoid
-				}
-				r := s.allocReg(v, mask)
-				s.assignReg(r, v, v)
 			}

 			// Issue the Value itself.
@@ -1176,6 +1272,7 @@ func (s *regAllocState) regalloc(f *Func) {
 			//     f()
 			// }
 			// It would be good to have both spill and restore inside the IF.
+		issueSpill:
 			if s.values[v.ID].needReg {
 				spill := b.NewValue1(v.Line, OpStoreReg, v.Type, v)
 				s.setOrig(spill, v)
@@ -1194,9 +1291,10 @@ func (s *regAllocState) regalloc(f *Func) {
 			if s.f.pass.debug > regDebug {
 				fmt.Printf("  processing control %s\n", v.LongString())
 			}
-			// TODO: regspec for block control values, instead of using
-			// register set from the control op's output.
-			s.allocValToReg(v, opcodeTable[v.Op].reg.outputs[0], false, b.Line)
+			// We assume that a control input can be passed in any
+			// type-compatible register. If this turns out not to be true,
+			// we'll need to introduce a regspec for a block's control value.
+			s.allocValToReg(v, s.compatRegs(v.Type), false, b.Line)
 			// Remove this use from the uses list.
 			vi := &s.values[v.ID]
 			u := vi.uses
@@ -1208,6 +1306,11 @@ func (s *regAllocState) regalloc(f *Func) {
 			s.freeUseRecords = u
 		}

+		// Spill any values that can't live across basic block boundaries.
+		if s.f.Config.use387 {
+			s.freeRegs(s.f.Config.fpRegMask)
+		}
+
 		// If we are approaching a merge point and we are the primary
 		// predecessor of it, find live values that we use soon after
 		// the merge point and promote them to registers now.
@@ -1231,6 +1334,9 @@ func (s *regAllocState) regalloc(f *Func) {
 					continue
 				}
 				v := s.orig[vid]
+				if s.f.Config.use387 && v.Type.IsFloat() {
+					continue // 387 can't handle floats in registers between blocks
+				}
 				m := s.compatRegs(v.Type) &^ s.used
 				if m&^desired.avoid != 0 {
 					m &^= desired.avoid
@@ -1769,6 +1875,9 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value) bool {
 			(*splice).Uses--
 			*splice = occupant.c
 			occupant.c.Uses++
+			if occupant.c.Op == OpStoreReg {
+				e.s.lateSpillUse(vid)
+			}
 		}
 		// Note: if splice==nil then c will appear dead. This is
 		// non-SSA formed code, so be careful after this pass not to run
@@ -2010,6 +2119,8 @@ func (e *edgeState) findRegFor(typ Type) Location {
 	return nil
 }

+// rematerializeable reports whether the register allocator should recompute
+// a value instead of spilling/restoring it.
 func (v *Value) rematerializeable() bool {
 	if !opcodeTable[v.Op].rematerializeable {
 		return false
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -205,6 +205,11 @@ func is32Bit(n int64) bool {
 	return n == int64(int32(n))
 }

+// is16Bit reports whether n can be represented as a signed 16 bit integer.
+func is16Bit(n int64) bool {
+	return n == int64(int16(n))
+}
+
 // b2i translates a boolean value to 0 or 1 for assigning to auxInt.
 func b2i(b bool) int64 {
 	if b {
@@ -254,6 +259,19 @@ func isSamePtr(p1, p2 *Value) bool {
 	return false
 }

+// moveSize returns the number of bytes an aligned MOV instruction moves
+func moveSize(align int64, c *Config) int64 {
+	switch {
+	case align%8 == 0 && c.IntSize == 8:
+		return 8
+	case align%4 == 0:
+		return 4
+	case align%2 == 0:
+		return 2
+	}
+	return 1
+}
+
 // mergePoint finds a block among a's blocks which dominates b and is itself
 // dominated by all of a's blocks. Returns nil if it can't find one.
 // Might return nil even if one does exist.
--- a/src/cmd/compile/internal/ssa/rewrite386.go
+++ b/src/cmd/compile/internal/ssa/rewrite386.go
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
--- a/src/cmd/compile/internal/ssa/rewriteARM.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM.go
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
--- a/src/cmd/compile/internal/ssa/rewritedec64.go
+++ b/src/cmd/compile/internal/ssa/rewritedec64.go
--- a/Show More
+++ b/Show More