misc/nacl: include parser.go for cmd/compile/internal/syntax tests

Fix suggested by Minux. Change-Id: Ia7aa8ccccc16535af4ec3ad23830ef0aa5d776ac Reviewed-on: https://go-review.googlesource.com/27193 Run-TryBot: Matthew Dempsky <mdempsky@google.com> Reviewed-by: Minux Ma <minux@golang.org>
cmd/compile/internal/syntax: match old parser errors and line numbers
2026-02-01 08:32:04 +03:00 · 2016-08-16 22:42:13 +00:00 · 2016-08-16 14:32:09 -07:00 · 2016-08-16 14:32:09 -07:00 · 2016-08-16 14:32:09 -07:00 · 2016-08-16 14:32:09 -07:00
470 changed files with 104322 additions and 11822 deletions
--- a/20
+++ b/20
@@ -105,6 +105,7 @@ Audrey Lim <audreylh@gmail.com>
 Augusto Roman <aroman@gmail.com>
 Aulus Egnatius Varialus <varialus@gmail.com>
 awaw fumin <awawfumin@gmail.com>
+Ayanamist Yang <ayanamist@gmail.com>
 Aymerick Jéhanne <aymerick@jehanne.org>
 Ben Burkert <ben@benburkert.com>
 Ben Olive <sionide21@gmail.com>
@@ -147,6 +148,7 @@ Chris Jones <chris@cjones.org>
 Chris Kastorff <encryptio@gmail.com>
 Chris Lennert <calennert@gmail.com>
 Chris McGee <sirnewton_01@yahoo.ca> <newton688@gmail.com>
+Christian Couder <chriscool@tuxfamily.org>
 Christian Himpel <chressie@googlemail.com>
 Christine Hansmann <chhansmann@gmail.com>
 Christoffer Buchholz <christoffer.buchholz@gmail.com>
@@ -237,8 +239,10 @@ Eivind Uggedal <eivind@uggedal.com>
 Elias Naur <elias.naur@gmail.com>
 Emil Hessman <c.emil.hessman@gmail.com> <emil@hessman.se>
 Emmanuel Odeke <emm.odeke@gmail.com> <odeke@ualberta.ca>
+Empirical Interfaces Inc.
 Eoghan Sherry <ejsherry@gmail.com>
 Eric Clark <zerohp@gmail.com>
+Eric Engestrom <eric@engestrom.ch>
 Eric Lagergren <ericscottlagergren@gmail.com>
 Eric Milliken <emilliken@gmail.com>
 Eric Roshan-Eisner <eric.d.eisner@gmail.com>
@@ -258,6 +262,7 @@ Fastly, Inc.
 Fatih Arslan <fatih@arslan.io>
 Fazlul Shahriar <fshahriar@gmail.com>
 Felix Geisendörfer <haimuiba@gmail.com>
+Filippo Valsorda <hi@filippo.io>
 Firmansyah Adiputra <frm.adiputra@gmail.com>
 Florian Uekermann <florian@uekermann-online.de>
 Florian Weimer <fw@deneb.enyo.de>
@@ -290,6 +295,8 @@ Guobiao Mei <meiguobiao@gmail.com>
 Gustav Paul <gustav.paul@gmail.com>
 Gustavo Niemeyer <gustavo@niemeyer.net>
 Gwenael Treguier <gwenn.kahz@gmail.com>
+Gyu-Ho Lee <gyuhox@gmail.com>
+H. İbrahim Güngör <igungor@gmail.com>
 Hajime Hoshi <hajimehoshi@gmail.com>
 Hari haran <hariharan.uno@gmail.com>
 Hariharan Srinath <srinathh@gmail.com>
@@ -321,6 +328,7 @@ Intel Corporation
 Irieda Noboru <irieda@gmail.com>
 Isaac Wagner <ibw@isaacwagner.me>
 Ivan Ukhov <ivan.ukhov@gmail.com>
+Jacob Hoffman-Andrews <github@hoffman-andrews.com>
 Jae Kwon <jae@tendermint.com>
 Jakob Borg <jakob@nym.se>
 Jakub Ryszard Czarnowicz <j.czarnowicz@gmail.com>
@@ -342,6 +350,7 @@ Jan Newmarch <jan.newmarch@gmail.com>
 Jan Ziak <0xe2.0x9a.0x9b@gmail.com>
 Jani Monoses <jani.monoses@ubuntu.com>
 Jaroslavas Počepko <jp@webmaster.ms>
+Jason Barnett <jason.w.barnett@gmail.com>
 Jason Del Ponte <delpontej@gmail.com>
 Jason Travis <infomaniac7@gmail.com>
 Jay Weisskopf <jay@jayschwa.net>
@@ -359,6 +368,7 @@ Jingcheng Zhang <diogin@gmail.com>
 Jingguo Yao <yaojingguo@gmail.com>
 Jiong Du <londevil@gmail.com>
 Joakim Sernbrant <serbaut@gmail.com>
+Joe Farrell <joe2farrell@gmail.com>
 Joe Harrison <joehazzers@gmail.com>
 Joe Henke <joed.henke@gmail.com>
 Joe Poirier <jdpoirier@gmail.com>
@@ -392,6 +402,7 @@ Joshua Chase <jcjoshuachase@gmail.com>
 Jostein Stuhaug <js@solidsystem.no>
 JT Olds <jtolds@xnet5.com>
 Jukka-Pekka Kekkonen <karatepekka@gmail.com>
+Julian Kornberger <jk+github@digineo.de>
 Julian Phillips <julian@quantumfyre.co.uk>
 Julien Schmidt <google@julienschmidt.com>
 Justin Nuß <nuss.justin@gmail.com>
@@ -502,6 +513,7 @@ Michael Vetter <g.bluehut@gmail.com>
 Michal Bohuslávek <mbohuslavek@gmail.com>
 Michał Derkacz <ziutek@lnet.pl>
 Miek Gieben <miek@miek.nl>
+Miguel Mendez <stxmendez@gmail.com>
 Mihai Borobocea <MihaiBorobocea@gmail.com>
 Mikael Tillenius <mikti42@gmail.com>
 Mike Andrews <mra@xoba.com>
@@ -530,6 +542,7 @@ Netflix, Inc.
 Nevins Bartolomeo <nevins.bartolomeo@gmail.com>
 ngmoco, LLC
 Niall Sheridan <nsheridan@gmail.com>
+Nic Day <nic.day@me.com>
 Nicholas Katsaros <nick@nickkatsaros.com>
 Nicholas Presta <nick@nickpresta.ca> <nick1presta@gmail.com>
 Nicholas Sullivan <nicholas.sullivan@gmail.com>
@@ -575,6 +588,7 @@ Paul Rosania <paul.rosania@gmail.com>
 Paul Sbarra <Sbarra.Paul@gmail.com>
 Paul Smith <paulsmith@pobox.com> <paulsmith@gmail.com>
 Paul van Brouwershaven <paul@vanbrouwershaven.com>
+Paulo Casaretto <pcasaretto@gmail.com>
 Pavel Paulau <pavel.paulau@gmail.com>
 Pavel Zinovkin <pavel.zinovkin@gmail.com>
 Pawel Knap <pawelknap88@gmail.com>
@@ -591,6 +605,7 @@ Péter Szilágyi <peterke@gmail.com>
 Peter Waldschmidt <peter@waldschmidt.com>
 Peter Waller <peter.waller@gmail.com>
 Peter Williams <pwil3058@gmail.com>
+Philip Børgesen <philip.borgesen@gmail.com>
 Philip Hofer <phofer@umich.edu>
 Philip K. Warren <pkwarren@gmail.com>
 Pierre Durand <pierredurand@gmail.com>
@@ -599,6 +614,7 @@ Pieter Droogendijk <pieter@binky.org.uk>
 Pietro Gagliardi <pietro10@mac.com>
 Prashant Varanasi <prashant@prashantv.com>
 Preetam Jinka <pj@preet.am>
+Quan Tran <qeed.quan@gmail.com>
 Quan Yong Zhai <qyzhai@gmail.com>
 Quentin Perez <qperez@ocs.online.net>
 Quoc-Viet Nguyen <afelion@gmail.com>
@@ -644,6 +660,7 @@ Salmān Aljammāz <s@0x65.net>
 Sam Hug <samuel.b.hug@gmail.com>
 Sam Whited <sam@samwhited.com>
 Sanjay Menakuru <balasanjay@gmail.com>
+Sasha Sobol <sasha@scaledinference.com>
 Scott Barron <scott.barron@github.com>
 Scott Bell <scott@sctsm.com>
 Scott Ferguson <scottwferg@gmail.com>
@@ -654,6 +671,7 @@ Sergei Skorobogatov <skorobo@rambler.ru>
 Sergey 'SnakE'  Gromov <snake.scaly@gmail.com>
 Sergio Luis O. B. Correia <sergio@correia.cc>
 Seth Hoenig <seth.a.hoenig@gmail.com>
+Seth Vargo <sethvargo@gmail.com>
 Shahar Kohanim <skohanim@gmail.com>
 Shane Hansen <shanemhansen@gmail.com>
 Shaozhen Ding <dsz0111@gmail.com>
@@ -663,6 +681,7 @@ Shinji Tanaka <shinji.tanaka@gmail.com>
 Shivakumar GN <shivakumar.gn@gmail.com>
 Silvan Jegen <s.jegen@gmail.com>
 Simon Jefford <simon.jefford@gmail.com>
+Simon Thulbourn <simon+github@thulbourn.com>
 Simon Whitehead <chemnova@gmail.com>
 Sokolov Yura <funny.falcon@gmail.com>
 Spencer Nelson <s@spenczar.com>
@@ -734,6 +753,7 @@ Wei Guangjing <vcc.163@gmail.com>
 Willem van der Schyff <willemvds@gmail.com>
 William Josephson <wjosephson@gmail.com>
 William Orr <will@worrbase.com> <ay1244@gmail.com>
+Wisdom Omuya <deafgoat@gmail.com>
 Xia Bin <snyh@snyh.org>
 Xing Xing <mikespook@gmail.com>
 Xudong Zhang <felixmelon@gmail.com>
--- a/28
+++ b/28
@@ -37,6 +37,7 @@ Aaron France <aaron.l.france@gmail.com>
 Aaron Jacobs <jacobsa@google.com>
 Aaron Kemp <kemp.aaron@gmail.com>
 Aaron Torres <tcboox@gmail.com>
+Aaron Zinman <aaron@azinman.com>
 Abe Haskins <abeisgreat@abeisgreat.com>
 Abhinav Gupta <abhinav.g90@gmail.com>
 Adam Langley <agl@golang.org>
@@ -141,6 +142,7 @@ Augusto Roman <aroman@gmail.com>
 Aulus Egnatius Varialus <varialus@gmail.com>
 Austin Clements <austin@google.com> <aclements@csail.mit.edu>
 awaw fumin <awawfumin@gmail.com>
+Ayanamist Yang <ayanamist@gmail.com>
 Aymerick Jéhanne <aymerick@jehanne.org>
 Balazs Lecz <leczb@google.com>
 Ben Burkert <ben@benburkert.com>
@@ -155,6 +157,7 @@ Benny Siegert <bsiegert@gmail.com>
 Benoit Sigoure <tsunanet@gmail.com>
 Berengar Lehr <Berengar.Lehr@gmx.de>
 Bill Neubauer <wcn@golang.org> <wcn@google.com> <bill.neubauer@gmail.com>
+Bill O'Farrell <billo@ca.ibm.com>
 Bill Thiede <couchmoney@gmail.com>
 Billie Harold Cleek <bhcleek@gmail.com>
 Bjorn Tillenius <bjorn@tillenius.me>
@@ -212,6 +215,7 @@ Chris Lennert <calennert@gmail.com>
 Chris Manghane <cmang@golang.org>
 Chris McGee <sirnewton_01@yahoo.ca> <newton688@gmail.com>
 Chris Zou <chriszou@ca.ibm.com>
+Christian Couder <chriscool@tuxfamily.org>
 Christian Himpel <chressie@googlemail.com> <chressie@gmail.com>
 Christine Hansmann <chhansmann@gmail.com>
 Christoffer Buchholz <christoffer.buchholz@gmail.com>
@@ -330,6 +334,7 @@ Emil Hessman <c.emil.hessman@gmail.com> <emil@hessman.se>
 Emmanuel Odeke <emm.odeke@gmail.com> <odeke@ualberta.ca>
 Eoghan Sherry <ejsherry@gmail.com>
 Eric Clark <zerohp@gmail.com>
+Eric Engestrom <eric@engestrom.ch>
 Eric Garrido <ekg@google.com>
 Eric Koleda <ekoleda+devrel@google.com>
 Eric Lagergren <ericscottlagergren@gmail.com>
@@ -356,6 +361,7 @@ Fatih Arslan <fatih@arslan.io>
 Fazlul Shahriar <fshahriar@gmail.com>
 Federico Simoncelli <fsimonce@redhat.com>
 Felix Geisendörfer <haimuiba@gmail.com>
+Filippo Valsorda <hi@filippo.io>
 Firmansyah Adiputra <frm.adiputra@gmail.com>
 Florian Uekermann <florian@uekermann-online.de> <f1@uekermann-online.de>
 Florian Weimer <fw@deneb.enyo.de>
@@ -397,6 +403,8 @@ Gustav Paul <gustav.paul@gmail.com>
 Gustavo Franco <gustavorfranco@gmail.com>
 Gustavo Niemeyer <gustavo@niemeyer.net> <n13m3y3r@gmail.com>
 Gwenael Treguier <gwenn.kahz@gmail.com>
+Gyu-Ho Lee <gyuhox@gmail.com>
+H. İbrahim Güngör <igungor@gmail.com>
 Hajime Hoshi <hajimehoshi@gmail.com>
 Hallgrimur Gunnarsson <halg@google.com>
 Han-Wen Nienhuys <hanwen@google.com>
@@ -435,6 +443,7 @@ Ivan Ukhov <ivan.ukhov@gmail.com>
 Jaana Burcu Dogan <jbd@google.com> <jbd@golang.org> <burcujdogan@gmail.com>
 Jacob Baskin <jbaskin@google.com>
 Jacob H. Haven <jacob@cloudflare.com>
+Jacob Hoffman-Andrews <github@hoffman-andrews.com>
 Jae Kwon <jae@tendermint.com>
 Jakob Borg <jakob@nym.se>
 Jakub Čajka <jcajka@redhat.com>
@@ -465,6 +474,7 @@ Jan Newmarch <jan.newmarch@gmail.com>
 Jan Ziak <0xe2.0x9a.0x9b@gmail.com>
 Jani Monoses <jani.monoses@ubuntu.com> <jani.monoses@gmail.com>
 Jaroslavas Počepko <jp@webmaster.ms>
+Jason Barnett <jason.w.barnett@gmail.com>
 Jason Del Ponte <delpontej@gmail.com>
 Jason Hall <jasonhall@google.com>
 Jason Travis <infomaniac7@gmail.com>
@@ -489,6 +499,7 @@ Jingcheng Zhang <diogin@gmail.com>
 Jingguo Yao <yaojingguo@gmail.com>
 Jiong Du <londevil@gmail.com>
 Joakim Sernbrant <serbaut@gmail.com>
+Joe Farrell <joe2farrell@gmail.com>
 Joe Harrison <joehazzers@gmail.com>
 Joe Henke <joed.henke@gmail.com>
 Joe Poirier <jdpoirier@gmail.com>
@@ -539,6 +550,7 @@ JP Sugarbroad <jpsugar@google.com>
 JT Olds <jtolds@xnet5.com>
 Jukka-Pekka Kekkonen <karatepekka@gmail.com>
 Julia Hansbrough <flowerhack@google.com>
+Julian Kornberger <jk+github@digineo.de>
 Julian Phillips <julian@quantumfyre.co.uk>
 Julien Schmidt <google@julienschmidt.com>
 Jungho Ahn <jhahn@google.com>
@@ -548,6 +560,7 @@ Kai Backman <kaib@golang.org>
 Kamal Aboul-Hosn <aboulhosn@google.com>
 Kamil Kisiel <kamil@kamilkisiel.net> <kamil.kisiel@gmail.com>
 Kang Hu <hukangustc@gmail.com>
+Karan Dhiman <karandhi@ca.ibm.com>
 Kato Kazuyoshi <kato.kazuyoshi@gmail.com>
 Katrina Owen <katrina.owen@gmail.com>
 Kay Zhu <kayzhu@google.com>
@@ -575,6 +588,7 @@ Kim Shrier <kshrier@racktopsystems.com>
 Kirklin McDonald <kirklin.mcdonald@gmail.com>
 Klaus Post <klauspost@gmail.com>
 Konstantin Shaposhnikov <k.shaposhnikov@gmail.com>
+Kris Rousey <krousey@google.com>
 Kristopher Watts <traetox@gmail.com>
 Kun Li <likunarmstrong@gmail.com>
 Kyle Consalus <consalus@gmail.com>
@@ -686,6 +700,7 @@ Michał Derkacz <ziutek@lnet.pl>
 Michalis Kargakis <michaliskargakis@gmail.com>
 Michel Lespinasse <walken@google.com>
 Miek Gieben <miek@miek.nl> <remigius.gieben@gmail.com>
+Miguel Mendez <stxmendez@gmail.com>
 Mihai Borobocea <MihaiBorobocea@gmail.com>
 Mikael Tillenius <mikti42@gmail.com>
 Mike Andrews <mra@xoba.com>
@@ -716,6 +731,7 @@ Nathan(yinian) Hu <nathanhu@google.com>
 Neelesh Chandola <neelesh.c98@gmail.com>
 Nevins Bartolomeo <nevins.bartolomeo@gmail.com>
 Niall Sheridan <nsheridan@gmail.com>
+Nic Day <nic.day@me.com>
 Nicholas Katsaros <nick@nickkatsaros.com>
 Nicholas Presta <nick@nickpresta.ca> <nick1presta@gmail.com>
 Nicholas Sullivan <nicholas.sullivan@gmail.com>
@@ -769,6 +785,7 @@ Paul Sbarra <Sbarra.Paul@gmail.com>
 Paul Smith <paulsmith@pobox.com> <paulsmith@gmail.com>
 Paul van Brouwershaven <paul@vanbrouwershaven.com>
 Paul Wankadia <junyer@google.com>
+Paulo Casaretto <pcasaretto@gmail.com>
 Pavel Paulau <pavel.paulau@gmail.com>
 Pavel Zinovkin <pavel.zinovkin@gmail.com>
 Pawel Knap <pawelknap88@gmail.com>
@@ -793,6 +810,7 @@ Peter Waller <peter.waller@gmail.com>
 Peter Weinberger <pjw@golang.org>
 Peter Williams <pwil3058@gmail.com>
 Phil Pennock <pdp@golang.org>
+Philip Børgesen <philip.borgesen@gmail.com>
 Philip Hofer <phofer@umich.edu>
 Philip K. Warren <pkwarren@gmail.com>
 Pierre Durand <pierredurand@gmail.com>
@@ -801,6 +819,7 @@ Pieter Droogendijk <pieter@binky.org.uk>
 Pietro Gagliardi <pietro10@mac.com>
 Prashant Varanasi <prashant@prashantv.com>
 Preetam Jinka <pj@preet.am>
+Quan Tran <qeed.quan@gmail.com>
 Quan Yong Zhai <qyzhai@gmail.com>
 Quentin Perez <qperez@ocs.online.net>
 Quentin Smith <quentin@golang.org>
@@ -857,7 +876,9 @@ Ryan Lower <rpjlower@gmail.com>
 Ryan Seys <ryan@ryanseys.com>
 Ryan Slade <ryanslade@gmail.com>
 S.Çağlar Onur <caglar@10ur.org>
+Sai Cheemalapati <saicheems@google.com>
 Salmān Aljammāz <s@0x65.net>
+Sam Ding <samding@ca.ibm.com>
 Sam Hug <samuel.b.hug@gmail.com>
 Sam Thorogood <thorogood@google.com> <sam.thorogood@gmail.com>
 Sam Whited <sam@samwhited.com>
@@ -865,6 +886,7 @@ Sameer Ajmani <sameer@golang.org> <ajmani@gmail.com>
 Sami Commerot <samic@google.com>
 Sanjay Menakuru <balasanjay@gmail.com>
 Sasha Lionheart <lionhearts@google.com>
+Sasha Sobol <sasha@scaledinference.com>
 Scott Barron <scott.barron@github.com>
 Scott Bell <scott@sctsm.com>
 Scott Ferguson <scottwferg@gmail.com>
@@ -882,6 +904,7 @@ Sergey 'SnakE' Gromov <snake.scaly@gmail.com>
 Sergey Arseev <sergey.arseev@intel.com>
 Sergio Luis O. B. Correia <sergio@correia.cc>
 Seth Hoenig <seth.a.hoenig@gmail.com>
+Seth Vargo <sethvargo@gmail.com>
 Shahar Kohanim <skohanim@gmail.com>
 Shane Hansen <shanemhansen@gmail.com>
 Shaozhen Ding <dsz0111@gmail.com>
@@ -894,6 +917,7 @@ Shivakumar GN <shivakumar.gn@gmail.com>
 Shun Fan <sfan@google.com>
 Silvan Jegen <s.jegen@gmail.com>
 Simon Jefford <simon.jefford@gmail.com>
+Simon Thulbourn <simon+github@thulbourn.com>
 Simon Whitehead <chemnova@gmail.com>
 Sokolov Yura <funny.falcon@gmail.com>
 Spencer Nelson <s@spenczar.com>
@@ -957,6 +981,7 @@ Totoro W <tw19881113@gmail.com>
 Travis Cline <travis.cline@gmail.com>
 Trevor Strohman <trevor.strohman@gmail.com>
 Trey Tacon <ttacon@gmail.com>
+Tristan Amini <tamini01@ca.ibm.com>
 Tudor Golubenco <tudor.g@gmail.com>
 Tyler Bunnell <tylerbunnell@gmail.com>
 Tyler Treat <ttreat31@gmail.com>
@@ -986,6 +1011,7 @@ Willem van der Schyff <willemvds@gmail.com>
 William Chan <willchan@chromium.org>
 William Josephson <wjosephson@gmail.com>
 William Orr <will@worrbase.com> <ay1244@gmail.com>
+Wisdom Omuya <deafgoat@gmail.com>
 Xia Bin <snyh@snyh.org>
 Xing Xing <mikespook@gmail.com>
 Xudong Zhang <felixmelon@gmail.com>
@@ -999,6 +1025,8 @@ Yissakhar Z. Beck <yissakhar.beck@gmail.com>
 Yo-An Lin <yoanlin93@gmail.com>
 Yongjian Xu <i3dmaster@gmail.com>
 Yoshiyuki Kanno <nekotaroh@gmail.com> <yoshiyuki.kanno@stoic.co.jp>
+Yu Heng Zhang <annita.zhang@cn.ibm.com>
+Yu Xuan Zhang <zyxsh@cn.ibm.com>
 Yuki Yugui Sonoda <yugui@google.com>
 Yusuke Kagiwada <block.rxckin.beats@gmail.com>
 Yuusei Kuwana <kuwana@kumama.org>
--- a/api/except.txt
+++ b/api/except.txt
@@ -329,3 +329,4 @@ pkg syscall (netbsd-arm-cgo), const SizeofIfData = 132
 pkg syscall (netbsd-arm-cgo), type IfMsghdr struct, Pad_cgo_1 [4]uint8
 pkg unicode, const Version = "6.3.0"
 pkg unicode, const Version = "7.0.0"
+pkg unicode, const Version = "8.0.0"
--- a/api/go1.7.txt
+++ b/api/go1.7.txt
@@ -274,3 +274,12 @@ pkg syscall (linux-arm-cgo), type SysProcAttr struct, Unshareflags uintptr
 pkg testing, method (*B) Run(string, func(*B)) bool
 pkg testing, method (*T) Run(string, func(*T)) bool
 pkg testing, type InternalExample struct, Unordered bool
+pkg unicode, const Version = "9.0.0"
+pkg unicode, var Adlam *RangeTable
+pkg unicode, var Bhaiksuki *RangeTable
+pkg unicode, var Marchen *RangeTable
+pkg unicode, var Newa *RangeTable
+pkg unicode, var Osage *RangeTable
+pkg unicode, var Prepended_Concatenation_Mark *RangeTable
+pkg unicode, var Sentence_Terminal *RangeTable
+pkg unicode, var Tangut *RangeTable
--- a/doc/asm.html
+++ b/doc/asm.html
@@ -780,6 +780,64 @@ mode as on the x86, but the only scale allowed is <code>1</code>.

 </ul>

+<h3 id="s390x">IBM z/Architecture, a.k.a. s390x</h3>
+
+<p>
+The registers <code>R10</code> and <code>R11</code> are reserved.
+The assembler uses them to hold temporary values when assembling some instructions.
+</p>
+
+<p>
+<code>R13</code> points to the <code>g</code> (goroutine) structure.
+This register must be referred to as <code>g</code>; the name <code>R13</code> is not recognized.
+</p>
+
+<p>
+<code>R15</code> points to the stack frame and should typically only be accessed using the
+virtual registers <code>SP</code> and <code>FP</code>.
+</p>
+
+<p>
+Load- and store-multiple instructions operate on a range of registers.
+The range of registers is specified by a start register and an end register.
+For example, <code>LMG</code> <code>(R9),</code> <code>R5,</code> <code>R7</code> would load
+<code>R5</code>, <code>R6</code> and <code>R7</code> with the 64-bit values at
+<code>0(R9)</code>, <code>8(R9)</code> and <code>16(R9)</code> respectively.
+</p>
+
+<p>
+Storage-and-storage instructions such as <code>MVC</code> and <code>XC</code> are written
+with the length as the first argument.
+For example, <code>XC</code> <code>$8,</code> <code>(R9),</code> <code>(R9)</code> would clear
+eight bytes at the address specified in <code>R9</code>.
+</p>
+
+<p>
+If a vector instruction takes a length or an index as an argument then it will be the
+first argument.
+For example, <code>VLEIF</code> <code>$1,</code> <code>$16,</code> <code>V2</code> will load
+the value sixteen into index one of <code>V2</code>.
+Care should be taken when using vector instructions to ensure that they are available at
+runtime.
+To use vector instructions a machine must have both the vector facility (bit 129 in the
+facility list) and kernel support.
+Without kernel support a vector instruction will have no effect (it will be equivalent
+to a <code>NOP</code> instruction).
+</p>
+
+<p>
+Addressing modes:
+</p>
+
+<ul>
+
+<li>
+<code>(R5)(R6*1)</code>: The location at <code>R5</code> plus <code>R6</code>.
+It is a scaled mode as on the x86, but the only scale allowed is <code>1</code>.
+</li>
+
+</ul>
+
 <h3 id="unsupported_opcodes">Unsupported opcodes</h3>

 <p>
--- a/doc/contrib.html
+++ b/doc/contrib.html
@@ -34,6 +34,7 @@ We encourage all Go users to subscribe to
 <p>A <a href="/doc/devel/release.html">summary</a> of the changes between Go releases. Notes for the major releases:</p>

 <ul>
+	<li><a href="/doc/go1.7">Go 1.7</a> <small>(August 2016)</small></li>
 	<li><a href="/doc/go1.6">Go 1.6</a> <small>(February 2016)</small></li>
 	<li><a href="/doc/go1.5">Go 1.5</a> <small>(August 2015)</small></li>
 	<li><a href="/doc/go1.4">Go 1.4</a> <small>(December 2014)</small></li>
--- a/doc/devel/release.html
+++ b/doc/devel/release.html
@@ -30,6 +30,13 @@ to fix critical security problems in both Go 1.4 and Go 1.5 as they arise.
 See the <a href="/security">security policy</a> for more details.
 </p>

+<h2 id="go1.7">go1.7 (released 2016/08/15)</h2>
+
+<p>
+Go 1.7 is a major release of Go.
+Read the <a href="/doc/go1.7">Go 1.7 Release Notes</a> for more information.
+</p>
+
 <h2 id="go1.6">go1.6 (released 2016/02/17)</h2>

 <p>
@@ -53,6 +60,14 @@ See the <a href="https://github.com/golang/go/issues?q=milestone%3AGo1.6.2">Go
 1.6.2 milestone</a> on our issue tracker for details.
 </p>

+<p>
+go1.6.3 (released 2016/07/17) includes security fixes to the
+<code>net/http/cgi</code> package and <code>net/http</code> package when used in
+a CGI environment. This release also adds support for macOS Sierra.
+See the <a href="https://github.com/golang/go/issues?q=milestone%3AGo1.6.3">Go
+1.6.3 milestone</a> on our issue tracker for details.
+</p>
+
 <h2 id="go1.5">go1.5 (released 2015/08/19)</h2>

 <p>
--- a/doc/effective_go.html
+++ b/doc/effective_go.html
@@ -2238,13 +2238,12 @@ if str, ok := value.(string); ok {

 <h3 id="generality">Generality</h3>
 <p>
-If a type exists only to implement an interface
-and has no exported methods beyond that interface,
-there is no need to export the type itself.
-Exporting just the interface makes it clear that
-it's the behavior that matters, not the implementation,
-and that other implementations with different properties
-can mirror the behavior of the original type.
+If a type exists only to implement an interface and will
+never have exported methods beyond that interface, there is
+no need to export the type itself.
+Exporting just the interface makes it clear the value has no
+interesting behavior beyond what is described in the
+interface.
 It also avoids the need to repeat the documentation
 on every instance of a common method.
 </p>
@@ -3665,4 +3664,3 @@ var _ image.Color = Black
 var _ image.Image = Black
 </pre>
 -->
-
--- a/doc/go1.7.html
+++ b/doc/go1.7.html
@@ -1,5 +1,5 @@
 <!--{
-	"Title": "Go 1.7 Release Notes DRAFT",
+	"Title": "Go 1.7 Release Notes",
 	"Path":  "/doc/go1.7",
 	"Template": true
 }-->
@@ -25,15 +25,6 @@ Do not send CLs removing the interior tags from such phrases.
 ul li { margin: 0.5em 0; }
 </style>

-<p>
-<!-- TODO: REMOVE THIS COMMENT -->
-<!-- TODO: Also remove "DRAFT" in the "Title" at the top of this file. -->
-<i>NOTE: This is a DRAFT of the Go 1.7 release notes, prepared for the Go 1.7 beta.
-Go 1.7 has NOT yet been released.
-By our regular schedule, it is expected some time in August 2016.
-</i>
-</p>
-
 <h2 id="introduction">Introduction to Go 1.7</h2>

 <p>
@@ -74,6 +65,13 @@ This change has no effect on the correctness of existing programs.

 <h2 id="ports">Ports</h2>

+<p>
+Go 1.7 adds support for macOS 10.12 Sierra.
+This support was backported to Go 1.6.3.
+Binaries built with versions of Go before 1.6.3 will not work
+correctly on Sierra.
+</p>
+
 <p>
 Go 1.7 adds an experimental port to <a href="https://en.wikipedia.org/wiki/Linux_on_z_Systems">Linux on z Systems</a> (<code>linux/s390x</code>)
 and the beginning of a port to Plan 9 on ARM (<code>plan9/arm</code>).
@@ -85,14 +83,27 @@ added in Go 1.6 now have full support for cgo and external linking.
 </p>

 <p>
-The experimental port to Linux on big-endian 64-bit PowerPC (<code>linux/ppc64</code>)
+The experimental port to Linux on little-endian 64-bit PowerPC (<code>linux/ppc64le</code>)
 now requires the POWER8 architecture or later.
+Big-endian 64-bit PowerPC (<code>linux/ppc64</code>) only requires the
+POWER5 architecture.
 </p>

 <p>
 The OpenBSD port now requires OpenBSD 5.6 or later, for access to the <a href="http://man.openbsd.org/getentropy.2"><i>getentropy</i>(2)</a> system call.
 </p>

+<h3 id="known_issues">Known Issues</h3>
+
+<p>
+There are some instabilities on FreeBSD that are known but not understood.
+These can lead to program crashes in rare cases.
+See <a href="https://golang.org/issue/16136">issue 16136</a>,
+<a href="https://golang.org/issue/15658">issue 15658</a>,
+and <a href="https://golang.org/issue/16396">issue 16396</a>.
+Any help in solving these FreeBSD-specific issues would be appreciated.
+</p>
+
 <h2 id="tools">Tools</h2>

 <h3 id="cmd_asm">Assembler</h3>
@@ -288,6 +299,18 @@ To avoid confusion with the new <code>-tests</code> check, the old, unadvertised
 <code>-test</code> option has been removed; it was equivalent to <code>-all</code> <code>-shadow</code>.
 </p>

+<p id="vet_lostcancel">
+The <code>vet</code> command also has a new check,
+<code>-lostcancel</code>, which detects failure to call the
+cancelation function returned by the <code>WithCancel</code>,
+<code>WithTimeout</code>, and <code>WithDeadline</code> functions in
+Go 1.7's new <code>context</code> package (see <a
+href='#context'>below</a>).
+Failure to call the function prevents the new <code>Context</code>
+from being reclaimed until its parent is cancelled.
+(The background context is never cancelled.)
+</p>
+
 <h3 id="cmd_dist">Go tool dist</h3>

 <p>
@@ -335,7 +358,7 @@ the code generation changes alone typically reduce program CPU time by 5-35%.
 </p>

 <p>
-<!-- git log &#45&#45grep '-[0-9][0-9]\.[0-9][0-9]%' go1.6.. -->
+<!-- git log -''-grep '-[0-9][0-9]\.[0-9][0-9]%' go1.6.. -->
 There have been significant optimizations bringing more than 10% improvements
 to implementations in the
 <a href="/pkg/crypto/sha1/"><code>crypto/sha1</code></a>,
@@ -355,6 +378,12 @@ and
 packages.
 </p>

+<p>
+Garbage collection pauses should be significantly shorter than they
+were in Go 1.6 for programs with large numbers of idle goroutines,
+substantial stack size fluctuation, or large package-level variables.
+</p>
+
 <h2 id="library">Core library</h2>

 <h3 id="context">Context</h3>
@@ -450,6 +479,13 @@ eliminating the
 common in some environments.
 </p>

+<p>
+The runtime can now return unused memory to the operating system on
+all architectures.
+In Go 1.6 and earlier, the runtime could not
+release memory on ARM64, 64-bit PowerPC, or MIPS.
+</p>
+
 <p>
 On Windows, Go programs in Go 1.5 and earlier forced
 the global Windows timer resolution to 1ms at startup
@@ -517,10 +553,9 @@ The

 <dd>
 <p>
-As noted above,
-there are significant performance optimizations throughout the package.
+There are many performance optimizations throughout the package.
 Decompression speed is improved by about 10%,
-while compression speed for <code>DefaultCompression</code> is roughly doubled.
+while compression for <code>DefaultCompression</code> is twice as fast.
 </p>

 <p>
@@ -770,6 +805,30 @@ package.
 </dd>
 </dl>

+<dl id="math_rand"><dt><a href="/pkg/math/rand/">math/rand</a></dt>
+
+<dd>
+<p>
+The
+<a href="/pkg/math/rand/#Read"><code>Read</code></a> function and
+<a href="/pkg/math/rand/#Rand"><code>Rand</code></a>'s
+<a href="/pkg/math/rand/#Rand.Read"><code>Read</code></a> method
+now produce a pseudo-random stream of bytes that is consistent and not
+dependent on the size of the input buffer.
+</p>
+
+<p>
+The documentation clarifies that
+Rand's <a href="/pkg/math/rand/#Rand.Seed"><code>Seed</code></a>
+and <a href="/pkg/math/rand/#Rand.Read"><code>Read</code></a> methods
+are not safe to call concurrently, though the global
+functions <a href="/pkg/math/rand/#Seed"><code>Seed</code></a>
+and <a href="/pkg/math/rand/#Read"><code>Read</code></a> are (and have
+always been) safe.
+</p>
+</dd>
+</dl>
+
 <dl id="mime_multipart"><dt><a href="/pkg/mime/multipart/">mime/multipart</a></dt>

 <dd>
@@ -847,6 +906,12 @@ For example, the address on which a request received is
 <code>req.Context().Value(http.LocalAddrContextKey).(net.Addr)</code>.
 </p>

+<p>
+The server's <a href="/pkg/net/http/#Server.Serve"><code>Serve</code></a> method
+now only enables HTTP/2 support if the <code>Server.TLSConfig</code> field is <code>nil</code>
+or includes <code>"h2"</code> in its <code>TLSConfig.NextProto</code>.
+</p>
+
 <p>
 The server implementation now
 pads response codes less than 100 to three digits
@@ -855,6 +920,23 @@ so that <code>w.WriteHeader(5)</code> uses the HTTP response
 status <code>005</code>, not just <code>5</code>.
 </p>

+<p>
+The server implementation now correctly sends only one "Transfer-Encoding" header when "chunked"
+is set explicitly, following <a href="https://tools.ietf.org/html/rfc7230#section-3.3.1">RFC 7230</a>.
+</p>
+
+<p>
+The server implementation is now stricter about rejecting requests with invalid HTTP versions.
+Invalid requests claiming to be HTTP/0.x are now rejected (HTTP/0.9 was never fully supported),
+and plaintext HTTP/2 requests other than the "PRI * HTTP/2.0" upgrade request are now rejected as well.
+The server continues to handle encrypted HTTP/2 requests.
+</p>
+
+<p>
+In the server, a 200 status code is sent back by the timeout handler on an empty
+response body, instead of sending back 0 as the status code.
+</p>
+
 <p>
 In the client, the
 <a href="/pkg/net/http/#Transport"><code>Transport</code></a> implementation passes the request context
@@ -1026,7 +1108,7 @@ from URLs with empty query strings (like <code>/search?</code>).

 <dd>
 <p>
-<a href="/pkg/os/#IsExists"><code>IsExists</code></a> now returns true for <code>syscall.ENOTEMPTY</code>,
+<a href="/pkg/os/#IsExist"><code>IsExist</code></a> now returns true for <code>syscall.ENOTEMPTY</code>,
 on systems where that error exists.
 </p>

@@ -1186,3 +1268,15 @@ system call before executing the new program.
 </p>
 </dd>
 </dl>
+
+
+<dl id="unicode"><dt><a href="/pkg/unicode/">unicode</a></dt>
+
+<dd>
+<p>
+The <a href="/pkg/unicode/"><code>unicode</code></a> package and associated
+support throughout the system has been upgraded from version 8.0 to
+<a href="http://www.unicode.org/versions/Unicode9.0.0/">Unicode 9.0</a>.
+</p>
+</dd>
+</dl>
--- a/doc/install-source.html
+++ b/doc/install-source.html
@@ -33,7 +33,7 @@ compiler using the GCC back end, see
 </p>

 <p>
-The Go compilers support six instruction sets.
+The Go compilers support seven instruction sets.
 There are important differences in the quality of the compilers for the different
 architectures.
 </p>
@@ -43,15 +43,17 @@ architectures.
 	<code>amd64</code> (also known as <code>x86-64</code>)
 </dt>
 <dd>
-	A mature implementation. The compiler has an effective
-	optimizer (registerizer) and generates good code (although
-	<code>gccgo</code> can do noticeably better sometimes).
+	A mature implementation. New in 1.7 is its SSA-based back end
+	that generates compact, efficient code.
 </dd>
 <dt>
 	<code>386</code> (<code>x86</code> or <code>x86-32</code>)
 </dt>
 <dd>
-	Comparable to the <code>amd64</code> port.
+	Comparable to the <code>amd64</code> port, but does
+	not yet use the SSA-based back end. It has an effective
+	optimizer (registerizer) and generates good code (although
+	<code>gccgo</code> can do noticeably better sometimes).
 </dd>
 <dt>
 	<code>arm</code> (<code>ARM</code>)
@@ -77,6 +79,12 @@ architectures.
 <dd>
 	Supports Linux binaries. New in 1.6 and not as well exercised as other ports.
 </dd>
+<dt>
+	<code>s390x</code> (IBM System z)
+</dt>
+<dd>
+	Supports Linux binaries. New in 1.7 and not as well exercised as other ports.
+</dd>
 </dl>

 <p>
@@ -195,7 +203,7 @@ To build without <code>cgo</code>, set the environment variable
 Change to the directory that will be its parent
 and make sure the <code>go</code> directory does not exist.
 Then clone the repository and check out the latest release tag
-(<code class="versionTag">go1.6</code>, for example):</p>
+(<code class="versionTag">go1.7</code>, for example):</p>

 <pre>
 $ git clone https://go.googlesource.com/go
@@ -383,7 +391,7 @@ New releases are announced on the
 <a href="//groups.google.com/group/golang-announce">golang-announce</a>
 mailing list.
 Each announcement mentions the latest release tag, for instance,
-<code class="versionTag">go1.6</code>.
+<code class="versionTag">go1.7</code>.
 </p>

 <p>
--- a/doc/install.html
+++ b/doc/install.html
@@ -17,7 +17,7 @@
 <p>
 <a href="https://golang.org/dl/" target="_blank">Official binary
 distributions</a> are available for the FreeBSD (release 8-STABLE and above),
-Linux, Mac OS X (10.7 and above), and Windows operating systems and
+Linux, Mac OS X (10.8 and above), and Windows operating systems and
 the 32-bit (<code>386</code>) and 64-bit (<code>amd64</code>) x86 processor
 architectures.
 </p>
@@ -49,7 +49,7 @@ If your OS or architecture is not on the list, you may be able to
 <tr><td colspan="3"><hr></td></tr>
 <tr><td>FreeBSD 8-STABLE or later</td> <td>amd64</td> <td>Debian GNU/kFreeBSD not supported</td></tr>
 <tr><td>Linux 2.6.23 or later with glibc</td> <td>amd64, 386, arm</td> <td>CentOS/RHEL 5.x not supported</td></tr>
-<tr><td>Mac OS X 10.7 or later</td> <td>amd64</td> <td>use the clang or gcc<sup>&#8224;</sup> that comes with Xcode<sup>&#8225;</sup></td></tr>
+<tr><td>Mac OS X 10.7 or later</td> <td>amd64</td> <td>use the clang or gcc<sup>&#8224;</sup> that comes with Xcode<sup>&#8225;</sup> for <code>cgo</code> support</td></tr>
 <tr><td>Windows XP or later</td> <td>amd64, 386</td> <td>use MinGW gcc<sup>&#8224;</sup>. No need for cygwin or msys.</td></tr>
 </table>

--- a/lib/time/update.bash
+++ b/lib/time/update.bash
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # Copyright 2012 The Go Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
@@ -8,8 +8,8 @@
 # Consult http://www.iana.org/time-zones for the latest versions.

 # Versions to use.
-CODE=2016d
-DATA=2016d
+CODE=2016f
+DATA=2016f

 set -e
 rm -rf work
--- a/lib/time/zoneinfo.zip
+++ b/lib/time/zoneinfo.zip
--- a/misc/cgo/errors/issue16116.go
+++ b/misc/cgo/errors/issue16116.go
@@ -0,0 +1,12 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// void f(void *p, int x) {}
+import "C"
+
+func main() {
+	_ = C.f(1) // ERROR HERE
+}
--- a/misc/cgo/errors/test.bash
+++ b/misc/cgo/errors/test.bash
@@ -45,6 +45,7 @@ expect issue13129.go C.ushort
 check issue13423.go
 expect issue13635.go C.uchar C.schar C.ushort C.uint C.ulong C.longlong C.ulonglong C.complexfloat C.complexdouble
 check issue13830.go
+check issue16116.go

 if ! go build issue14669.go; then
 	exit 1
--- a/misc/nacl/testzip.proto
+++ b/misc/nacl/testzip.proto
@@ -18,6 +18,10 @@ go	src=..
 					asm
 						testdata
 							+
+			compile
+				internal
+					syntax
+						parser.go
 			doc
 				main.go
 				pkg.go
--- a/misc/trace/trace_viewer_lean.html
+++ b/misc/trace/trace_viewer_lean.html
@@ -5281,7 +5281,7 @@ if(traces.length&&!this.hasEventDataDecoder_(importers)){throw new Error('Could
 importers.sort(function(x,y){return x.importPriority-y.importPriority;});},this);lastTask=lastTask.timedAfter('TraceImport',function importClockSyncMarkers(task){importers.forEach(function(importer,index){task.subTask(Timing.wrapNamedFunction('TraceImport',importer.importerName,function runImportClockSyncMarkersOnOneImporter(){progressMeter.update('Importing clock sync markers '+(index+1)+' of '+
 importers.length);importer.importClockSyncMarkers();}),this);},this);},this);lastTask=lastTask.timedAfter('TraceImport',function runImport(task){importers.forEach(function(importer,index){task.subTask(Timing.wrapNamedFunction('TraceImport',importer.importerName,function runImportEventsOnOneImporter(){progressMeter.update('Importing '+(index+1)+' of '+importers.length);importer.importEvents();}),this);},this);},this);if(this.importOptions_.customizeModelCallback){lastTask=lastTask.timedAfter('TraceImport',function runCustomizeCallbacks(task){this.importOptions_.customizeModelCallback(this.model_);},this);}
 lastTask=lastTask.timedAfter('TraceImport',function importSampleData(task){importers.forEach(function(importer,index){progressMeter.update('Importing sample data '+(index+1)+'/'+importers.length);importer.importSampleData();},this);},this);lastTask=lastTask.timedAfter('TraceImport',function runAutoclosers(){progressMeter.update('Autoclosing open slices...');this.model_.autoCloseOpenSlices();this.model_.createSubSlices();},this);lastTask=lastTask.timedAfter('TraceImport',function finalizeImport(task){importers.forEach(function(importer,index){progressMeter.update('Finalizing import '+(index+1)+'/'+importers.length);importer.finalizeImport();},this);},this);lastTask=lastTask.timedAfter('TraceImport',function runPreinits(){progressMeter.update('Initializing objects (step 1/2)...');this.model_.preInitializeObjects();},this);if(this.importOptions_.pruneEmptyContainers){lastTask=lastTask.timedAfter('TraceImport',function runPruneEmptyContainers(){progressMeter.update('Pruning empty containers...');this.model_.pruneEmptyContainers();},this);}
-lastTask=lastTask.timedAfter('TraceImport',function runMergeKernelWithuserland(){progressMeter.update('Merging kernel with userland...');this.model_.mergeKernelWithUserland();},this);var auditors=[];lastTask=lastTask.timedAfter('TraceImport',function createAuditorsAndRunAnnotate(){progressMeter.update('Adding arbitrary data to model...');auditors=this.importOptions_.auditorConstructors.map(function(auditorConstructor){return new auditorConstructor(this.model_);},this);auditors.forEach(function(auditor){auditor.runAnnotate();auditor.installUserFriendlyCategoryDriverIfNeeded();});},this);lastTask=lastTask.timedAfter('TraceImport',function computeWorldBounds(){progressMeter.update('Computing final world bounds...');this.model_.computeWorldBounds(this.importOptions_.shiftWorldToZero);},this);lastTask=lastTask.timedAfter('TraceImport',function buildFlowEventIntervalTree(){progressMeter.update('Building flow event map...');this.model_.buildFlowEventIntervalTree();},this);lastTask=lastTask.timedAfter('TraceImport',function joinRefs(){progressMeter.update('Joining object refs...');this.model_.joinRefs();},this);lastTask=lastTask.timedAfter('TraceImport',function cleanupUndeletedObjects(){progressMeter.update('Cleaning up undeleted objects...');this.model_.cleanupUndeletedObjects();},this);lastTask=lastTask.timedAfter('TraceImport',function sortMemoryDumps(){progressMeter.update('Sorting memory dumps...');this.model_.sortMemoryDumps();},this);lastTask=lastTask.timedAfter('TraceImport',function finalizeMemoryGraphs(){progressMeter.update('Finalizing memory dump graphs...');this.model_.finalizeMemoryGraphs();},this);lastTask=lastTask.timedAfter('TraceImport',function initializeObjects(){progressMeter.update('Initializing objects (step 2/2)...');this.model_.initializeObjects();},this);lastTask=lastTask.timedAfter('TraceImport',function buildEventIndices(){progressMeter.update('Building event indices...');this.model_.buildEventIndices();},this);lastTask=lastTask.timedAfter('TraceImport',function buildUserModel(){progressMeter.update('Building UserModel...');var userModelBuilder=new tr.importer.UserModelBuilder(this.model_);userModelBuilder.buildUserModel();},this);lastTask=lastTask.timedAfter('TraceImport',function sortExpectations(){progressMeter.update('Sorting user expectations...');this.model_.userModel.sortExpectations();},this);lastTask=lastTask.timedAfter('TraceImport',function runAudits(){progressMeter.update('Running auditors...');auditors.forEach(function(auditor){auditor.runAudit();});},this);lastTask=lastTask.timedAfter('TraceImport',function sortAlerts(){progressMeter.update('Updating alerts...');this.model_.sortAlerts();},this);lastTask=lastTask.timedAfter('TraceImport',function lastUpdateBounds(){progressMeter.update('Update bounds...');this.model_.updateBounds();},this);lastTask=lastTask.timedAfter('TraceImport',function addModelWarnings(){progressMeter.update('Looking for warnings...');if(!this.model_.isTimeHighResolution){this.model_.importWarning({type:'low_resolution_timer',message:'Trace time is low resolution, trace may be unusable.',showToUser:true});}},this);lastTask.after(function(){this.importing_=false;},this);return importTask;},createImporter_:function(eventData){var importerConstructor=tr.importer.Importer.findImporterFor(eventData);if(!importerConstructor){throw new Error('Couldn\'t create an importer for the provided '+'eventData.');}
+lastTask=lastTask.timedAfter('TraceImport',function runMergeKernelWithuserland(){progressMeter.update('Merging kernel with userland...');this.model_.mergeKernelWithUserland();},this);var auditors=[];lastTask=lastTask.timedAfter('TraceImport',function createAuditorsAndRunAnnotate(){progressMeter.update('Adding arbitrary data to model...');auditors=this.importOptions_.auditorConstructors.map(function(auditorConstructor){return new auditorConstructor(this.model_);},this);auditors.forEach(function(auditor){auditor.runAnnotate();auditor.installUserFriendlyCategoryDriverIfNeeded();});},this);lastTask=lastTask.timedAfter('TraceImport',function computeWorldBounds(){progressMeter.update('Computing final world bounds...');this.model_.computeWorldBounds(this.importOptions_.shiftWorldToZero);},this);lastTask=lastTask.timedAfter('TraceImport',function buildFlowEventIntervalTree(){progressMeter.update('Building flow event map...');this.model_.buildFlowEventIntervalTree();},this);lastTask=lastTask.timedAfter('TraceImport',function joinRefs(){progressMeter.update('Joining object refs...');this.model_.joinRefs();},this);lastTask=lastTask.timedAfter('TraceImport',function cleanupUndeletedObjects(){progressMeter.update('Cleaning up undeleted objects...');this.model_.cleanupUndeletedObjects();},this);lastTask=lastTask.timedAfter('TraceImport',function sortMemoryDumps(){progressMeter.update('Sorting memory dumps...');this.model_.sortMemoryDumps();},this);lastTask=lastTask.timedAfter('TraceImport',function finalizeMemoryGraphs(){progressMeter.update('Finalizing memory dump graphs...');this.model_.finalizeMemoryGraphs();},this);lastTask=lastTask.timedAfter('TraceImport',function initializeObjects(){progressMeter.update('Initializing objects (step 2/2)...');this.model_.initializeObjects();},this);lastTask=lastTask.timedAfter('TraceImport',function buildEventIndices(){progressMeter.update('Building event indices...');this.model_.buildEventIndices();},this);lastTask=lastTask.timedAfter('TraceImport',function buildUserModel(){progressMeter.update('Building UserModel...');var userModelBuilder=new tr.importer.UserModelBuilder(this.model_);userModelBuilder.buildUserModel();},this);lastTask=lastTask.timedAfter('TraceImport',function sortExpectations(){progressMeter.update('Sorting user expectations...');this.model_.userModel.sortExpectations();},this);lastTask=lastTask.timedAfter('TraceImport',function runAudits(){progressMeter.update('Running auditors...');auditors.forEach(function(auditor){auditor.runAudit();});},this);lastTask=lastTask.timedAfter('TraceImport',function sortAlerts(){progressMeter.update('Updating alerts...');this.model_.sortAlerts();},this);lastTask=lastTask.timedAfter('TraceImport',function lastUpdateBounds(){progressMeter.update('Update bounds...');this.model_.updateBounds();},this);lastTask=lastTask.timedAfter('TraceImport',function addModelWarnings(){progressMeter.update('Looking for warnings...');if(!this.model_.isTimeHighResolution){this.model_.importWarning({type:'low_resolution_timer',message:'Trace time is low resolution, trace may be unusable.',showToUser:false});}},this);lastTask.after(function(){this.importing_=false;},this);return importTask;},createImporter_:function(eventData){var importerConstructor=tr.importer.Importer.findImporterFor(eventData);if(!importerConstructor){throw new Error('Couldn\'t create an importer for the provided '+'eventData.');}
 return new importerConstructor(this.model_,eventData);},hasEventDataDecoder_:function(importers){for(var i=0;i<importers.length;++i){if(!importers[i].isTraceDataContainer())
 return true;}
 return false;}};return{ImportOptions:ImportOptions,Import:Import};});'use strict';tr.exportTo('tr.e.cc',function(){function PictureAsImageData(picture,errorOrImageData){this.picture_=picture;if(errorOrImageData instanceof ImageData){this.error_=undefined;this.imageData_=errorOrImageData;}else{this.error_=errorOrImageData;this.imageData_=undefined;}};PictureAsImageData.Pending=function(picture){return new PictureAsImageData(picture,undefined);};PictureAsImageData.prototype={get picture(){return this.picture_;},get error(){return this.error_;},get imageData(){return this.imageData_;},isPending:function(){return this.error_===undefined&&this.imageData_===undefined;},asCanvas:function(){if(!this.imageData_)
--- a/src/archive/tar/writer.go
+++ b/src/archive/tar/writer.go
@@ -317,7 +317,7 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) erro
 	var buf bytes.Buffer

 	// Keys are sorted before writing to body to allow deterministic output.
-	var keys []string
+	keys := make([]string, 0, len(paxHeaders))
 	for k := range paxHeaders {
 		keys = append(keys, k)
 	}
--- a/src/bytes/example_test.go
+++ b/src/bytes/example_test.go
@@ -11,6 +11,7 @@ import (
 	"io"
 	"os"
 	"sort"
+	"unicode"
 )

 func ExampleBuffer() {
@@ -83,3 +84,205 @@ func ExampleTrimPrefix() {
 	fmt.Printf("Hello%s", b)
 	// Output: Hello, world!
 }
+
+func ExampleFields() {
+	fmt.Printf("Fields are: %q", bytes.Fields([]byte("  foo bar  baz   ")))
+	// Output: Fields are: ["foo" "bar" "baz"]
+}
+
+func ExampleFieldsFunc() {
+	f := func(c rune) bool {
+		return !unicode.IsLetter(c) && !unicode.IsNumber(c)
+	}
+	fmt.Printf("Fields are: %q", bytes.FieldsFunc([]byte("  foo1;bar2,baz3..."), f))
+	// Output: Fields are: ["foo1" "bar2" "baz3"]
+}
+
+func ExampleContains() {
+	fmt.Println(bytes.Contains([]byte("seafood"), []byte("foo")))
+	fmt.Println(bytes.Contains([]byte("seafood"), []byte("bar")))
+	fmt.Println(bytes.Contains([]byte("seafood"), []byte("")))
+	fmt.Println(bytes.Contains([]byte(""), []byte("")))
+	// Output:
+	// true
+	// false
+	// true
+	// true
+}
+
+func ExampleCount() {
+	fmt.Println(bytes.Count([]byte("cheese"), []byte("e")))
+	fmt.Println(bytes.Count([]byte("five"), []byte(""))) // before & after each rune
+	// Output:
+	// 3
+	// 5
+}
+
+func ExampleEqualFold() {
+	fmt.Println(bytes.EqualFold([]byte("Go"), []byte("go")))
+	// Output: true
+}
+
+func ExampleHasPrefix() {
+	fmt.Println(bytes.HasPrefix([]byte("Gopher"), []byte("Go")))
+	fmt.Println(bytes.HasPrefix([]byte("Gopher"), []byte("C")))
+	fmt.Println(bytes.HasPrefix([]byte("Gopher"), []byte("")))
+	// Output:
+	// true
+	// false
+	// true
+}
+
+func ExampleHasSuffix() {
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("go")))
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("O")))
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("Ami")))
+	fmt.Println(bytes.HasSuffix([]byte("Amigo"), []byte("")))
+	// Output:
+	// true
+	// false
+	// false
+	// true
+}
+
+func ExampleIndex() {
+	fmt.Println(bytes.Index([]byte("chicken"), []byte("ken")))
+	fmt.Println(bytes.Index([]byte("chicken"), []byte("dmr")))
+	// Output:
+	// 4
+	// -1
+}
+
+func ExampleIndexFunc() {
+	f := func(c rune) bool {
+		return unicode.Is(unicode.Han, c)
+	}
+	fmt.Println(bytes.IndexFunc([]byte("Hello, 世界"), f))
+	fmt.Println(bytes.IndexFunc([]byte("Hello, world"), f))
+	// Output:
+	// 7
+	// -1
+}
+
+func ExampleIndexAny() {
+	fmt.Println(bytes.IndexAny([]byte("chicken"), "aeiouy"))
+	fmt.Println(bytes.IndexAny([]byte("crwth"), "aeiouy"))
+	// Output:
+	// 2
+	// -1
+}
+
+func ExampleIndexRune() {
+	fmt.Println(bytes.IndexRune([]byte("chicken"), 'k'))
+	fmt.Println(bytes.IndexRune([]byte("chicken"), 'd'))
+	// Output:
+	// 4
+	// -1
+}
+
+func ExampleLastIndex() {
+	fmt.Println(bytes.Index([]byte("go gopher"), []byte("go")))
+	fmt.Println(bytes.LastIndex([]byte("go gopher"), []byte("go")))
+	fmt.Println(bytes.LastIndex([]byte("go gopher"), []byte("rodent")))
+	// Output:
+	// 0
+	// 3
+	// -1
+}
+
+func ExampleJoin() {
+	s := [][]byte{[]byte("foo"), []byte("bar"), []byte("baz")}
+	fmt.Printf("%s", bytes.Join(s, []byte(", ")))
+	// Output: foo, bar, baz
+}
+
+func ExampleRepeat() {
+	fmt.Printf("ba%s", bytes.Repeat([]byte("na"), 2))
+	// Output: banana
+}
+
+func ExampleReplace() {
+	fmt.Printf("%s\n", bytes.Replace([]byte("oink oink oink"), []byte("k"), []byte("ky"), 2))
+	fmt.Printf("%s\n", bytes.Replace([]byte("oink oink oink"), []byte("oink"), []byte("moo"), -1))
+	// Output:
+	// oinky oinky oink
+	// moo moo moo
+}
+
+func ExampleSplit() {
+	fmt.Printf("%q\n", bytes.Split([]byte("a,b,c"), []byte(",")))
+	fmt.Printf("%q\n", bytes.Split([]byte("a man a plan a canal panama"), []byte("a ")))
+	fmt.Printf("%q\n", bytes.Split([]byte(" xyz "), []byte("")))
+	fmt.Printf("%q\n", bytes.Split([]byte(""), []byte("Bernardo O'Higgins")))
+	// Output:
+	// ["a" "b" "c"]
+	// ["" "man " "plan " "canal panama"]
+	// [" " "x" "y" "z" " "]
+	// [""]
+}
+
+func ExampleSplitN() {
+	fmt.Printf("%q\n", bytes.SplitN([]byte("a,b,c"), []byte(","), 2))
+	z := bytes.SplitN([]byte("a,b,c"), []byte(","), 0)
+	fmt.Printf("%q (nil = %v)\n", z, z == nil)
+	// Output:
+	// ["a" "b,c"]
+	// [] (nil = true)
+}
+
+func ExampleSplitAfter() {
+	fmt.Printf("%q\n", bytes.SplitAfter([]byte("a,b,c"), []byte(",")))
+	// Output: ["a," "b," "c"]
+}
+
+func ExampleSplitAfterN() {
+	fmt.Printf("%q\n", bytes.SplitAfterN([]byte("a,b,c"), []byte(","), 2))
+	// Output: ["a," "b,c"]
+}
+
+func ExampleTitle() {
+	fmt.Printf("%s", bytes.Title([]byte("her royal highness")))
+	// Output: Her Royal Highness
+}
+
+func ExampleToTitle() {
+	fmt.Printf("%s\n", bytes.ToTitle([]byte("loud noises")))
+	fmt.Printf("%s\n", bytes.ToTitle([]byte("хлеб")))
+	// Output:
+	// LOUD NOISES
+	// ХЛЕБ
+}
+
+func ExampleTrim() {
+	fmt.Printf("[%q]", bytes.Trim([]byte(" !!! Achtung! Achtung! !!! "), "! "))
+	// Output: ["Achtung! Achtung"]
+}
+
+func ExampleMap() {
+	rot13 := func(r rune) rune {
+		switch {
+		case r >= 'A' && r <= 'Z':
+			return 'A' + (r-'A'+13)%26
+		case r >= 'a' && r <= 'z':
+			return 'a' + (r-'a'+13)%26
+		}
+		return r
+	}
+	fmt.Printf("%s", bytes.Map(rot13, []byte("'Twas brillig and the slithy gopher...")))
+	// Output: 'Gjnf oevyyvt naq gur fyvgul tbcure...
+}
+
+func ExampleTrimSpace() {
+	fmt.Printf("%s", bytes.TrimSpace([]byte(" \t\n a lone gopher \n\t\r\n")))
+	// Output: a lone gopher
+}
+
+func ExampleToUpper() {
+	fmt.Printf("%s", bytes.ToUpper([]byte("Gopher")))
+	// Output: GOPHER
+}
+
+func ExampleToLower() {
+	fmt.Printf("%s", bytes.ToLower([]byte("Gopher")))
+	// Output: gopher
+}
--- a/src/cmd/api/goapi.go
+++ b/src/cmd/api/goapi.go
@@ -425,7 +425,7 @@ func (w *Walker) Import(name string) (*types.Package, error) {
 	w.imported[name] = &importing

 	root := w.root
-	if strings.HasPrefix(name, "golang.org/x/") {
+	if strings.HasPrefix(name, "golang_org/x/") {
 		root = filepath.Join(root, "vendor")
 	}

--- a/src/cmd/asm/internal/asm/asm.go
+++ b/src/cmd/asm/internal/asm/asm.go
@@ -403,7 +403,7 @@ func (p *Parser) asmJump(op obj.As, cond string, a []obj.Addr) {

 		fallthrough
 	default:
-		p.errorf("wrong number of arguments to %s instruction", obj.Aconv(op))
+		p.errorf("wrong number of arguments to %s instruction", op)
 		return
 	}
 	switch {
@@ -476,7 +476,7 @@ func (p *Parser) branch(jmp, target *obj.Prog) {
 // asmInstruction assembles an instruction.
 // MOVW R9, (R10)
 func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
-	// fmt.Printf("%s %+v\n", obj.Aconv(op), a)
+	// fmt.Printf("%s %+v\n", op, a)
 	prog := &obj.Prog{
 		Ctxt:   p.ctxt,
 		Lineno: p.histLineNum,
@@ -525,7 +525,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 					prog.To = a[1]
 					break
 				}
-				p.errorf("unrecognized addressing for %s", obj.Aconv(op))
+				p.errorf("unrecognized addressing for %s", op)
 				return
 			}
 			if arch.IsARMFloatCmp(op) {
@@ -572,7 +572,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			// Catch missing operand here, because we store immediate as part of From3, and can't distinguish
 			// missing operand from legal value 0 in obj/x86/asm6.
 			if arch.IsAMD4OP(op) {
-				p.errorf("4 operands required, but only 3 are provided for %s instruction", obj.Aconv(op))
+				p.errorf("4 operands required, but only 3 are provided for %s instruction", op)
 			}
 			prog.From = a[0]
 			prog.From3 = newAddr(a[1])
@@ -583,7 +583,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 				prog.From = a[0]
 				prog.To = a[1]
 				if a[2].Type != obj.TYPE_REG {
-					p.errorf("invalid addressing modes for third operand to %s instruction, must be register", obj.Aconv(op))
+					p.errorf("invalid addressing modes for third operand to %s instruction, must be register", op)
 					return
 				}
 				prog.RegTo2 = a[2].Reg
@@ -619,7 +619,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 				prog.From3 = newAddr(a[1])
 				prog.To = a[2]
 			default:
-				p.errorf("invalid addressing modes for %s instruction", obj.Aconv(op))
+				p.errorf("invalid addressing modes for %s instruction", op)
 				return
 			}
 		case sys.S390X:
@@ -656,10 +656,10 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.From = a[1]
 			prog.From3 = newAddr(a[2])
 			if a[0].Type != obj.TYPE_CONST {
-				p.errorf("first operand must be an immediate in %s instruction", obj.Aconv(op))
+				p.errorf("first operand must be an immediate in %s instruction", op)
 			}
 			if prog.From3.Type != obj.TYPE_REG {
-				p.errorf("third operand must be a register in %s instruction", obj.Aconv(op))
+				p.errorf("third operand must be a register in %s instruction", op)
 			}
 			prog.From3.Offset = int64(p.getImmediate(prog, op, &a[0]))
 			prog.To = a[3]
@@ -690,7 +690,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.To = a[3]
 			break
 		}
-		p.errorf("can't handle %s instruction with 4 operands", obj.Aconv(op))
+		p.errorf("can't handle %s instruction with 4 operands", op)
 		return
 	case 5:
 		if p.arch.Family == sys.PPC64 && arch.IsPPC64RLD(op) {
@@ -712,7 +712,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.To = a[4]
 			break
 		}
-		p.errorf("can't handle %s instruction with 5 operands", obj.Aconv(op))
+		p.errorf("can't handle %s instruction with 5 operands", op)
 		return
 	case 6:
 		if p.arch.Family == sys.ARM && arch.IsARMMRC(op) {
@@ -736,7 +736,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 		}
 		fallthrough
 	default:
-		p.errorf("can't handle %s instruction with %d operands", obj.Aconv(op), len(a))
+		p.errorf("can't handle %s instruction with %d operands", op, len(a))
 		return
 	}

@@ -771,7 +771,7 @@ func (p *Parser) getConstantPseudo(pseudo string, addr *obj.Addr) int64 {
 // getConstant checks that addr represents a plain constant and returns its value.
 func (p *Parser) getConstant(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 	if addr.Type != obj.TYPE_MEM || addr.Name != 0 || addr.Reg != 0 || addr.Index != 0 {
-		p.errorf("%s: expected integer constant; found %s", obj.Aconv(op), obj.Dconv(prog, addr))
+		p.errorf("%s: expected integer constant; found %s", op, obj.Dconv(prog, addr))
 	}
 	return addr.Offset
 }
@@ -779,7 +779,7 @@ func (p *Parser) getConstant(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 // getImmediate checks that addr represents an immediate constant and returns its value.
 func (p *Parser) getImmediate(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 	if addr.Type != obj.TYPE_CONST || addr.Name != 0 || addr.Reg != 0 || addr.Index != 0 {
-		p.errorf("%s: expected immediate constant; found %s", obj.Aconv(op), obj.Dconv(prog, addr))
+		p.errorf("%s: expected immediate constant; found %s", op, obj.Dconv(prog, addr))
 	}
 	return addr.Offset
 }
@@ -787,7 +787,7 @@ func (p *Parser) getImmediate(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 {
 // getRegister checks that addr represents a register and returns its value.
 func (p *Parser) getRegister(prog *obj.Prog, op obj.As, addr *obj.Addr) int16 {
 	if addr.Type != obj.TYPE_REG || addr.Offset != 0 || addr.Name != 0 || addr.Index != 0 {
-		p.errorf("%s: expected register; found %s", obj.Aconv(op), obj.Dconv(prog, addr))
+		p.errorf("%s: expected register; found %s", op, obj.Dconv(prog, addr))
 	}
 	return addr.Reg
 }
--- a/src/cmd/asm/internal/asm/operand_test.go
+++ b/src/cmd/asm/internal/asm/operand_test.go
@@ -17,6 +17,7 @@ import (

 func setArch(goarch string) (*arch.Arch, *obj.Link) {
 	os.Setenv("GOOS", "linux") // obj can handle this OS for all architectures.
+	os.Setenv("GOARCH", goarch)
 	architecture := arch.Set(goarch)
 	if architecture == nil {
 		panic("asm: unrecognized architecture " + goarch)
--- a/src/cmd/cgo/gcc.go
+++ b/src/cmd/cgo/gcc.go
@@ -597,13 +597,15 @@ func (p *Package) rewriteCalls(f *File) {
 // rewriteCall rewrites one call to add pointer checks. We replace
 // each pointer argument x with _cgoCheckPointer(x).(T).
 func (p *Package) rewriteCall(f *File, call *Call, name *Name) {
+	// Avoid a crash if the number of arguments is
+	// less than the number of parameters.
+	// This will be caught when the generated file is compiled.
+	if len(call.Call.Args) < len(name.FuncType.Params) {
+		return
+	}
+
 	any := false
 	for i, param := range name.FuncType.Params {
-		if len(call.Call.Args) <= i {
-			// Avoid a crash; this will be caught when the
-			// generated file is compiled.
-			return
-		}
 		if p.needsPointerCheck(f, param.Go, call.Call.Args[i]) {
 			any = true
 			break
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -156,6 +156,36 @@ func opregreg(op obj.As, dest, src int16) *obj.Prog {
 	return p
 }

+// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD,
+// See runtime/mkduff.go.
+func duffStart(size int64) int64 {
+	x, _ := duff(size)
+	return x
+}
+func duffAdj(size int64) int64 {
+	_, x := duff(size)
+	return x
+}
+
+// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
+// required to use the duffzero mechanism for a block of the given size.
+func duff(size int64) (int64, int64) {
+	if size < 32 || size > 1024 || size%dzClearStep != 0 {
+		panic("bad duffzero size")
+	}
+	steps := size / dzClearStep
+	blocks := steps / dzBlockLen
+	steps %= dzBlockLen
+	off := dzBlockSize * (dzBlocks - blocks)
+	var adj int64
+	if steps != 0 {
+		off -= dzAddSize
+		off -= dzMovSize * steps
+		adj -= dzClearStep * (dzBlockLen - steps)
+	}
+	return off, adj
+}
+
 func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 	s.SetLineno(v.Line)
 	switch v.Op {
@@ -209,89 +239,87 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		}
 		opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1]))

-	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW,
-		ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU,
-		ssa.OpAMD64MODQ, ssa.OpAMD64MODL, ssa.OpAMD64MODW,
-		ssa.OpAMD64MODQU, ssa.OpAMD64MODLU, ssa.OpAMD64MODWU:
+	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
+		// Arg[0] (the dividend) is in AX.
+		// Arg[1] (the divisor) can be in any other register.
+		// Result[0] (the quotient) is in AX.
+		// Result[1] (the remainder) is in DX.
+		r := gc.SSARegNum(v.Args[1])

-		// Arg[0] is already in AX as it's the only register we allow
-		// and AX is the only output
-		x := gc.SSARegNum(v.Args[1])
-
-		// CPU faults upon signed overflow, which occurs when most
-		// negative int is divided by -1.
-		var j *obj.Prog
-		if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL ||
-			v.Op == ssa.OpAMD64DIVW || v.Op == ssa.OpAMD64MODQ ||
-			v.Op == ssa.OpAMD64MODL || v.Op == ssa.OpAMD64MODW {
-
-			var c *obj.Prog
-			switch v.Op {
-			case ssa.OpAMD64DIVQ, ssa.OpAMD64MODQ:
-				c = gc.Prog(x86.ACMPQ)
-				j = gc.Prog(x86.AJEQ)
-				// go ahead and sign extend to save doing it later
-				gc.Prog(x86.ACQO)
-
-			case ssa.OpAMD64DIVL, ssa.OpAMD64MODL:
-				c = gc.Prog(x86.ACMPL)
-				j = gc.Prog(x86.AJEQ)
-				gc.Prog(x86.ACDQ)
-
-			case ssa.OpAMD64DIVW, ssa.OpAMD64MODW:
-				c = gc.Prog(x86.ACMPW)
-				j = gc.Prog(x86.AJEQ)
-				gc.Prog(x86.ACWD)
-			}
-			c.From.Type = obj.TYPE_REG
-			c.From.Reg = x
-			c.To.Type = obj.TYPE_CONST
-			c.To.Offset = -1
-
-			j.To.Type = obj.TYPE_BRANCH
-
-		}
-
-		// for unsigned ints, we sign extend by setting DX = 0
-		// signed ints were sign extended above
-		if v.Op == ssa.OpAMD64DIVQU || v.Op == ssa.OpAMD64MODQU ||
-			v.Op == ssa.OpAMD64DIVLU || v.Op == ssa.OpAMD64MODLU ||
-			v.Op == ssa.OpAMD64DIVWU || v.Op == ssa.OpAMD64MODWU {
-			c := gc.Prog(x86.AXORQ)
-			c.From.Type = obj.TYPE_REG
-			c.From.Reg = x86.REG_DX
-			c.To.Type = obj.TYPE_REG
-			c.To.Reg = x86.REG_DX
-		}
+		// Zero extend dividend.
+		c := gc.Prog(x86.AXORL)
+		c.From.Type = obj.TYPE_REG
+		c.From.Reg = x86.REG_DX
+		c.To.Type = obj.TYPE_REG
+		c.To.Reg = x86.REG_DX

+		// Issue divide.
 		p := gc.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
-		p.From.Reg = x
+		p.From.Reg = r

-		// signed division, rest of the check for -1 case
-		if j != nil {
-			j2 := gc.Prog(obj.AJMP)
-			j2.To.Type = obj.TYPE_BRANCH
+	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
+		// Arg[0] (the dividend) is in AX.
+		// Arg[1] (the divisor) can be in any other register.
+		// Result[0] (the quotient) is in AX.
+		// Result[1] (the remainder) is in DX.
+		r := gc.SSARegNum(v.Args[1])

-			var n *obj.Prog
-			if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL ||
-				v.Op == ssa.OpAMD64DIVW {
-				// n * -1 = -n
-				n = gc.Prog(x86.ANEGQ)
-				n.To.Type = obj.TYPE_REG
-				n.To.Reg = x86.REG_AX
-			} else {
-				// n % -1 == 0
-				n = gc.Prog(x86.AXORQ)
-				n.From.Type = obj.TYPE_REG
-				n.From.Reg = x86.REG_DX
-				n.To.Type = obj.TYPE_REG
-				n.To.Reg = x86.REG_DX
-			}
-
-			j.To.Val = n
-			j2.To.Val = s.Pc()
+		// CPU faults upon signed overflow, which occurs when the most
+		// negative int is divided by -1. Handle divide by -1 as a special case.
+		var c *obj.Prog
+		switch v.Op {
+		case ssa.OpAMD64DIVQ:
+			c = gc.Prog(x86.ACMPQ)
+		case ssa.OpAMD64DIVL:
+			c = gc.Prog(x86.ACMPL)
+		case ssa.OpAMD64DIVW:
+			c = gc.Prog(x86.ACMPW)
 		}
+		c.From.Type = obj.TYPE_REG
+		c.From.Reg = r
+		c.To.Type = obj.TYPE_CONST
+		c.To.Offset = -1
+		j1 := gc.Prog(x86.AJEQ)
+		j1.To.Type = obj.TYPE_BRANCH
+
+		// Sign extend dividend.
+		switch v.Op {
+		case ssa.OpAMD64DIVQ:
+			gc.Prog(x86.ACQO)
+		case ssa.OpAMD64DIVL:
+			gc.Prog(x86.ACDQ)
+		case ssa.OpAMD64DIVW:
+			gc.Prog(x86.ACWD)
+		}
+
+		// Issue divide.
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r
+
+		// Skip over -1 fixup code.
+		j2 := gc.Prog(obj.AJMP)
+		j2.To.Type = obj.TYPE_BRANCH
+
+		// Issue -1 fixup code.
+		// n / -1 = -n
+		n1 := gc.Prog(x86.ANEGQ)
+		n1.To.Type = obj.TYPE_REG
+		n1.To.Reg = x86.REG_AX
+
+		// n % -1 == 0
+		n2 := gc.Prog(x86.AXORL)
+		n2.From.Type = obj.TYPE_REG
+		n2.From.Reg = x86.REG_DX
+		n2.To.Type = obj.TYPE_REG
+		n2.To.Reg = x86.REG_DX
+
+		// TODO(khr): issue only the -1 fixup code we need.
+		// For instance, if only the quotient is used, no point in zeroing the remainder.
+
+		j1.To.Val = n1
+		j2.To.Val = s.Pc()

 	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULW, ssa.OpAMD64HMULB,
 		ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU, ssa.OpAMD64HMULWU, ssa.OpAMD64HMULBU:
@@ -470,8 +498,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		gc.AddAux(&p.From, v)
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = gc.SSARegNum(v)
-	case ssa.OpAMD64LEAQ:
-		p := gc.Prog(x86.ALEAQ)
+	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL:
+		p := gc.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = gc.SSARegNum(v.Args[0])
 		gc.AddAux(&p.From, v)
@@ -649,10 +677,20 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
 		opregreg(v.Op.Asm(), gc.SSARegNum(v), gc.SSARegNum(v.Args[0]))
 	case ssa.OpAMD64DUFFZERO:
-		p := gc.Prog(obj.ADUFFZERO)
+		off := duffStart(v.AuxInt)
+		adj := duffAdj(v.AuxInt)
+		var p *obj.Prog
+		if adj != 0 {
+			p = gc.Prog(x86.AADDQ)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = adj
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x86.REG_DI
+		}
+		p = gc.Prog(obj.ADUFFZERO)
 		p.To.Type = obj.TYPE_ADDR
 		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
-		p.To.Offset = v.AuxInt
+		p.To.Offset = off
 	case ssa.OpAMD64MOVOconst:
 		if v.AuxInt != 0 {
 			v.Unimplementedf("MOVOconst can only do constant=0")
@@ -665,7 +703,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))
 		p.To.Offset = v.AuxInt

-	case ssa.OpCopy, ssa.OpAMD64MOVQconvert: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
+	case ssa.OpCopy, ssa.OpAMD64MOVQconvert, ssa.OpAMD64MOVLconvert: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
 		if v.Type.IsMemory() {
 			return
 		}
@@ -714,27 +752,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Name = obj.NAME_AUTO
 		}
 	case ssa.OpPhi:
-		// just check to make sure regalloc and stackalloc did it right
-		if v.Type.IsMemory() {
-			return
-		}
-		f := v.Block.Func
-		loc := f.RegAlloc[v.ID]
-		for _, a := range v.Args {
-			if aloc := f.RegAlloc[a.ID]; aloc != loc { // TODO: .Equal() instead?
-				v.Fatalf("phi arg at different location than phi: %v @ %v, but arg %v @ %v\n%s\n", v, loc, a, aloc, v.Block.Func)
-			}
-		}
+		gc.CheckLoweredPhi(v)
 	case ssa.OpInitMem:
 		// memory arg needs no code
 	case ssa.OpArg:
 		// input args need no code
 	case ssa.OpAMD64LoweredGetClosurePtr:
-		// Output is hardwired to DX only,
-		// and DX contains the closure pointer on
-		// closure entry, and this "instruction"
-		// is scheduled to the very beginning
-		// of the entry block.
+		// Closure pointer is DX.
+		gc.CheckLoweredGetClosurePtr(v)
 	case ssa.OpAMD64LoweredGetG:
 		r := gc.SSARegNum(v)
 		// See the comments in cmd/internal/obj/x86/obj6.go
@@ -831,6 +856,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Reg = gc.SSARegNum(v)
 	case ssa.OpSP, ssa.OpSB:
 		// nothing to do
+	case ssa.OpSelect0, ssa.OpSelect1:
+		// nothing to do
 	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
 		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
 		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
--- a/src/cmd/compile/internal/arm/peep.go
+++ b/src/cmd/compile/internal/arm/peep.go
@@ -538,7 +538,7 @@ gotit:
 	}

 	if gc.Debug['P'] != 0 {
-		fmt.Printf(" => %v\n", obj.Aconv(p.As))
+		fmt.Printf(" => %v\n", p.As)
 	}
 	return true
 }
@@ -1036,7 +1036,7 @@ func xtramodes(g *gc.Graph, r *gc.Flow, a *obj.Addr) bool {
 func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {
 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case arm.AMOVM:
--- a/src/cmd/compile/internal/arm/prog.go
+++ b/src/cmd/compile/internal/arm/prog.go
@@ -79,6 +79,8 @@ var progtable = [arm.ALAST & obj.AMask]obj.ProgInfo{
 	arm.AMULF & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | RightRdwr},
 	arm.ASUBD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | RightRdwr},
 	arm.ASUBF & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | RightRdwr},
+	arm.ANEGD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | RightRdwr},
+	arm.ANEGF & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | RightRdwr},
 	arm.ASQRTD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | RightRdwr},

 	// Conversions.
--- a/src/cmd/compile/internal/arm/ssa.go
+++ b/src/cmd/compile/internal/arm/ssa.go
--- a/src/cmd/compile/internal/arm64/galign.go
+++ b/src/cmd/compile/internal/arm64/galign.go
@@ -6,6 +6,7 @@ package arm64

 import (
 	"cmd/compile/internal/gc"
+	"cmd/compile/internal/ssa"
 	"cmd/internal/obj/arm64"
 )

@@ -61,6 +62,11 @@ func Main() {
 	gc.Thearch.Doregbits = doregbits
 	gc.Thearch.Regnames = regnames

+	gc.Thearch.SSARegToReg = ssaRegToReg
+	gc.Thearch.SSAMarkMoves = func(s *gc.SSAGenState, b *ssa.Block) {}
+	gc.Thearch.SSAGenValue = ssaGenValue
+	gc.Thearch.SSAGenBlock = ssaGenBlock
+
 	gc.Main()
 	gc.Exit(0)
 }
--- a/src/cmd/compile/internal/arm64/peep.go
+++ b/src/cmd/compile/internal/arm64/peep.go
@@ -162,7 +162,7 @@ loop1:
 			continue
 		}
 		if gc.Debug['P'] != 0 {
-			fmt.Printf("encoding $%d directly into %v in:\n%v\n%v\n", p.From.Offset, obj.Aconv(p1.As), p, p1)
+			fmt.Printf("encoding $%d directly into %v in:\n%v\n%v\n", p.From.Offset, p1.As, p, p1)
 		}
 		p1.From.Type = obj.TYPE_CONST
 		p1.From = p.From
@@ -423,7 +423,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case obj.ANOP, /* read p->from, write p->to */
--- a/src/cmd/compile/internal/arm64/prog.go
+++ b/src/cmd/compile/internal/arm64/prog.go
@@ -44,24 +44,37 @@ var progtable = [arm64.ALAST & obj.AMask]obj.ProgInfo{
 	// Integer
 	arm64.AADD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASUB & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.ANEG & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ANEG & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite}, // why RegRead? revisit once the old backend gone
 	arm64.AAND & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AORR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AEOR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ABIC & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AMVN & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite},
 	arm64.AMUL & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AMULW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASMULL & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AUMULL & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.ASMULH & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.AUMULH & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ASMULH & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUMULH & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASDIV & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AUDIV & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ASDIVW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUDIVW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AREM & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUREM & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AREMW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUREMW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ALSL & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ALSR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AASR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ACMP & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead},
+	arm64.ACMPW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead},
 	arm64.AADC & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.UseCarry},
 	arm64.AROR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ARORW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AADDS & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.SetCarry},
+	arm64.ACSET & obj.AMask:  {Flags: gc.SizeQ | gc.RightWrite},
+	arm64.ACSEL & obj.AMask:  {Flags: gc.SizeQ | gc.RegRead | gc.RightWrite},

 	// Floating point.
 	arm64.AFADDD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -0,0 +1,865 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package arm64
+
+import (
+	"math"
+
+	"cmd/compile/internal/gc"
+	"cmd/compile/internal/ssa"
+	"cmd/internal/obj"
+	"cmd/internal/obj/arm64"
+)
+
+var ssaRegToReg = []int16{
+	arm64.REG_R0,
+	arm64.REG_R1,
+	arm64.REG_R2,
+	arm64.REG_R3,
+	arm64.REG_R4,
+	arm64.REG_R5,
+	arm64.REG_R6,
+	arm64.REG_R7,
+	arm64.REG_R8,
+	arm64.REG_R9,
+	arm64.REG_R10,
+	arm64.REG_R11,
+	arm64.REG_R12,
+	arm64.REG_R13,
+	arm64.REG_R14,
+	arm64.REG_R15,
+	arm64.REG_R16,
+	arm64.REG_R17,
+	arm64.REG_R18, // platform register, not used
+	arm64.REG_R19,
+	arm64.REG_R20,
+	arm64.REG_R21,
+	arm64.REG_R22,
+	arm64.REG_R23,
+	arm64.REG_R24,
+	arm64.REG_R25,
+	arm64.REG_R26,
+	// R27 = REGTMP not used in regalloc
+	arm64.REGG,    // R28
+	arm64.REG_R29, // frame pointer, not used
+	// R30 = REGLINK not used in regalloc
+	arm64.REGSP, // R31
+
+	arm64.REG_F0,
+	arm64.REG_F1,
+	arm64.REG_F2,
+	arm64.REG_F3,
+	arm64.REG_F4,
+	arm64.REG_F5,
+	arm64.REG_F6,
+	arm64.REG_F7,
+	arm64.REG_F8,
+	arm64.REG_F9,
+	arm64.REG_F10,
+	arm64.REG_F11,
+	arm64.REG_F12,
+	arm64.REG_F13,
+	arm64.REG_F14,
+	arm64.REG_F15,
+	arm64.REG_F16,
+	arm64.REG_F17,
+	arm64.REG_F18,
+	arm64.REG_F19,
+	arm64.REG_F20,
+	arm64.REG_F21,
+	arm64.REG_F22,
+	arm64.REG_F23,
+	arm64.REG_F24,
+	arm64.REG_F25,
+	arm64.REG_F26,
+	arm64.REG_F27,
+	arm64.REG_F28,
+	arm64.REG_F29,
+	arm64.REG_F30,
+	arm64.REG_F31,
+
+	arm64.REG_NZCV, // flag
+	0,              // SB isn't a real register.  We fill an Addr.Reg field with 0 in this case.
+}
+
+// Smallest possible faulting page at address zero,
+// see ../../../../runtime/mheap.go:/minPhysPageSize
+const minZeroPage = 4096
+
+// loadByType returns the load instruction of the given type.
+func loadByType(t ssa.Type) obj.As {
+	if t.IsFloat() {
+		switch t.Size() {
+		case 4:
+			return arm64.AFMOVS
+		case 8:
+			return arm64.AFMOVD
+		}
+	} else {
+		switch t.Size() {
+		case 1:
+			if t.IsSigned() {
+				return arm64.AMOVB
+			} else {
+				return arm64.AMOVBU
+			}
+		case 2:
+			if t.IsSigned() {
+				return arm64.AMOVH
+			} else {
+				return arm64.AMOVHU
+			}
+		case 4:
+			if t.IsSigned() {
+				return arm64.AMOVW
+			} else {
+				return arm64.AMOVWU
+			}
+		case 8:
+			return arm64.AMOVD
+		}
+	}
+	panic("bad load type")
+}
+
+// storeByType returns the store instruction of the given type.
+func storeByType(t ssa.Type) obj.As {
+	if t.IsFloat() {
+		switch t.Size() {
+		case 4:
+			return arm64.AFMOVS
+		case 8:
+			return arm64.AFMOVD
+		}
+	} else {
+		switch t.Size() {
+		case 1:
+			return arm64.AMOVB
+		case 2:
+			return arm64.AMOVH
+		case 4:
+			return arm64.AMOVW
+		case 8:
+			return arm64.AMOVD
+		}
+	}
+	panic("bad store type")
+}
+
+// makeshift encodes a register shifted by a constant, used as an Offset in Prog
+func makeshift(reg int16, typ int64, s int64) int64 {
+	return int64(reg&31)<<16 | typ | (s&63)<<10
+}
+
+// genshift generates a Prog for r = r0 op (r1 shifted by s)
+func genshift(as obj.As, r0, r1, r int16, typ int64, s int64) *obj.Prog {
+	p := gc.Prog(as)
+	p.From.Type = obj.TYPE_SHIFT
+	p.From.Offset = makeshift(r1, typ, s)
+	p.Reg = r0
+	if r != 0 {
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+	}
+	return p
+}
+
+func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
+	s.SetLineno(v.Line)
+	switch v.Op {
+	case ssa.OpInitMem:
+		// memory arg needs no code
+	case ssa.OpArg:
+		// input args need no code
+	case ssa.OpSP, ssa.OpSB, ssa.OpGetG:
+		// nothing to do
+	case ssa.OpCopy, ssa.OpARM64MOVDconvert, ssa.OpARM64MOVDreg:
+		if v.Type.IsMemory() {
+			return
+		}
+		x := gc.SSARegNum(v.Args[0])
+		y := gc.SSARegNum(v)
+		if x == y {
+			return
+		}
+		as := arm64.AMOVD
+		if v.Type.IsFloat() {
+			switch v.Type.Size() {
+			case 4:
+				as = arm64.AFMOVS
+			case 8:
+				as = arm64.AFMOVD
+			default:
+				panic("bad float size")
+			}
+		}
+		p := gc.Prog(as)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = x
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = y
+	case ssa.OpARM64MOVDnop:
+		if gc.SSARegNum(v) != gc.SSARegNum(v.Args[0]) {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		// nothing to do
+	case ssa.OpLoadReg:
+		if v.Type.IsFlags() {
+			v.Unimplementedf("load flags not implemented: %v", v.LongString())
+			return
+		}
+		p := gc.Prog(loadByType(v.Type))
+		n, off := gc.AutoVar(v.Args[0])
+		p.From.Type = obj.TYPE_MEM
+		p.From.Node = n
+		p.From.Sym = gc.Linksym(n.Sym)
+		p.From.Offset = off
+		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
+			p.From.Name = obj.NAME_PARAM
+			p.From.Offset += n.Xoffset
+		} else {
+			p.From.Name = obj.NAME_AUTO
+		}
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpPhi:
+		gc.CheckLoweredPhi(v)
+	case ssa.OpStoreReg:
+		if v.Type.IsFlags() {
+			v.Unimplementedf("store flags not implemented: %v", v.LongString())
+			return
+		}
+		p := gc.Prog(storeByType(v.Type))
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		n, off := gc.AutoVar(v)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Node = n
+		p.To.Sym = gc.Linksym(n.Sym)
+		p.To.Offset = off
+		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
+			p.To.Name = obj.NAME_PARAM
+			p.To.Offset += n.Xoffset
+		} else {
+			p.To.Name = obj.NAME_AUTO
+		}
+	case ssa.OpARM64ADD,
+		ssa.OpARM64SUB,
+		ssa.OpARM64AND,
+		ssa.OpARM64OR,
+		ssa.OpARM64XOR,
+		ssa.OpARM64BIC,
+		ssa.OpARM64MUL,
+		ssa.OpARM64MULW,
+		ssa.OpARM64MULH,
+		ssa.OpARM64UMULH,
+		ssa.OpARM64MULL,
+		ssa.OpARM64UMULL,
+		ssa.OpARM64DIV,
+		ssa.OpARM64UDIV,
+		ssa.OpARM64DIVW,
+		ssa.OpARM64UDIVW,
+		ssa.OpARM64MOD,
+		ssa.OpARM64UMOD,
+		ssa.OpARM64MODW,
+		ssa.OpARM64UMODW,
+		ssa.OpARM64SLL,
+		ssa.OpARM64SRL,
+		ssa.OpARM64SRA,
+		ssa.OpARM64FADDS,
+		ssa.OpARM64FADDD,
+		ssa.OpARM64FSUBS,
+		ssa.OpARM64FSUBD,
+		ssa.OpARM64FMULS,
+		ssa.OpARM64FMULD,
+		ssa.OpARM64FDIVS,
+		ssa.OpARM64FDIVD:
+		r := gc.SSARegNum(v)
+		r1 := gc.SSARegNum(v.Args[0])
+		r2 := gc.SSARegNum(v.Args[1])
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r2
+		p.Reg = r1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+	case ssa.OpARM64ADDconst,
+		ssa.OpARM64SUBconst,
+		ssa.OpARM64ANDconst,
+		ssa.OpARM64ORconst,
+		ssa.OpARM64XORconst,
+		ssa.OpARM64BICconst,
+		ssa.OpARM64SLLconst,
+		ssa.OpARM64SRLconst,
+		ssa.OpARM64SRAconst,
+		ssa.OpARM64RORconst,
+		ssa.OpARM64RORWconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64ADDshiftLL,
+		ssa.OpARM64SUBshiftLL,
+		ssa.OpARM64ANDshiftLL,
+		ssa.OpARM64ORshiftLL,
+		ssa.OpARM64XORshiftLL,
+		ssa.OpARM64BICshiftLL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm64.SHIFT_LL, v.AuxInt)
+	case ssa.OpARM64ADDshiftRL,
+		ssa.OpARM64SUBshiftRL,
+		ssa.OpARM64ANDshiftRL,
+		ssa.OpARM64ORshiftRL,
+		ssa.OpARM64XORshiftRL,
+		ssa.OpARM64BICshiftRL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm64.SHIFT_LR, v.AuxInt)
+	case ssa.OpARM64ADDshiftRA,
+		ssa.OpARM64SUBshiftRA,
+		ssa.OpARM64ANDshiftRA,
+		ssa.OpARM64ORshiftRA,
+		ssa.OpARM64XORshiftRA,
+		ssa.OpARM64BICshiftRA:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm64.SHIFT_AR, v.AuxInt)
+	case ssa.OpARM64MOVDconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64FMOVSconst,
+		ssa.OpARM64FMOVDconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_FCONST
+		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64CMP,
+		ssa.OpARM64CMPW,
+		ssa.OpARM64CMN,
+		ssa.OpARM64CMNW,
+		ssa.OpARM64FCMPS,
+		ssa.OpARM64FCMPD:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[1])
+		p.Reg = gc.SSARegNum(v.Args[0])
+	case ssa.OpARM64CMPconst,
+		ssa.OpARM64CMPWconst,
+		ssa.OpARM64CMNconst,
+		ssa.OpARM64CMNWconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.Reg = gc.SSARegNum(v.Args[0])
+	case ssa.OpARM64CMPshiftLL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), 0, arm64.SHIFT_LL, v.AuxInt)
+	case ssa.OpARM64CMPshiftRL:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), 0, arm64.SHIFT_LR, v.AuxInt)
+	case ssa.OpARM64CMPshiftRA:
+		genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), 0, arm64.SHIFT_AR, v.AuxInt)
+	case ssa.OpARM64MOVDaddr:
+		p := gc.Prog(arm64.AMOVD)
+		p.From.Type = obj.TYPE_ADDR
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+
+		var wantreg string
+		// MOVD $sym+off(base), R
+		// the assembler expands it as the following:
+		// - base is SP: add constant offset to SP (R13)
+		//               when constant is large, tmp register (R11) may be used
+		// - base is SB: load external address from constant pool (use relocation)
+		switch v.Aux.(type) {
+		default:
+			v.Fatalf("aux is of unknown type %T", v.Aux)
+		case *ssa.ExternSymbol:
+			wantreg = "SB"
+			gc.AddAux(&p.From, v)
+		case *ssa.ArgSymbol, *ssa.AutoSymbol:
+			wantreg = "SP"
+			gc.AddAux(&p.From, v)
+		case nil:
+			// No sym, just MOVD $off(SP), R
+			wantreg = "SP"
+			p.From.Reg = arm64.REGSP
+			p.From.Offset = v.AuxInt
+		}
+		if reg := gc.SSAReg(v.Args[0]); reg.Name() != wantreg {
+			v.Fatalf("bad reg %s for symbol type %T, want %s", reg.Name(), v.Aux, wantreg)
+		}
+	case ssa.OpARM64MOVBload,
+		ssa.OpARM64MOVBUload,
+		ssa.OpARM64MOVHload,
+		ssa.OpARM64MOVHUload,
+		ssa.OpARM64MOVWload,
+		ssa.OpARM64MOVWUload,
+		ssa.OpARM64MOVDload,
+		ssa.OpARM64FMOVSload,
+		ssa.OpARM64FMOVDload:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64MOVBstore,
+		ssa.OpARM64MOVHstore,
+		ssa.OpARM64MOVWstore,
+		ssa.OpARM64MOVDstore,
+		ssa.OpARM64FMOVSstore,
+		ssa.OpARM64FMOVDstore:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[1])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.To, v)
+	case ssa.OpARM64MOVBstorezero,
+		ssa.OpARM64MOVHstorezero,
+		ssa.OpARM64MOVWstorezero,
+		ssa.OpARM64MOVDstorezero:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = arm64.REGZERO
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.To, v)
+	case ssa.OpARM64MOVBreg,
+		ssa.OpARM64MOVBUreg,
+		ssa.OpARM64MOVHreg,
+		ssa.OpARM64MOVHUreg,
+		ssa.OpARM64MOVWreg,
+		ssa.OpARM64MOVWUreg:
+		a := v.Args[0]
+		for a.Op == ssa.OpCopy || a.Op == ssa.OpARM64MOVDreg {
+			a = a.Args[0]
+		}
+		if a.Op == ssa.OpLoadReg {
+			t := a.Type
+			switch {
+			case v.Op == ssa.OpARM64MOVBreg && t.Size() == 1 && t.IsSigned(),
+				v.Op == ssa.OpARM64MOVBUreg && t.Size() == 1 && !t.IsSigned(),
+				v.Op == ssa.OpARM64MOVHreg && t.Size() == 2 && t.IsSigned(),
+				v.Op == ssa.OpARM64MOVHUreg && t.Size() == 2 && !t.IsSigned(),
+				v.Op == ssa.OpARM64MOVWreg && t.Size() == 4 && t.IsSigned(),
+				v.Op == ssa.OpARM64MOVWUreg && t.Size() == 4 && !t.IsSigned():
+				// arg is a proper-typed load, already zero/sign-extended, don't extend again
+				if gc.SSARegNum(v) == gc.SSARegNum(v.Args[0]) {
+					return
+				}
+				p := gc.Prog(arm64.AMOVD)
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = gc.SSARegNum(v.Args[0])
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = gc.SSARegNum(v)
+				return
+			default:
+			}
+		}
+		fallthrough
+	case ssa.OpARM64MVN,
+		ssa.OpARM64NEG,
+		ssa.OpARM64FNEGS,
+		ssa.OpARM64FNEGD,
+		ssa.OpARM64FSQRTD,
+		ssa.OpARM64FCVTZSSW,
+		ssa.OpARM64FCVTZSDW,
+		ssa.OpARM64FCVTZUSW,
+		ssa.OpARM64FCVTZUDW,
+		ssa.OpARM64FCVTZSS,
+		ssa.OpARM64FCVTZSD,
+		ssa.OpARM64FCVTZUS,
+		ssa.OpARM64FCVTZUD,
+		ssa.OpARM64SCVTFWS,
+		ssa.OpARM64SCVTFWD,
+		ssa.OpARM64SCVTFS,
+		ssa.OpARM64SCVTFD,
+		ssa.OpARM64UCVTFWS,
+		ssa.OpARM64UCVTFWD,
+		ssa.OpARM64UCVTFS,
+		ssa.OpARM64UCVTFD,
+		ssa.OpARM64FCVTSD,
+		ssa.OpARM64FCVTDS:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64CSELULT,
+		ssa.OpARM64CSELULT0:
+		r1 := int16(arm64.REGZERO)
+		if v.Op == ssa.OpARM64CSELULT {
+			r1 = gc.SSARegNum(v.Args[1])
+		}
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG // assembler encodes conditional bits in Reg
+		p.From.Reg = arm64.COND_LO
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.From3 = &obj.Addr{Type: obj.TYPE_REG, Reg: r1}
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64DUFFZERO:
+		// runtime.duffzero expects start address - 8 in R16
+		p := gc.Prog(arm64.ASUB)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 8
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REG_R16
+		p = gc.Prog(obj.ADUFFZERO)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
+		p.To.Offset = v.AuxInt
+	case ssa.OpARM64LoweredZero:
+		// MOVD.P	ZR, 8(R16)
+		// CMP	Rarg1, R16
+		// BLE	-2(PC)
+		// arg1 is the address of the last element to zero
+		// auxint is alignment
+		var sz int64
+		var mov obj.As
+		switch {
+		case v.AuxInt%8 == 0:
+			sz = 8
+			mov = arm64.AMOVD
+		case v.AuxInt%4 == 0:
+			sz = 4
+			mov = arm64.AMOVW
+		case v.AuxInt%2 == 0:
+			sz = 2
+			mov = arm64.AMOVH
+		default:
+			sz = 1
+			mov = arm64.AMOVB
+		}
+		p := gc.Prog(mov)
+		p.Scond = arm64.C_XPOST
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = arm64.REGZERO
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = arm64.REG_R16
+		p.To.Offset = sz
+		p2 := gc.Prog(arm64.ACMP)
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = gc.SSARegNum(v.Args[1])
+		p2.Reg = arm64.REG_R16
+		p3 := gc.Prog(arm64.ABLE)
+		p3.To.Type = obj.TYPE_BRANCH
+		gc.Patch(p3, p)
+	case ssa.OpARM64LoweredMove:
+		// MOVD.P	8(R16), Rtmp
+		// MOVD.P	Rtmp, 8(R17)
+		// CMP	Rarg2, R16
+		// BLE	-3(PC)
+		// arg2 is the address of the last element of src
+		// auxint is alignment
+		var sz int64
+		var mov obj.As
+		switch {
+		case v.AuxInt%8 == 0:
+			sz = 8
+			mov = arm64.AMOVD
+		case v.AuxInt%4 == 0:
+			sz = 4
+			mov = arm64.AMOVW
+		case v.AuxInt%2 == 0:
+			sz = 2
+			mov = arm64.AMOVH
+		default:
+			sz = 1
+			mov = arm64.AMOVB
+		}
+		p := gc.Prog(mov)
+		p.Scond = arm64.C_XPOST
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = arm64.REG_R16
+		p.From.Offset = sz
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REGTMP
+		p2 := gc.Prog(mov)
+		p2.Scond = arm64.C_XPOST
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = arm64.REGTMP
+		p2.To.Type = obj.TYPE_MEM
+		p2.To.Reg = arm64.REG_R17
+		p2.To.Offset = sz
+		p3 := gc.Prog(arm64.ACMP)
+		p3.From.Type = obj.TYPE_REG
+		p3.From.Reg = gc.SSARegNum(v.Args[2])
+		p3.Reg = arm64.REG_R16
+		p4 := gc.Prog(arm64.ABLE)
+		p4.To.Type = obj.TYPE_BRANCH
+		gc.Patch(p4, p)
+	case ssa.OpARM64CALLstatic:
+		if v.Aux.(*gc.Sym) == gc.Deferreturn.Sym {
+			// Deferred calls will appear to be returning to
+			// the CALL deferreturn(SB) that we are about to emit.
+			// However, the stack trace code will show the line
+			// of the instruction byte before the return PC.
+			// To avoid that being an unrelated instruction,
+			// insert an actual hardware NOP that will have the right line number.
+			// This is different from obj.ANOP, which is a virtual no-op
+			// that doesn't make it into the instruction stream.
+			ginsnop()
+		}
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym))
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLclosure:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Offset = 0
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLdefer:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Deferproc.Sym)
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLgo:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Newproc.Sym)
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64CALLinter:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Offset = 0
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.OpARM64LoweredNilCheck:
+		// Optimization - if the subsequent block has a load or store
+		// at the same address, we don't need to issue this instruction.
+		mem := v.Args[1]
+		for _, w := range v.Block.Succs[0].Block().Values {
+			if w.Op == ssa.OpPhi {
+				if w.Type.IsMemory() {
+					mem = w
+				}
+				continue
+			}
+			if len(w.Args) == 0 || !w.Args[len(w.Args)-1].Type.IsMemory() {
+				// w doesn't use a store - can't be a memory op.
+				continue
+			}
+			if w.Args[len(w.Args)-1] != mem {
+				v.Fatalf("wrong store after nilcheck v=%s w=%s", v, w)
+			}
+			switch w.Op {
+			case ssa.OpARM64MOVBload, ssa.OpARM64MOVBUload, ssa.OpARM64MOVHload, ssa.OpARM64MOVHUload,
+				ssa.OpARM64MOVWload, ssa.OpARM64MOVWUload, ssa.OpARM64MOVDload,
+				ssa.OpARM64FMOVSload, ssa.OpARM64FMOVDload,
+				ssa.OpARM64MOVBstore, ssa.OpARM64MOVHstore, ssa.OpARM64MOVWstore, ssa.OpARM64MOVDstore,
+				ssa.OpARM64FMOVSstore, ssa.OpARM64FMOVDstore:
+				// arg0 is ptr, auxint is offset
+				if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			case ssa.OpARM64DUFFZERO, ssa.OpARM64LoweredZero:
+				// arg0 is ptr
+				if w.Args[0] == v.Args[0] {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			case ssa.OpARM64LoweredMove:
+				// arg0 is dst ptr, arg1 is src ptr
+				if w.Args[0] == v.Args[0] || w.Args[1] == v.Args[0] {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			default:
+			}
+			if w.Type.IsMemory() {
+				if w.Op == ssa.OpVarDef || w.Op == ssa.OpVarKill || w.Op == ssa.OpVarLive {
+					// these ops are OK
+					mem = w
+					continue
+				}
+				// We can't delay the nil check past the next store.
+				break
+			}
+		}
+		// Issue a load which will fault if arg is nil.
+		p := gc.Prog(arm64.AMOVB)
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REGTMP
+		if gc.Debug_checknil != 0 && v.Line > 1 { // v.Line==1 in generated wrappers
+			gc.Warnl(v.Line, "generated nil check")
+		}
+	case ssa.OpVarDef:
+		gc.Gvardef(v.Aux.(*gc.Node))
+	case ssa.OpVarKill:
+		gc.Gvarkill(v.Aux.(*gc.Node))
+	case ssa.OpVarLive:
+		gc.Gvarlive(v.Aux.(*gc.Node))
+	case ssa.OpKeepAlive:
+		if !v.Args[0].Type.IsPtrShaped() {
+			v.Fatalf("keeping non-pointer alive %v", v.Args[0])
+		}
+		n, off := gc.AutoVar(v.Args[0])
+		if n == nil {
+			v.Fatalf("KeepLive with non-spilled value %s %s", v, v.Args[0])
+		}
+		if off != 0 {
+			v.Fatalf("KeepLive with non-zero offset spill location %s:%d", n, off)
+		}
+		gc.Gvarlive(n)
+	case ssa.OpARM64Equal,
+		ssa.OpARM64NotEqual,
+		ssa.OpARM64LessThan,
+		ssa.OpARM64LessEqual,
+		ssa.OpARM64GreaterThan,
+		ssa.OpARM64GreaterEqual,
+		ssa.OpARM64LessThanU,
+		ssa.OpARM64LessEqualU,
+		ssa.OpARM64GreaterThanU,
+		ssa.OpARM64GreaterEqualU:
+		// generate boolean values using CSET
+		p := gc.Prog(arm64.ACSET)
+		p.From.Type = obj.TYPE_REG // assembler encodes conditional bits in Reg
+		p.From.Reg = condBits[v.Op]
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpSelect0, ssa.OpSelect1:
+		// nothing to do
+	case ssa.OpARM64LoweredGetClosurePtr:
+		// Closure pointer is R26 (arm64.REGCTXT).
+		gc.CheckLoweredGetClosurePtr(v)
+	case ssa.OpARM64FlagEQ,
+		ssa.OpARM64FlagLT_ULT,
+		ssa.OpARM64FlagLT_UGT,
+		ssa.OpARM64FlagGT_ULT,
+		ssa.OpARM64FlagGT_UGT:
+		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
+	case ssa.OpARM64InvertFlags:
+		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
+	default:
+		v.Unimplementedf("genValue not implemented: %s", v.LongString())
+	}
+}
+
+var condBits = map[ssa.Op]int16{
+	ssa.OpARM64Equal:         arm64.COND_EQ,
+	ssa.OpARM64NotEqual:      arm64.COND_NE,
+	ssa.OpARM64LessThan:      arm64.COND_LT,
+	ssa.OpARM64LessThanU:     arm64.COND_LO,
+	ssa.OpARM64LessEqual:     arm64.COND_LE,
+	ssa.OpARM64LessEqualU:    arm64.COND_LS,
+	ssa.OpARM64GreaterThan:   arm64.COND_GT,
+	ssa.OpARM64GreaterThanU:  arm64.COND_HI,
+	ssa.OpARM64GreaterEqual:  arm64.COND_GE,
+	ssa.OpARM64GreaterEqualU: arm64.COND_HS,
+}
+
+var blockJump = map[ssa.BlockKind]struct {
+	asm, invasm obj.As
+}{
+	ssa.BlockARM64EQ:  {arm64.ABEQ, arm64.ABNE},
+	ssa.BlockARM64NE:  {arm64.ABNE, arm64.ABEQ},
+	ssa.BlockARM64LT:  {arm64.ABLT, arm64.ABGE},
+	ssa.BlockARM64GE:  {arm64.ABGE, arm64.ABLT},
+	ssa.BlockARM64LE:  {arm64.ABLE, arm64.ABGT},
+	ssa.BlockARM64GT:  {arm64.ABGT, arm64.ABLE},
+	ssa.BlockARM64ULT: {arm64.ABLO, arm64.ABHS},
+	ssa.BlockARM64UGE: {arm64.ABHS, arm64.ABLO},
+	ssa.BlockARM64UGT: {arm64.ABHI, arm64.ABLS},
+	ssa.BlockARM64ULE: {arm64.ABLS, arm64.ABHI},
+}
+
+func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
+	s.SetLineno(b.Line)
+
+	switch b.Kind {
+	case ssa.BlockPlain, ssa.BlockCall, ssa.BlockCheck:
+		if b.Succs[0].Block() != next {
+			p := gc.Prog(obj.AJMP)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		}
+
+	case ssa.BlockDefer:
+		// defer returns in R0:
+		// 0 if we should continue executing
+		// 1 if we should jump to deferreturn call
+		p := gc.Prog(arm64.ACMP)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 0
+		p.Reg = arm64.REG_R0
+		p = gc.Prog(arm64.ABNE)
+		p.To.Type = obj.TYPE_BRANCH
+		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
+		if b.Succs[0].Block() != next {
+			p := gc.Prog(obj.AJMP)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		}
+
+	case ssa.BlockExit:
+		gc.Prog(obj.AUNDEF) // tell plive.go that we never reach here
+
+	case ssa.BlockRet:
+		gc.Prog(obj.ARET)
+
+	case ssa.BlockRetJmp:
+		p := gc.Prog(obj.ARET)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(b.Aux.(*gc.Sym))
+
+	case ssa.BlockARM64EQ, ssa.BlockARM64NE,
+		ssa.BlockARM64LT, ssa.BlockARM64GE,
+		ssa.BlockARM64LE, ssa.BlockARM64GT,
+		ssa.BlockARM64ULT, ssa.BlockARM64UGT,
+		ssa.BlockARM64ULE, ssa.BlockARM64UGE:
+		jmp := blockJump[b.Kind]
+		var p *obj.Prog
+		switch next {
+		case b.Succs[0].Block():
+			p = gc.Prog(jmp.invasm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
+		case b.Succs[1].Block():
+			p = gc.Prog(jmp.asm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		default:
+			p = gc.Prog(jmp.asm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+			q := gc.Prog(obj.AJMP)
+			q.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: q, B: b.Succs[1].Block()})
+		}
+
+	default:
+		b.Unimplementedf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
+	}
+}
--- a/src/cmd/compile/internal/big/arith_test.go
+++ b/src/cmd/compile/internal/big/arith_test.go
@@ -5,6 +5,7 @@
 package big

 import (
+	"fmt"
 	"math/rand"
 	"testing"
 )
@@ -118,28 +119,22 @@ func rndV(n int) []Word {
 	return v
 }

-func benchmarkFunVV(b *testing.B, f funVV, n int) {
-	x := rndV(n)
-	y := rndV(n)
-	z := make([]Word, n)
-	b.SetBytes(int64(n * _W))
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		f(z, x, y)
+var benchSizes = []int{1, 2, 3, 4, 5, 1e1, 1e2, 1e3, 1e4, 1e5}
+
+func BenchmarkAddVV(b *testing.B) {
+	for _, n := range benchSizes {
+		x := rndV(n)
+		y := rndV(n)
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _W))
+			for i := 0; i < b.N; i++ {
+				addVV(z, x, y)
+			}
+		})
 	}
 }

-func BenchmarkAddVV_1(b *testing.B)   { benchmarkFunVV(b, addVV, 1) }
-func BenchmarkAddVV_2(b *testing.B)   { benchmarkFunVV(b, addVV, 2) }
-func BenchmarkAddVV_3(b *testing.B)   { benchmarkFunVV(b, addVV, 3) }
-func BenchmarkAddVV_4(b *testing.B)   { benchmarkFunVV(b, addVV, 4) }
-func BenchmarkAddVV_5(b *testing.B)   { benchmarkFunVV(b, addVV, 5) }
-func BenchmarkAddVV_1e1(b *testing.B) { benchmarkFunVV(b, addVV, 1e1) }
-func BenchmarkAddVV_1e2(b *testing.B) { benchmarkFunVV(b, addVV, 1e2) }
-func BenchmarkAddVV_1e3(b *testing.B) { benchmarkFunVV(b, addVV, 1e3) }
-func BenchmarkAddVV_1e4(b *testing.B) { benchmarkFunVV(b, addVV, 1e4) }
-func BenchmarkAddVV_1e5(b *testing.B) { benchmarkFunVV(b, addVV, 1e5) }
-
 type funVW func(z, x []Word, y Word) (c Word)
 type argVW struct {
 	z, x nat
@@ -236,28 +231,20 @@ func TestFunVW(t *testing.T) {
 	}
 }

-func benchmarkFunVW(b *testing.B, f funVW, n int) {
-	x := rndV(n)
-	y := rndW()
-	z := make([]Word, n)
-	b.SetBytes(int64(n * _S))
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		f(z, x, y)
+func BenchmarkAddVW(b *testing.B) {
+	for _, n := range benchSizes {
+		x := rndV(n)
+		y := rndW()
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _S))
+			for i := 0; i < b.N; i++ {
+				addVW(z, x, y)
+			}
+		})
 	}
 }

-func BenchmarkAddVW_1(b *testing.B)   { benchmarkFunVW(b, addVW, 1) }
-func BenchmarkAddVW_2(b *testing.B)   { benchmarkFunVW(b, addVW, 2) }
-func BenchmarkAddVW_3(b *testing.B)   { benchmarkFunVW(b, addVW, 3) }
-func BenchmarkAddVW_4(b *testing.B)   { benchmarkFunVW(b, addVW, 4) }
-func BenchmarkAddVW_5(b *testing.B)   { benchmarkFunVW(b, addVW, 5) }
-func BenchmarkAddVW_1e1(b *testing.B) { benchmarkFunVW(b, addVW, 1e1) }
-func BenchmarkAddVW_1e2(b *testing.B) { benchmarkFunVW(b, addVW, 1e2) }
-func BenchmarkAddVW_1e3(b *testing.B) { benchmarkFunVW(b, addVW, 1e3) }
-func BenchmarkAddVW_1e4(b *testing.B) { benchmarkFunVW(b, addVW, 1e4) }
-func BenchmarkAddVW_1e5(b *testing.B) { benchmarkFunVW(b, addVW, 1e5) }
-
 type funVWW func(z, x []Word, y, r Word) (c Word)
 type argVWW struct {
 	z, x nat
@@ -382,28 +369,20 @@ func TestMulAddWWW(t *testing.T) {
 	}
 }

-func benchmarkAddMulVVW(b *testing.B, n int) {
-	x := rndV(n)
-	y := rndW()
-	z := make([]Word, n)
-	b.SetBytes(int64(n * _W))
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		addMulVVW(z, x, y)
+func BenchmarkAddMulVVW(b *testing.B) {
+	for _, n := range benchSizes {
+		x := rndV(n)
+		y := rndW()
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _W))
+			for i := 0; i < b.N; i++ {
+				addMulVVW(z, x, y)
+			}
+		})
 	}
 }

-func BenchmarkAddMulVVW_1(b *testing.B)   { benchmarkAddMulVVW(b, 1) }
-func BenchmarkAddMulVVW_2(b *testing.B)   { benchmarkAddMulVVW(b, 2) }
-func BenchmarkAddMulVVW_3(b *testing.B)   { benchmarkAddMulVVW(b, 3) }
-func BenchmarkAddMulVVW_4(b *testing.B)   { benchmarkAddMulVVW(b, 4) }
-func BenchmarkAddMulVVW_5(b *testing.B)   { benchmarkAddMulVVW(b, 5) }
-func BenchmarkAddMulVVW_1e1(b *testing.B) { benchmarkAddMulVVW(b, 1e1) }
-func BenchmarkAddMulVVW_1e2(b *testing.B) { benchmarkAddMulVVW(b, 1e2) }
-func BenchmarkAddMulVVW_1e3(b *testing.B) { benchmarkAddMulVVW(b, 1e3) }
-func BenchmarkAddMulVVW_1e4(b *testing.B) { benchmarkAddMulVVW(b, 1e4) }
-func BenchmarkAddMulVVW_1e5(b *testing.B) { benchmarkAddMulVVW(b, 1e5) }
-
 func testWordBitLen(t *testing.T, fname string, f func(Word) int) {
 	for i := 0; i <= _W; i++ {
 		x := Word(1) << uint(i-1) // i == 0 => x == 0
@@ -420,23 +399,15 @@ func TestWordBitLen(t *testing.T) {
 }

 // runs b.N iterations of bitLen called on a Word containing (1 << nbits)-1.
-func benchmarkBitLenN(b *testing.B, nbits uint) {
-	testword := Word((uint64(1) << nbits) - 1)
-	for i := 0; i < b.N; i++ {
-		bitLen(testword)
+func BenchmarkBitLen(b *testing.B) {
+	// Individual bitLen tests. Numbers chosen to examine both sides
+	// of powers-of-two boundaries.
+	for _, nbits := range []uint{0, 1, 2, 3, 4, 5, 8, 9, 16, 17, 31} {
+		testword := Word((uint64(1) << nbits) - 1)
+		b.Run(fmt.Sprint(nbits), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				bitLen(testword)
+			}
+		})
 	}
 }
-
-// Individual bitLen tests. Numbers chosen to examine both sides
-// of powers-of-two boundaries.
-func BenchmarkBitLen0(b *testing.B)  { benchmarkBitLenN(b, 0) }
-func BenchmarkBitLen1(b *testing.B)  { benchmarkBitLenN(b, 1) }
-func BenchmarkBitLen2(b *testing.B)  { benchmarkBitLenN(b, 2) }
-func BenchmarkBitLen3(b *testing.B)  { benchmarkBitLenN(b, 3) }
-func BenchmarkBitLen4(b *testing.B)  { benchmarkBitLenN(b, 4) }
-func BenchmarkBitLen5(b *testing.B)  { benchmarkBitLenN(b, 5) }
-func BenchmarkBitLen8(b *testing.B)  { benchmarkBitLenN(b, 8) }
-func BenchmarkBitLen9(b *testing.B)  { benchmarkBitLenN(b, 9) }
-func BenchmarkBitLen16(b *testing.B) { benchmarkBitLenN(b, 16) }
-func BenchmarkBitLen17(b *testing.B) { benchmarkBitLenN(b, 17) }
-func BenchmarkBitLen31(b *testing.B) { benchmarkBitLenN(b, 31) }
--- a/src/cmd/compile/internal/big/decimal_test.go
+++ b/src/cmd/compile/internal/big/decimal_test.go
@@ -105,12 +105,14 @@ func TestDecimalRounding(t *testing.T) {
 	}
 }

+var sink string
+
 func BenchmarkDecimalConversion(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		for shift := -100; shift <= +100; shift++ {
 			var d decimal
 			d.init(natOne, shift)
-			d.String()
+			sink = d.String()
 		}
 	}
 }
--- a/src/cmd/compile/internal/big/doc.go
+++ b/src/cmd/compile/internal/big/doc.go
@@ -0,0 +1,99 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package big implements arbitrary-precision arithmetic (big numbers).
+The following numeric types are supported:
+
+	Int    signed integers
+	Rat    rational numbers
+	Float  floating-point numbers
+
+The zero value for an Int, Rat, or Float correspond to 0. Thus, new
+values can be declared in the usual ways and denote 0 without further
+initialization:
+
+	var x Int        // &x is an *Int of value 0
+	var r = &Rat{}   // r is a *Rat of value 0
+	y := new(Float)  // y is a *Float of value 0
+
+Alternatively, new values can be allocated and initialized with factory
+functions of the form:
+
+	func NewT(v V) *T
+
+For instance, NewInt(x) returns an *Int set to the value of the int64
+argument x, NewRat(a, b) returns a *Rat set to the fraction a/b where
+a and b are int64 values, and NewFloat(f) returns a *Float initialized
+to the float64 argument f. More flexibility is provided with explicit
+setters, for instance:
+
+	var z1 Int
+	z1.SetUint64(123)                 // z1 := 123
+	z2 := new(Rat).SetFloat64(1.2)    // z2 := 6/5
+	z3 := new(Float).SetInt(z1)       // z3 := 123.0
+
+Setters, numeric operations and predicates are represented as methods of
+the form:
+
+	func (z *T) SetV(v V) *T          // z = v
+	func (z *T) Unary(x *T) *T        // z = unary x
+	func (z *T) Binary(x, y *T) *T    // z = x binary y
+	func (x *T) Pred() P              // p = pred(x)
+
+with T one of Int, Rat, or Float. For unary and binary operations, the
+result is the receiver (usually named z in that case; see below); if it
+is one of the operands x or y it may be safely overwritten (and its memory
+reused).
+
+Arithmetic expressions are typically written as a sequence of individual
+method calls, with each call corresponding to an operation. The receiver
+denotes the result and the method arguments are the operation's operands.
+For instance, given three *Int values a, b and c, the invocation
+
+	c.Add(a, b)
+
+computes the sum a + b and stores the result in c, overwriting whatever
+value was held in c before. Unless specified otherwise, operations permit
+aliasing of parameters, so it is perfectly ok to write
+
+	sum.Add(sum, x)
+
+to accumulate values x in a sum.
+
+(By always passing in a result value via the receiver, memory use can be
+much better controlled. Instead of having to allocate new memory for each
+result, an operation can reuse the space allocated for the result value,
+and overwrite that value with the new result in the process.)
+
+Notational convention: Incoming method parameters (including the receiver)
+are named consistently in the API to clarify their use. Incoming operands
+are usually named x, y, a, b, and so on, but never z. A parameter specifying
+the result is named z (typically the receiver).
+
+For instance, the arguments for (*Int).Add are named x and y, and because
+the receiver specifies the result destination, it is called z:
+
+	func (z *Int) Add(x, y *Int) *Int
+
+Methods of this form typically return the incoming receiver as well, to
+enable simple call chaining.
+
+Methods which don't require a result value to be passed in (for instance,
+Int.Sign), simply return the result. In this case, the receiver is typically
+the first operand, named x:
+
+	func (x *Int) Sign() int
+
+Various methods support conversions between strings and corresponding
+numeric values, and vice versa: *Int, *Rat, and *Float values implement
+the Stringer interface for a (default) string representation of the value,
+but also provide SetString methods to initialize a value from a string in
+a variety of supported formats (see the respective SetString documentation).
+
+Finally, *Int, *Rat, and *Float satisfy the fmt package's Scanner interface
+for scanning and (except for *Rat) the Formatter interface for formatted
+printing.
+*/
+package big
--- a/src/cmd/compile/internal/big/float.go
+++ b/src/cmd/compile/internal/big/float.go
@@ -1008,9 +1008,9 @@ func (x *Float) Float64() (float64, Accuracy) {
 		if r.form == inf || e > emax {
 			// overflow
 			if x.neg {
-				return float64(math.Inf(-1)), Below
+				return math.Inf(-1), Below
 			}
-			return float64(math.Inf(+1)), Above
+			return math.Inf(+1), Above
 		}
 		// e <= emax

--- a/src/cmd/compile/internal/big/floatconv_test.go
+++ b/src/cmd/compile/internal/big/floatconv_test.go
@@ -290,6 +290,11 @@ func TestFloat64Text(t *testing.T) {
 		// Issue 2625.
 		{383260575764816448, 'f', 0, "383260575764816448"},
 		{383260575764816448, 'g', -1, "3.8326057576481645e+17"},
+
+		// Issue 15918.
+		{1, 'f', -10, "1"},
+		{1, 'f', -11, "1"},
+		{1, 'f', -12, "1"},
 	} {
 		// The test cases are from the strconv package which tests float64 values.
 		// When formatting values with prec = -1 (shortest representation),
--- a/src/cmd/compile/internal/big/floatexample_test.go
+++ b/src/cmd/compile/internal/big/floatexample_test.go
@@ -11,7 +11,7 @@ import (
 )

 func ExampleFloat_Add() {
-	// Operating on numbers of different precision.
+	// Operate on numbers of different precision.
 	var x, y, z big.Float
 	x.SetInt64(1000)          // x is automatically set to 64bit precision
 	y.SetFloat64(2.718281828) // y is automatically set to 53bit precision
@@ -26,8 +26,8 @@ func ExampleFloat_Add() {
 	// z = 1002.718282 (0x.faadf854p+10, prec = 32, acc = Below)
 }

-func Example_Shift() {
-	// Implementing Float "shift" by modifying the (binary) exponents directly.
+func ExampleFloat_shift() {
+	// Implement Float "shift" by modifying the (binary) exponents directly.
 	for s := -5; s <= 5; s++ {
 		x := big.NewFloat(0.5)
 		x.SetMantExp(x, x.MantExp(nil)+s) // shift x by s
--- a/src/cmd/compile/internal/big/floatmarsh.go
+++ b/src/cmd/compile/internal/big/floatmarsh.go
@@ -6,7 +6,94 @@

 package big

-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+)
+
+// Gob codec version. Permits backward-compatible changes to the encoding.
+const floatGobVersion byte = 1
+
+// GobEncode implements the gob.GobEncoder interface.
+// The Float value and all its attributes (precision,
+// rounding mode, accuracy) are marshalled.
+func (x *Float) GobEncode() ([]byte, error) {
+	if x == nil {
+		return nil, nil
+	}
+
+	// determine max. space (bytes) required for encoding
+	sz := 1 + 1 + 4 // version + mode|acc|form|neg (3+2+2+1bit) + prec
+	n := 0          // number of mantissa words
+	if x.form == finite {
+		// add space for mantissa and exponent
+		n = int((x.prec + (_W - 1)) / _W) // required mantissa length in words for given precision
+		// actual mantissa slice could be shorter (trailing 0's) or longer (unused bits):
+		// - if shorter, only encode the words present
+		// - if longer, cut off unused words when encoding in bytes
+		//   (in practice, this should never happen since rounding
+		//   takes care of it, but be safe and do it always)
+		if len(x.mant) < n {
+			n = len(x.mant)
+		}
+		// len(x.mant) >= n
+		sz += 4 + n*_S // exp + mant
+	}
+	buf := make([]byte, sz)
+
+	buf[0] = floatGobVersion
+	b := byte(x.mode&7)<<5 | byte((x.acc+1)&3)<<3 | byte(x.form&3)<<1
+	if x.neg {
+		b |= 1
+	}
+	buf[1] = b
+	binary.BigEndian.PutUint32(buf[2:], x.prec)
+
+	if x.form == finite {
+		binary.BigEndian.PutUint32(buf[6:], uint32(x.exp))
+		x.mant[len(x.mant)-n:].bytes(buf[10:]) // cut off unused trailing words
+	}
+
+	return buf, nil
+}
+
+// GobDecode implements the gob.GobDecoder interface.
+// The result is rounded per the precision and rounding mode of
+// z unless z's precision is 0, in which case z is set exactly
+// to the decoded value.
+func (z *Float) GobDecode(buf []byte) error {
+	if len(buf) == 0 {
+		// Other side sent a nil or default value.
+		*z = Float{}
+		return nil
+	}
+
+	if buf[0] != floatGobVersion {
+		return fmt.Errorf("Float.GobDecode: encoding version %d not supported", buf[0])
+	}
+
+	oldPrec := z.prec
+	oldMode := z.mode
+
+	b := buf[1]
+	z.mode = RoundingMode((b >> 5) & 7)
+	z.acc = Accuracy((b>>3)&3) - 1
+	z.form = form((b >> 1) & 3)
+	z.neg = b&1 != 0
+	z.prec = binary.BigEndian.Uint32(buf[2:])
+
+	if z.form == finite {
+		z.exp = int32(binary.BigEndian.Uint32(buf[6:]))
+		z.mant = z.mant.setBytes(buf[10:])
+	}
+
+	if oldPrec != 0 {
+		z.mode = oldMode
+		z.SetPrec(uint(oldPrec))
+	}
+
+	return nil
+}

 // MarshalText implements the encoding.TextMarshaler interface.
 // Only the Float value is marshaled (in full precision), other
--- a/src/cmd/compile/internal/big/floatmarsh_test.go
+++ b/src/cmd/compile/internal/big/floatmarsh_test.go
@@ -5,7 +5,10 @@
 package big

 import (
+	"bytes"
+	"encoding/gob"
 	"encoding/json"
+	"io"
 	"testing"
 )

@@ -23,6 +26,85 @@ var floatVals = []string{
 	"Inf",
 }

+func TestFloatGobEncoding(t *testing.T) {
+	var medium bytes.Buffer
+	enc := gob.NewEncoder(&medium)
+	dec := gob.NewDecoder(&medium)
+	for _, test := range floatVals {
+		for _, sign := range []string{"", "+", "-"} {
+			for _, prec := range []uint{0, 1, 2, 10, 53, 64, 100, 1000} {
+				for _, mode := range []RoundingMode{ToNearestEven, ToNearestAway, ToZero, AwayFromZero, ToNegativeInf, ToPositiveInf} {
+					medium.Reset() // empty buffer for each test case (in case of failures)
+					x := sign + test
+
+					var tx Float
+					_, _, err := tx.SetPrec(prec).SetMode(mode).Parse(x, 0)
+					if err != nil {
+						t.Errorf("parsing of %s (%dbits, %v) failed (invalid test case): %v", x, prec, mode, err)
+						continue
+					}
+
+					// If tx was set to prec == 0, tx.Parse(x, 0) assumes precision 64. Correct it.
+					if prec == 0 {
+						tx.SetPrec(0)
+					}
+
+					if err := enc.Encode(&tx); err != nil {
+						t.Errorf("encoding of %v (%dbits, %v) failed: %v", &tx, prec, mode, err)
+						continue
+					}
+
+					var rx Float
+					if err := dec.Decode(&rx); err != nil {
+						t.Errorf("decoding of %v (%dbits, %v) failed: %v", &tx, prec, mode, err)
+						continue
+					}
+
+					if rx.Cmp(&tx) != 0 {
+						t.Errorf("transmission of %s failed: got %s want %s", x, rx.String(), tx.String())
+						continue
+					}
+
+					if rx.Prec() != prec {
+						t.Errorf("transmission of %s's prec failed: got %d want %d", x, rx.Prec(), prec)
+					}
+
+					if rx.Mode() != mode {
+						t.Errorf("transmission of %s's mode failed: got %s want %s", x, rx.Mode(), mode)
+					}
+
+					if rx.Acc() != tx.Acc() {
+						t.Errorf("transmission of %s's accuracy failed: got %s want %s", x, rx.Acc(), tx.Acc())
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestFloatCorruptGob(t *testing.T) {
+	var buf bytes.Buffer
+	tx := NewFloat(4 / 3).SetPrec(1000).SetMode(ToPositiveInf)
+	if err := gob.NewEncoder(&buf).Encode(tx); err != nil {
+		t.Fatal(err)
+	}
+	b := buf.Bytes()
+
+	var rx Float
+	if err := gob.NewDecoder(bytes.NewReader(b)).Decode(&rx); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := gob.NewDecoder(bytes.NewReader(b[:10])).Decode(&rx); err != io.ErrUnexpectedEOF {
+		t.Errorf("got %v want EOF", err)
+	}
+
+	b[1] = 0
+	if err := gob.NewDecoder(bytes.NewReader(b)).Decode(&rx); err == nil {
+		t.Fatal("got nil want version error")
+	}
+}
+
 func TestFloatJSONEncoding(t *testing.T) {
 	for _, test := range floatVals {
 		for _, sign := range []string{"", "+", "-"} {
--- a/src/cmd/compile/internal/big/ftoa.go
+++ b/src/cmd/compile/internal/big/ftoa.go
@@ -41,8 +41,11 @@ import (
 // x.Prec() mantissa bits.
 // The prec value is ignored for the 'b' or 'p' format.
 func (x *Float) Text(format byte, prec int) string {
-	const extra = 10 // TODO(gri) determine a good/better value here
-	return string(x.Append(make([]byte, 0, prec+extra), format, prec))
+	cap := 10 // TODO(gri) determine a good/better value here
+	if prec > 0 {
+		cap += prec
+	}
+	return string(x.Append(make([]byte, 0, cap), format, prec))
 }

 // String formats x like x.Text('g', 10).
--- a/src/cmd/compile/internal/big/gcd_test.go
+++ b/src/cmd/compile/internal/big/gcd_test.go
@@ -20,13 +20,27 @@ func randInt(r *rand.Rand, size uint) *Int {
 }

 func runGCD(b *testing.B, aSize, bSize uint) {
+	b.Run("WithoutXY", func(b *testing.B) {
+		runGCDExt(b, aSize, bSize, false)
+	})
+	b.Run("WithXY", func(b *testing.B) {
+		runGCDExt(b, aSize, bSize, true)
+	})
+}
+
+func runGCDExt(b *testing.B, aSize, bSize uint, calcXY bool) {
 	b.StopTimer()
 	var r = rand.New(rand.NewSource(1234))
 	aa := randInt(r, aSize)
 	bb := randInt(r, bSize)
+	var x, y *Int
+	if calcXY {
+		x = new(Int)
+		y = new(Int)
+	}
 	b.StartTimer()
 	for i := 0; i < b.N; i++ {
-		new(Int).GCD(nil, nil, aa, bb)
+		new(Int).GCD(x, y, aa, bb)
 	}
 }

--- a/src/cmd/compile/internal/big/int.go
+++ b/src/cmd/compile/internal/big/int.go
@@ -459,11 +459,11 @@ func (z *Int) GCD(x, y, a, b *Int) *Int {
 	q := new(Int)
 	temp := new(Int)

+	r := new(Int)
 	for len(B.abs) > 0 {
-		r := new(Int)
 		q, r = q.QuoRem(A, B, r)

-		A, B = B, r
+		A, B, r = B, r, A

 		temp.Set(X)
 		X.Mul(X, q)
--- a/src/cmd/compile/internal/big/nat.go
+++ b/src/cmd/compile/internal/big/nat.go
@@ -8,7 +8,10 @@

 package big

-import "math/rand"
+import (
+	"math/rand"
+	"sync"
+)

 // An unsigned integer x of the form
 //
@@ -539,6 +542,21 @@ func (z nat) div(z2, u, v nat) (q, r nat) {
 	return
 }

+// getNat returns a nat of len n. The contents may not be zero.
+func getNat(n int) nat {
+	var z nat
+	if v := natPool.Get(); v != nil {
+		z = v.(nat)
+	}
+	return z.make(n)
+}
+
+func putNat(x nat) {
+	natPool.Put(x)
+}
+
+var natPool sync.Pool
+
 // q = (uIn-r)/v, with 0 <= r < y
 // Uses z as storage for q, and u as storage for r if possible.
 // See Knuth, Volume 2, section 4.3.1, Algorithm D.
@@ -557,7 +575,7 @@ func (z nat) divLarge(u, uIn, v nat) (q, r nat) {
 	}
 	q = z.make(m + 1)

-	qhatv := make(nat, n+1)
+	qhatv := getNat(n + 1)
 	if alias(u, uIn) || alias(u, v) {
 		u = nil // u is an alias for uIn or v - cannot reuse
 	}
@@ -565,10 +583,11 @@ func (z nat) divLarge(u, uIn, v nat) (q, r nat) {
 	u.clear() // TODO(gri) no need to clear if we allocated a new u

 	// D1.
+	var v1 nat
 	shift := nlz(v[n-1])
 	if shift > 0 {
 		// do not modify v, it may be used by another goroutine simultaneously
-		v1 := make(nat, n)
+		v1 = getNat(n)
 		shlVU(v1, v, shift)
 		v = v1
 	}
@@ -609,6 +628,10 @@ func (z nat) divLarge(u, uIn, v nat) (q, r nat) {

 		q[j] = qhat
 	}
+	if v1 != nil {
+		putNat(v1)
+	}
+	putNat(qhatv)

 	q = q.norm()
 	shrVU(u, u, shift)
--- a/src/cmd/compile/internal/big/nat_test.go
+++ b/src/cmd/compile/internal/big/nat_test.go
@@ -5,6 +5,7 @@
 package big

 import (
+	"fmt"
 	"runtime"
 	"strings"
 	"testing"
@@ -509,24 +510,20 @@ func TestExpNN(t *testing.T) {
 	}
 }

-func ExpHelper(b *testing.B, x, y Word) {
-	var z nat
-	for i := 0; i < b.N; i++ {
-		z.expWW(x, y)
+func BenchmarkExp3Power(b *testing.B) {
+	const x = 3
+	for _, y := range []Word{
+		0x10, 0x40, 0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x40000, 0x100000, 0x400000,
+	} {
+		b.Run(fmt.Sprintf("%#x", y), func(b *testing.B) {
+			var z nat
+			for i := 0; i < b.N; i++ {
+				z.expWW(x, y)
+			}
+		})
 	}
 }

-func BenchmarkExp3Power0x10(b *testing.B)     { ExpHelper(b, 3, 0x10) }
-func BenchmarkExp3Power0x40(b *testing.B)     { ExpHelper(b, 3, 0x40) }
-func BenchmarkExp3Power0x100(b *testing.B)    { ExpHelper(b, 3, 0x100) }
-func BenchmarkExp3Power0x400(b *testing.B)    { ExpHelper(b, 3, 0x400) }
-func BenchmarkExp3Power0x1000(b *testing.B)   { ExpHelper(b, 3, 0x1000) }
-func BenchmarkExp3Power0x4000(b *testing.B)   { ExpHelper(b, 3, 0x4000) }
-func BenchmarkExp3Power0x10000(b *testing.B)  { ExpHelper(b, 3, 0x10000) }
-func BenchmarkExp3Power0x40000(b *testing.B)  { ExpHelper(b, 3, 0x40000) }
-func BenchmarkExp3Power0x100000(b *testing.B) { ExpHelper(b, 3, 0x100000) }
-func BenchmarkExp3Power0x400000(b *testing.B) { ExpHelper(b, 3, 0x400000) }
-
 func fibo(n int) nat {
 	switch n {
 	case 0:
--- a/src/cmd/compile/internal/big/natconv.go
+++ b/src/cmd/compile/internal/big/natconv.go
@@ -302,7 +302,7 @@ func (x nat) itoa(neg bool, base int) []byte {
 		}

 	} else {
-		bb, ndigits := maxPow(Word(b))
+		bb, ndigits := maxPow(b)

 		// construct table of successive squares of bb*leafSize to use in subdivisions
 		// result (table != nil) <=> (len(x) > leafSize > 0)
@@ -391,7 +391,7 @@ func (q nat) convertWords(s []byte, b Word, ndigits int, bb Word, table []diviso
 				// this appears to be faster for BenchmarkString10000Base10
 				// and smaller strings (but a bit slower for larger ones)
 				t := r / 10
-				s[i] = '0' + byte(r-t<<3-t-t) // TODO(gri) replace w/ t*10 once compiler produces better code
+				s[i] = '0' + byte(r-t*10)
 				r = t
 			}
 		}
--- a/src/cmd/compile/internal/big/natconv_test.go
+++ b/src/cmd/compile/internal/big/natconv_test.go
@@ -6,6 +6,7 @@ package big

 import (
 	"bytes"
+	"fmt"
 	"io"
 	"strings"
 	"testing"
@@ -273,101 +274,57 @@ func BenchmarkStringPiParallel(b *testing.B) {
 	})
 }

-func BenchmarkScan10Base2(b *testing.B)     { ScanHelper(b, 2, 10, 10) }
-func BenchmarkScan100Base2(b *testing.B)    { ScanHelper(b, 2, 10, 100) }
-func BenchmarkScan1000Base2(b *testing.B)   { ScanHelper(b, 2, 10, 1000) }
-func BenchmarkScan10000Base2(b *testing.B)  { ScanHelper(b, 2, 10, 10000) }
-func BenchmarkScan100000Base2(b *testing.B) { ScanHelper(b, 2, 10, 100000) }
+func BenchmarkScan(b *testing.B) {
+	const x = 10
+	for _, base := range []int{2, 8, 10, 16} {
+		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
+			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
+				b.StopTimer()
+				var z nat
+				z = z.expWW(x, y)

-func BenchmarkScan10Base8(b *testing.B)     { ScanHelper(b, 8, 10, 10) }
-func BenchmarkScan100Base8(b *testing.B)    { ScanHelper(b, 8, 10, 100) }
-func BenchmarkScan1000Base8(b *testing.B)   { ScanHelper(b, 8, 10, 1000) }
-func BenchmarkScan10000Base8(b *testing.B)  { ScanHelper(b, 8, 10, 10000) }
-func BenchmarkScan100000Base8(b *testing.B) { ScanHelper(b, 8, 10, 100000) }
+				s := z.utoa(base)
+				if t := itoa(z, base); !bytes.Equal(s, t) {
+					b.Fatalf("scanning: got %s; want %s", s, t)
+				}
+				b.StartTimer()

-func BenchmarkScan10Base10(b *testing.B)     { ScanHelper(b, 10, 10, 10) }
-func BenchmarkScan100Base10(b *testing.B)    { ScanHelper(b, 10, 10, 100) }
-func BenchmarkScan1000Base10(b *testing.B)   { ScanHelper(b, 10, 10, 1000) }
-func BenchmarkScan10000Base10(b *testing.B)  { ScanHelper(b, 10, 10, 10000) }
-func BenchmarkScan100000Base10(b *testing.B) { ScanHelper(b, 10, 10, 100000) }
-
-func BenchmarkScan10Base16(b *testing.B)     { ScanHelper(b, 16, 10, 10) }
-func BenchmarkScan100Base16(b *testing.B)    { ScanHelper(b, 16, 10, 100) }
-func BenchmarkScan1000Base16(b *testing.B)   { ScanHelper(b, 16, 10, 1000) }
-func BenchmarkScan10000Base16(b *testing.B)  { ScanHelper(b, 16, 10, 10000) }
-func BenchmarkScan100000Base16(b *testing.B) { ScanHelper(b, 16, 10, 100000) }
-
-func ScanHelper(b *testing.B, base int, x, y Word) {
-	b.StopTimer()
-	var z nat
-	z = z.expWW(x, y)
-
-	s := z.utoa(base)
-	if t := itoa(z, base); !bytes.Equal(s, t) {
-		b.Fatalf("scanning: got %s; want %s", s, t)
-	}
-	b.StartTimer()
-
-	for i := 0; i < b.N; i++ {
-		z.scan(bytes.NewReader(s), base, false)
+				for i := 0; i < b.N; i++ {
+					z.scan(bytes.NewReader(s), base, false)
+				}
+			})
+		}
 	}
 }

-func BenchmarkString10Base2(b *testing.B)     { StringHelper(b, 2, 10, 10) }
-func BenchmarkString100Base2(b *testing.B)    { StringHelper(b, 2, 10, 100) }
-func BenchmarkString1000Base2(b *testing.B)   { StringHelper(b, 2, 10, 1000) }
-func BenchmarkString10000Base2(b *testing.B)  { StringHelper(b, 2, 10, 10000) }
-func BenchmarkString100000Base2(b *testing.B) { StringHelper(b, 2, 10, 100000) }
+func BenchmarkString(b *testing.B) {
+	const x = 10
+	for _, base := range []int{2, 8, 10, 16} {
+		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
+			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
+				b.StopTimer()
+				var z nat
+				z = z.expWW(x, y)
+				z.utoa(base) // warm divisor cache
+				b.StartTimer()

-func BenchmarkString10Base8(b *testing.B)     { StringHelper(b, 8, 10, 10) }
-func BenchmarkString100Base8(b *testing.B)    { StringHelper(b, 8, 10, 100) }
-func BenchmarkString1000Base8(b *testing.B)   { StringHelper(b, 8, 10, 1000) }
-func BenchmarkString10000Base8(b *testing.B)  { StringHelper(b, 8, 10, 10000) }
-func BenchmarkString100000Base8(b *testing.B) { StringHelper(b, 8, 10, 100000) }
-
-func BenchmarkString10Base10(b *testing.B)     { StringHelper(b, 10, 10, 10) }
-func BenchmarkString100Base10(b *testing.B)    { StringHelper(b, 10, 10, 100) }
-func BenchmarkString1000Base10(b *testing.B)   { StringHelper(b, 10, 10, 1000) }
-func BenchmarkString10000Base10(b *testing.B)  { StringHelper(b, 10, 10, 10000) }
-func BenchmarkString100000Base10(b *testing.B) { StringHelper(b, 10, 10, 100000) }
-
-func BenchmarkString10Base16(b *testing.B)     { StringHelper(b, 16, 10, 10) }
-func BenchmarkString100Base16(b *testing.B)    { StringHelper(b, 16, 10, 100) }
-func BenchmarkString1000Base16(b *testing.B)   { StringHelper(b, 16, 10, 1000) }
-func BenchmarkString10000Base16(b *testing.B)  { StringHelper(b, 16, 10, 10000) }
-func BenchmarkString100000Base16(b *testing.B) { StringHelper(b, 16, 10, 100000) }
-
-func StringHelper(b *testing.B, base int, x, y Word) {
-	b.StopTimer()
-	var z nat
-	z = z.expWW(x, y)
-	z.utoa(base) // warm divisor cache
-	b.StartTimer()
-
-	for i := 0; i < b.N; i++ {
-		_ = z.utoa(base)
+				for i := 0; i < b.N; i++ {
+					_ = z.utoa(base)
+				}
+			})
+		}
 	}
 }

-func BenchmarkLeafSize0(b *testing.B)  { LeafSizeHelper(b, 10, 0) } // test without splitting
-func BenchmarkLeafSize1(b *testing.B)  { LeafSizeHelper(b, 10, 1) }
-func BenchmarkLeafSize2(b *testing.B)  { LeafSizeHelper(b, 10, 2) }
-func BenchmarkLeafSize3(b *testing.B)  { LeafSizeHelper(b, 10, 3) }
-func BenchmarkLeafSize4(b *testing.B)  { LeafSizeHelper(b, 10, 4) }
-func BenchmarkLeafSize5(b *testing.B)  { LeafSizeHelper(b, 10, 5) }
-func BenchmarkLeafSize6(b *testing.B)  { LeafSizeHelper(b, 10, 6) }
-func BenchmarkLeafSize7(b *testing.B)  { LeafSizeHelper(b, 10, 7) }
-func BenchmarkLeafSize8(b *testing.B)  { LeafSizeHelper(b, 10, 8) }
-func BenchmarkLeafSize9(b *testing.B)  { LeafSizeHelper(b, 10, 9) }
-func BenchmarkLeafSize10(b *testing.B) { LeafSizeHelper(b, 10, 10) }
-func BenchmarkLeafSize11(b *testing.B) { LeafSizeHelper(b, 10, 11) }
-func BenchmarkLeafSize12(b *testing.B) { LeafSizeHelper(b, 10, 12) }
-func BenchmarkLeafSize13(b *testing.B) { LeafSizeHelper(b, 10, 13) }
-func BenchmarkLeafSize14(b *testing.B) { LeafSizeHelper(b, 10, 14) }
-func BenchmarkLeafSize15(b *testing.B) { LeafSizeHelper(b, 10, 15) }
-func BenchmarkLeafSize16(b *testing.B) { LeafSizeHelper(b, 10, 16) }
-func BenchmarkLeafSize32(b *testing.B) { LeafSizeHelper(b, 10, 32) } // try some large lengths
-func BenchmarkLeafSize64(b *testing.B) { LeafSizeHelper(b, 10, 64) }
+func BenchmarkLeafSize(b *testing.B) {
+	for n := 0; n <= 16; n++ {
+		b.Run(fmt.Sprint(n), func(b *testing.B) { LeafSizeHelper(b, 10, n) })
+	}
+	// Try some large lengths
+	for _, n := range []int{32, 64} {
+		b.Run(fmt.Sprint(n), func(b *testing.B) { LeafSizeHelper(b, 10, n) })
+	}
+}

 func LeafSizeHelper(b *testing.B, base, size int) {
 	b.StopTimer()
--- a/src/cmd/compile/internal/big/ratconv.go
+++ b/src/cmd/compile/internal/big/ratconv.go
@@ -88,6 +88,12 @@ func (z *Rat) SetString(s string) (*Rat, bool) {
 		return nil, false
 	}

+	// special-case 0 (see also issue #16176)
+	if len(z.a.abs) == 0 {
+		return z, true
+	}
+	// len(z.a.abs) > 0
+
 	// correct exponent
 	if ecorr < 0 {
 		exp += int64(ecorr)
@@ -178,7 +184,7 @@ func scanExponent(r io.ByteScanner, binExpOk bool) (exp int64, base int, err err
 			}
 			break // i > 0
 		}
-		digits = append(digits, byte(ch))
+		digits = append(digits, ch)
 	}
 	// i > 0 => we have at least one digit

--- a/src/cmd/compile/internal/big/ratconv_test.go
+++ b/src/cmd/compile/internal/big/ratconv_test.go
@@ -48,6 +48,7 @@ var setStringTests = []StringTest{
 	{"53/70893980658822810696", "53/70893980658822810696", true},
 	{"106/141787961317645621392", "53/70893980658822810696", true},
 	{"204211327800791583.81095", "4084226556015831676219/20000", true},
+	{"0e9999999999", "0", true}, // issue #16176
 	{in: "1/0"},
 }

--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -122,3 +122,38 @@ NextVar:
 	}
 	return out
 }
+
+// TestLineNumber checks to make sure the generated assembly has line numbers
+// see issue #16214
+func TestLineNumber(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+	dir, err := ioutil.TempDir("", "TestLineNumber")
+	if err != nil {
+		t.Fatalf("could not create directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	src := filepath.Join(dir, "x.go")
+	err = ioutil.WriteFile(src, []byte(issue16214src), 0644)
+	if err != nil {
+		t.Fatalf("could not write file: %v", err)
+	}
+
+	cmd := exec.Command("go", "tool", "compile", "-S", "-o", filepath.Join(dir, "out.o"), src)
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("fail to run go tool compile: %v", err)
+	}
+
+	if strings.Contains(string(out), "unknown line number") {
+		t.Errorf("line number missing in assembly:\n%s", out)
+	}
+}
+
+var issue16214src = `
+package main
+
+func Mod32(x uint32) uint32 {
+	return x % 3 // frontend rewrites it as HMUL with 2863311531, the LITERAL node has Lineno 0
+}
+`
--- a/src/cmd/compile/internal/gc/bexport.go
+++ b/src/cmd/compile/internal/gc/bexport.go
@@ -153,7 +153,13 @@ const debugFormat = false // default: false
 // TODO(gri) disable and remove once there is only one export format again
 const forceObjFileStability = true

-const exportVersion = "v0"
+// Supported export format versions.
+// TODO(gri) Make this more systematic (issue #16244).
+const (
+	exportVersion0 = "v0"
+	exportVersion1 = "v1"
+	exportVersion  = exportVersion1
+)

 // exportInlined enables the export of inlined function bodies and related
 // dependencies. The compiler should work w/o any loss of functionality with
@@ -191,6 +197,9 @@ type exporter struct {
 	written int // bytes written
 	indent  int // for p.trace
 	trace   bool
+
+	// work-around for issue #16369 only
+	nesting int // amount of "nesting" of interface types
 }

 // export writes the exportlist for localpkg to out and returns the number of bytes written.
@@ -727,6 +736,14 @@ func (p *exporter) typ(t *Type) {
 			p.paramList(sig.Params(), inlineable)
 			p.paramList(sig.Results(), inlineable)

+			// for issue #16243
+			// We make this conditional for 1.7 to avoid consistency problems
+			// with installed packages compiled with an older version.
+			// TODO(gri) Clean up after 1.7 is out (issue #16244)
+			if exportVersion == exportVersion1 {
+				p.bool(m.Nointerface)
+			}
+
 			var f *Func
 			if inlineable {
 				f = mfn.Func
@@ -776,11 +793,39 @@ func (p *exporter) typ(t *Type) {

 	case TINTER:
 		p.tag(interfaceTag)
-
 		// gc doesn't separate between embedded interfaces
 		// and methods declared explicitly with an interface
 		p.int(0) // no embedded interfaces
+
+		// Because the compiler flattens interfaces containing
+		// embedded interfaces, it is possible to create interface
+		// types that recur through an unnamed type.
+		// If trackAllTypes is disabled, such recursion is not
+		// detected, leading to a stack overflow during export
+		// (issue #16369).
+		// As a crude work-around we terminate deep recursion
+		// through interface types with an empty interface and
+		// report an error.
+		// This will catch endless recursion, but is unlikely
+		// to trigger for valid, deeply nested types given the
+		// high threshold.
+		// It would be ok to continue without reporting an error
+		// since the export format is valid. But a subsequent
+		// import would import an incorrect type. The textual
+		// exporter does not report an error but importing the
+		// resulting package will lead to a syntax error during
+		// import.
+		// TODO(gri) remove this once we have a permanent fix
+		// for the issue.
+		if p.nesting > 100 {
+			p.int(0) // 0 methods to indicate empty interface
+			yyerrorl(t.Lineno, "cannot export unnamed recursive interface")
+			break
+		}
+
+		p.nesting++
 		p.methodList(t)
+		p.nesting--

 	case TMAP:
 		p.tag(mapTag)
--- a/src/cmd/compile/internal/gc/bimport.go
+++ b/src/cmd/compile/internal/gc/bimport.go
@@ -21,8 +21,9 @@ import (
 // changes to bimport.go and bexport.go.

 type importer struct {
-	in  *bufio.Reader
-	buf []byte // reused for reading strings
+	in      *bufio.Reader
+	buf     []byte // reused for reading strings
+	version string

 	// object lists, in order of deserialization
 	strList       []string
@@ -67,8 +68,9 @@ func Import(in *bufio.Reader) {

 	// --- generic export data ---

-	if v := p.string(); v != exportVersion {
-		Fatalf("importer: unknown export data version: %s", v)
+	p.version = p.string()
+	if p.version != exportVersion0 && p.version != exportVersion1 {
+		Fatalf("importer: unknown export data version: %s", p.version)
 	}

 	// populate typList with predeclared "known" types
@@ -239,14 +241,20 @@ func (p *importer) pkg() *Pkg {
 		Fatalf("importer: package path %q for pkg index %d", path, len(p.pkgList))
 	}

+	// see importimport (export.go)
 	pkg := importpkg
 	if path != "" {
 		pkg = mkpkg(path)
 	}
 	if pkg.Name == "" {
 		pkg.Name = name
+		numImport[name]++
 	} else if pkg.Name != name {
-		Fatalf("importer: conflicting package names %s and %s for path %q", pkg.Name, name, path)
+		Yyerror("importer: conflicting package names %s and %s for path %q", pkg.Name, name, path)
+	}
+	if incannedimport == 0 && myimportpath != "" && path == myimportpath {
+		Yyerror("import %q: package depends on %q (import cycle)", importpkg.Path, path)
+		errorexit()
 	}
 	p.pkgList = append(p.pkgList, pkg)

@@ -426,10 +434,15 @@ func (p *importer) typ() *Type {
 			params := p.paramList()
 			result := p.paramList()

+			nointerface := false
+			if p.version == exportVersion1 {
+				nointerface = p.bool()
+			}
+
 			n := methodname1(newname(sym), recv[0].Right)
 			n.Type = functype(recv[0], params, result)
 			checkwidth(n.Type)
-			addmethod(sym, n.Type, tsym.Pkg, false, false)
+			addmethod(sym, n.Type, tsym.Pkg, false, nointerface)
 			p.funcList = append(p.funcList, n)
 			importlist = append(importlist, n)

--- a/src/cmd/compile/internal/gc/builtin.go
+++ b/src/cmd/compile/internal/gc/builtin.go
@@ -3,7 +3,7 @@
 package gc

 const runtimeimport = "" +
-	"cn\x00\x03v0\x01\rruntime\x00\t\x11newobject\x00\x02\x17\"\vtyp·2\x00\x00" +
+	"cn\x00\x03v1\x01\rruntime\x00\t\x11newobject\x00\x02\x17\"\vtyp·2\x00\x00" +
 	"\x01\x17:\x00\t\x13panicindex\x00\x00\x00\t\x13panicslice\x00\x00\x00\t\x15pani" +
 	"cdivide\x00\x00\x00\t\x15throwreturn\x00\x00\x00\t\x11throwinit\x00\x00\x00" +
 	"\t\x11panicwrap\x00\x05 \x00 \x00 \x00\x00\t\rgopanic\x00\x01\x1b\x00\x00\x00\x00\t\x11go" +
@@ -95,16 +95,17 @@ const runtimeimport = "" +
 	"4div\x00\x03\n\x00\n\x00\x01\n\x00\t\x11uint64div\x00\x03\x14\x00\x14\x00\x01\x14\x00\t\x0fint64" +
 	"mod\x00\x03\n\x00\n\x00\x01\n\x00\t\x11uint64mod\x00\x03\x14\x00\x14\x00\x01\x14\x00\t\x1bfloat6" +
 	"4toint64\x00\x01\x1a\x00\x01\n\x00\t\x1dfloat64touint64\x00\x01\x1a\x00\x01\x14\x00\t" +
-	"\x1bint64tofloat64\x00\x01\n\x00\x01\x1a\x00\t\x1duint64tofloat64\x00" +
-	"\x01\x14\x00\x01\x1a\x00\t\x19complex128div\x00\x04\x1e\vnum·2\x00\x00\x1e\vden·" +
-	"3\x00\x00\x02\x1e\vquo·1\x00\x00\t\x19racefuncenter\x00\x01\x16d\x00\t\x17race" +
-	"funcexit\x00\x00\x00\t\x0fraceread\x00\x01\x16d\x00\t\x11racewrite\x00\x01\x16" +
-	"d\x00\t\x19racereadrange\x00\x04\x16\raddr·1\x00d\x16\rsize·2\x00" +
-	"d\x00\t\x1bracewriterange\x00\x04\x16\x94\x03\x00d\x16\x96\x03\x00d\x00\t\x0fmsanrea" +
-	"d\x00\x04\x16\x94\x03\x00d\x16\x96\x03\x00d\x00\t\x11msanwrite\x00\x04\x16\x94\x03\x00d\x16\x96\x03\x00d\x00\v\xf4" +
-	"\x01\x02\v\x00\x01\x00\n$$\n"
+	"\x1dfloat64touint32\x00\x01\x1a\x00\x01\x12\x00\t\x1bint64tofloat64\x00" +
+	"\x01\n\x00\x01\x1a\x00\t\x1duint64tofloat64\x00\x01\x14\x00\x01\x1a\x00\t\x1duint32to" +
+	"float64\x00\x01\x12\x00\x01\x1a\x00\t\x19complex128div\x00\x04\x1e\vnum·2\x00" +
+	"\x00\x1e\vden·3\x00\x00\x02\x1e\vquo·1\x00\x00\t\x19racefuncenter\x00\x01\x16" +
+	"d\x00\t\x17racefuncexit\x00\x00\x00\t\x0fraceread\x00\x01\x16d\x00\t\x11race" +
+	"write\x00\x01\x16d\x00\t\x19racereadrange\x00\x04\x16\raddr·1\x00d\x16\r" +
+	"size·2\x00d\x00\t\x1bracewriterange\x00\x04\x16\x98\x03\x00d\x16\x9a\x03\x00d\x00\t" +
+	"\x0fmsanread\x00\x04\x16\x98\x03\x00d\x16\x9a\x03\x00d\x00\t\x11msanwrite\x00\x04\x16\x98\x03\x00d" +
+	"\x16\x9a\x03\x00d\x00\v\xf8\x01\x02\v\x00\x01\x00\n$$\n"

 const unsafeimport = "" +
-	"cn\x00\x03v0\x01\vunsafe\x00\x05\r\rPointer\x00\x16\x00\t\x0fOffsetof\x00\x01" +
+	"cn\x00\x03v1\x01\vunsafe\x00\x05\r\rPointer\x00\x16\x00\t\x0fOffsetof\x00\x01" +
 	":\x00\x01\x16\x00\t\vSizeof\x00\x01:\x00\x01\x16\x00\t\rAlignof\x00\x01:\x00\x01\x16\x00\v\b\x00\v" +
 	"\x00\x01\x00\n$$\n"
--- a/src/cmd/compile/internal/gc/builtin/runtime.go
+++ b/src/cmd/compile/internal/gc/builtin/runtime.go
@@ -150,8 +150,10 @@ func int64mod(int64, int64) int64
 func uint64mod(uint64, uint64) uint64
 func float64toint64(float64) int64
 func float64touint64(float64) uint64
+func float64touint32(float64) uint32
 func int64tofloat64(int64) float64
 func uint64tofloat64(uint64) float64
+func uint32tofloat64(uint32) float64

 func complex128div(num complex128, den complex128) (quo complex128)

--- a/src/cmd/compile/internal/gc/closure.go
+++ b/src/cmd/compile/internal/gc/closure.go
@@ -166,7 +166,7 @@ func closurename(n *Node) *Sym {
 	prefix := ""
 	if n.Func.Outerfunc == nil {
 		// Global closure.
-		outer = "glob"
+		outer = "glob."

 		prefix = "func"
 		closurename_closgen++
--- a/src/cmd/compile/internal/gc/const.go
+++ b/src/cmd/compile/internal/gc/const.go
@@ -61,6 +61,34 @@ func (v Val) Ctype() Ctype {
 	}
 }

+func eqval(a, b Val) bool {
+	if a.Ctype() != b.Ctype() {
+		return false
+	}
+	switch x := a.U.(type) {
+	default:
+		Fatalf("unexpected Ctype for %T", a.U)
+		panic("not reached")
+	case *NilVal:
+		return true
+	case bool:
+		y := b.U.(bool)
+		return x == y
+	case *Mpint:
+		y := b.U.(*Mpint)
+		return x.Cmp(y) == 0
+	case *Mpflt:
+		y := b.U.(*Mpflt)
+		return x.Cmp(y) == 0
+	case *Mpcplx:
+		y := b.U.(*Mpcplx)
+		return x.Real.Cmp(&y.Real) == 0 && x.Imag.Cmp(&y.Imag) == 0
+	case string:
+		y := b.U.(string)
+		return x == y
+	}
+}
+
 type NilVal struct{}

 // IntLiteral returns the Node's literal value as an integer.
--- a/src/cmd/compile/internal/gc/constFold_test.go
+++ b/src/cmd/compile/internal/gc/constFold_test.go
--- a/src/cmd/compile/internal/gc/esc.go
+++ b/src/cmd/compile/internal/gc/esc.go
@@ -1551,10 +1551,12 @@ func esccall(e *EscState, n *Node, up *Node) {
 	}

 	var src *Node
+	note := ""
 	i := 0
 	lls := ll.Slice()
 	for t, it := IterFields(fntype.Params()); i < len(lls); i++ {
 		src = lls[i]
+		note = t.Note
 		if t.Isddd && !n.Isddd {
 			// Introduce ODDDARG node to represent ... allocation.
 			src = Nod(ODDDARG, nil, nil)
@@ -1566,7 +1568,7 @@ func esccall(e *EscState, n *Node, up *Node) {
 		}

 		if haspointers(t.Type) {
-			if escassignfromtag(e, t.Note, nE.Escretval, src) == EscNone && up.Op != ODEFER && up.Op != OPROC {
+			if escassignfromtag(e, note, nE.Escretval, src) == EscNone && up.Op != ODEFER && up.Op != OPROC {
 				a := src
 				for a.Op == OCONVNOP {
 					a = a.Left
@@ -1596,14 +1598,24 @@ func esccall(e *EscState, n *Node, up *Node) {
 			// This occurs when function parameter type Isddd and n not Isddd
 			break
 		}
+
+		if note == uintptrEscapesTag {
+			escassignSinkNilWhy(e, src, src, "escaping uintptr")
+		}
+
 		t = it.Next()
 	}

+	// Store arguments into slice for ... arg.
 	for ; i < len(lls); i++ {
 		if Debug['m'] > 3 {
 			fmt.Printf("%v::esccall:: ... <- %v\n", linestr(lineno), Nconv(lls[i], FmtShort))
 		}
-		escassignNilWhy(e, src, lls[i], "arg to ...") // args to slice
+		if note == uintptrEscapesTag {
+			escassignSinkNilWhy(e, src, lls[i], "arg to uintptrescapes ...")
+		} else {
+			escassignNilWhy(e, src, lls[i], "arg to ...")
+		}
 	}
 }

@@ -1963,9 +1975,20 @@ recurse:
 // lets us take the address below to get a *string.
 var unsafeUintptrTag = "unsafe-uintptr"

+// This special tag is applied to uintptr parameters of functions
+// marked go:uintptrescapes.
+const uintptrEscapesTag = "uintptr-escapes"
+
 func esctag(e *EscState, func_ *Node) {
 	func_.Esc = EscFuncTagged

+	name := func(s *Sym, narg int) string {
+		if s != nil {
+			return s.Name
+		}
+		return fmt.Sprintf("arg#%d", narg)
+	}
+
 	// External functions are assumed unsafe,
 	// unless //go:noescape is given before the declaration.
 	if func_.Nbody.Len() == 0 {
@@ -1988,13 +2011,7 @@ func esctag(e *EscState, func_ *Node) {
 			narg++
 			if t.Type.Etype == TUINTPTR {
 				if Debug['m'] != 0 {
-					var name string
-					if t.Sym != nil {
-						name = t.Sym.Name
-					} else {
-						name = fmt.Sprintf("arg#%d", narg)
-					}
-					Warnl(func_.Lineno, "%v assuming %v is unsafe uintptr", funcSym(func_), name)
+					Warnl(func_.Lineno, "%v assuming %v is unsafe uintptr", funcSym(func_), name(t.Sym, narg))
 				}
 				t.Note = unsafeUintptrTag
 			}
@@ -2003,6 +2020,27 @@ func esctag(e *EscState, func_ *Node) {
 		return
 	}

+	if func_.Func.Pragma&UintptrEscapes != 0 {
+		narg := 0
+		for _, t := range func_.Type.Params().Fields().Slice() {
+			narg++
+			if t.Type.Etype == TUINTPTR {
+				if Debug['m'] != 0 {
+					Warnl(func_.Lineno, "%v marking %v as escaping uintptr", funcSym(func_), name(t.Sym, narg))
+				}
+				t.Note = uintptrEscapesTag
+			}
+
+			if t.Isddd && t.Type.Elem().Etype == TUINTPTR {
+				// final argument is ...uintptr.
+				if Debug['m'] != 0 {
+					Warnl(func_.Lineno, "%v marking %v as escaping ...uintptr", funcSym(func_), name(t.Sym, narg))
+				}
+				t.Note = uintptrEscapesTag
+			}
+		}
+	}
+
 	savefn := Curfn
 	Curfn = func_

@@ -2015,7 +2053,9 @@ func esctag(e *EscState, func_ *Node) {
 		case EscNone, // not touched by escflood
 			EscReturn:
 			if haspointers(ln.Type) { // don't bother tagging for scalars
-				ln.Name.Param.Field.Note = mktag(int(ln.Esc))
+				if ln.Name.Param.Field.Note != uintptrEscapesTag {
+					ln.Name.Param.Field.Note = mktag(int(ln.Esc))
+				}
 			}

 		case EscHeap, // touched by escflood, moved to heap
--- a/src/cmd/compile/internal/gc/export.go
+++ b/src/cmd/compile/internal/gc/export.go
@@ -479,6 +479,10 @@ func pkgtype(s *Sym) *Type {
 	return s.Def.Type
 }

+// numImport tracks how often a package with a given name is imported.
+// It is used to provide a better error message (by using the package
+// path to disambiguate) if a package that appears multiple times with
+// the same name appears in an error message.
 var numImport = make(map[string]int)

 func importimport(s *Sym, path string) {
--- a/src/cmd/compile/internal/gc/float_test.go
+++ b/src/cmd/compile/internal/gc/float_test.go
@@ -74,6 +74,27 @@ func cvt8(a float32) int32 {
 	return int32(a)
 }

+// make sure to cover int, uint cases (issue #16738)
+//go:noinline
+func cvt9(a float64) int {
+	return int(a)
+}
+
+//go:noinline
+func cvt10(a float64) uint {
+	return uint(a)
+}
+
+//go:noinline
+func cvt11(a float32) int {
+	return int(a)
+}
+
+//go:noinline
+func cvt12(a float32) uint {
+	return uint(a)
+}
+
 func TestFloatConvert(t *testing.T) {
 	if got := cvt1(3.5); got != 3 {
 		t.Errorf("cvt1 got %d, wanted 3", got)
@@ -99,4 +120,16 @@ func TestFloatConvert(t *testing.T) {
 	if got := cvt8(3.5); got != 3 {
 		t.Errorf("cvt8 got %d, wanted 3", got)
 	}
+	if got := cvt9(3.5); got != 3 {
+		t.Errorf("cvt9 got %d, wanted 3", got)
+	}
+	if got := cvt10(3.5); got != 3 {
+		t.Errorf("cvt10 got %d, wanted 3", got)
+	}
+	if got := cvt11(3.5); got != 3 {
+		t.Errorf("cvt11 got %d, wanted 3", got)
+	}
+	if got := cvt12(3.5); got != 3 {
+		t.Errorf("cvt12 got %d, wanted 3", got)
+	}
 }
--- a/src/cmd/compile/internal/gc/gen.go
+++ b/src/cmd/compile/internal/gc/gen.go
@@ -160,6 +160,14 @@ func moveToHeap(n *Node) {
 		if n.Class == PPARAM {
 			stackcopy.SetNotLiveAtEnd(true)
 		}
+		if n.Class == PPARAMOUT {
+			// Make sure the pointer to the heap copy is kept live throughout the function.
+			// The function could panic at any point, and then a defer could recover.
+			// Thus, we need the pointer to the heap copy always available so the
+			// post-deferreturn code can copy the return value back to the stack.
+			// See issue 16095.
+			heapaddr.setIsOutputParamHeapAddr(true)
+		}
 		n.Name.Param.Stackcopy = stackcopy

 		// Substitute the stackcopy into the function variable list so that
--- a/src/cmd/compile/internal/gc/go.go
+++ b/src/cmd/compile/internal/gc/go.go
@@ -156,8 +156,6 @@ var Debug_typeassert int

 var localpkg *Pkg // package being compiled

-var autopkg *Pkg // fake package for allocating auto variables
-
 var importpkg *Pkg // package being imported

 var itabpkg *Pkg // fake pkg for itab entries
--- a/src/cmd/compile/internal/gc/inl.go
+++ b/src/cmd/compile/internal/gc/inl.go
@@ -766,7 +766,9 @@ func mkinlcall1(n *Node, fn *Node, isddd bool) *Node {
 		ninit.Append(as)
 	}

-	retlabel := newlabel_inl()
+	retlabel := autolabel(".i")
+	retlabel.Etype = 1 // flag 'safe' for escape analysis (no backjumps)
+
 	inlgen++

 	subst := inlsubst{
@@ -876,15 +878,6 @@ func argvar(t *Type, i int) *Node {
 	return n
 }

-var newlabel_inl_label int
-
-func newlabel_inl() *Node {
-	newlabel_inl_label++
-	n := newname(LookupN(".inlret", newlabel_inl_label))
-	n.Etype = 1 // flag 'safe' for escape analysis (no backjumps)
-	return n
-}
-
 // The inlsubst type implements the actual inlining of a single
 // function call.
 type inlsubst struct {
--- a/src/cmd/compile/internal/gc/lex.go
+++ b/src/cmd/compile/internal/gc/lex.go
@@ -72,6 +72,7 @@ const (
 	Nowritebarrier           // emit compiler error instead of write barrier
 	Nowritebarrierrec        // error on write barrier in this or recursive callees
 	CgoUnsafeArgs            // treat a pointer to one arg as a pointer to them all
+	UintptrEscapes           // pointers converted to uintptr escape
 )

 type lexer struct {
@@ -930,6 +931,19 @@ func (l *lexer) getlinepragma() rune {
 			l.pragma |= Nowritebarrierrec | Nowritebarrier // implies Nowritebarrier
 		case "go:cgo_unsafe_args":
 			l.pragma |= CgoUnsafeArgs
+		case "go:uintptrescapes":
+			// For the next function declared in the file
+			// any uintptr arguments may be pointer values
+			// converted to uintptr. This directive
+			// ensures that the referenced allocated
+			// object, if any, is retained and not moved
+			// until the call completes, even though from
+			// the types alone it would appear that the
+			// object is no longer needed during the
+			// call. The conversion to uintptr must appear
+			// in the argument list.
+			// Used in syscall/dll_windows.go.
+			l.pragma |= UintptrEscapes
 		}
 		return c
 	}
--- a/src/cmd/compile/internal/gc/main.go
+++ b/src/cmd/compile/internal/gc/main.go
@@ -29,6 +29,8 @@ var (
 	goarch  string
 	goroot  string
 	buildid string
+
+	flag_newparser bool
 )

 var (
@@ -108,8 +110,6 @@ func Main() {

 	localpkg = mkpkg("")
 	localpkg.Prefix = "\"\""
-	autopkg = mkpkg("")
-	autopkg.Prefix = "\"\""

 	// pseudo-package, for scoping
 	builtinpkg = mkpkg("go.builtin")
@@ -184,6 +184,7 @@ func Main() {
 	obj.Flagcount("live", "debug liveness analysis", &debuglive)
 	obj.Flagcount("m", "print optimization decisions", &Debug['m'])
 	flag.BoolVar(&flag_msan, "msan", false, "build code compatible with C/C++ memory sanitizer")
+	flag.BoolVar(&flag_newparser, "newparser", false, "use new parser")
 	flag.BoolVar(&newexport, "newexport", true, "use new export format") // TODO(gri) remove eventually (issue 15323)
 	flag.BoolVar(&nolocalimports, "nolocalimports", false, "reject local (relative) imports")
 	flag.StringVar(&outfile, "o", "", "write output to `file`")
@@ -313,25 +314,14 @@ func Main() {
 		}

 		linehistpush(infile)
-
-		f, err := os.Open(infile)
-		if err != nil {
-			fmt.Printf("open %s: %v\n", infile, err)
-			errorexit()
-		}
-		bin := bufio.NewReader(f)
-
-		// Skip initial BOM if present.
-		if r, _, _ := bin.ReadRune(); r != BOM {
-			bin.UnreadRune()
-		}
-
 		block = 1
 		iota_ = -1000000
-
 		imported_unsafe = false
-
-		parse_file(bin)
+		if flag_newparser {
+			parseFile(infile)
+		} else {
+			oldParseFile(infile)
+		}
 		if nsyntaxerrors != 0 {
 			errorexit()
 		}
@@ -340,9 +330,7 @@ func Main() {
 		// for the line history to work, and which then has to be corrected elsewhere,
 		// just add a line here.
 		lexlineno++
-
 		linehistpop()
-		f.Close()
 	}

 	testdclstack()
--- a/src/cmd/compile/internal/gc/noder.go
+++ b/src/cmd/compile/internal/gc/noder.go
--- a/src/cmd/compile/internal/gc/order.go
+++ b/src/cmd/compile/internal/gc/order.go
@@ -373,7 +373,7 @@ func ordercall(n *Node, order *Order) {
 			if t == nil {
 				break
 			}
-			if t.Note == unsafeUintptrTag {
+			if t.Note == unsafeUintptrTag || t.Note == uintptrEscapesTag {
 				xp := n.List.Addr(i)
 				for (*xp).Op == OCONVNOP && !(*xp).Type.IsPtr() {
 					xp = &(*xp).Left
@@ -385,7 +385,11 @@ func ordercall(n *Node, order *Order) {
 					*xp = x
 				}
 			}
-			t = it.Next()
+			next := it.Next()
+			if next == nil && t.Isddd && t.Note == uintptrEscapesTag {
+				next = t
+			}
+			t = next
 		}
 	}
 }
--- a/src/cmd/compile/internal/gc/parser.go
+++ b/src/cmd/compile/internal/gc/parser.go
@@ -15,6 +15,7 @@ package gc
 import (
 	"bufio"
 	"fmt"
+	"os"
 	"strconv"
 	"strings"
 )
@@ -26,8 +27,20 @@ func parse_import(bin *bufio.Reader, indent []byte) {
 	newparser(bin, indent).import_package()
 }

-// parse_file parses a single Go source file.
-func parse_file(bin *bufio.Reader) {
+// oldParseFile parses a single Go source file.
+func oldParseFile(infile string) {
+	f, err := os.Open(infile)
+	if err != nil {
+		fmt.Printf("open %s: %v\n", infile, err)
+		errorexit()
+	}
+	defer f.Close()
+	bin := bufio.NewReader(f)
+
+	// Skip initial BOM if present.
+	if r, _, _ := bin.ReadRune(); r != BOM {
+		bin.UnreadRune()
+	}
 	newparser(bin, nil).file()
 }

@@ -2006,7 +2019,8 @@ func (p *parser) hidden_fndcl() *Node {
 		ss.Type = functype(s2[0], s6, s8)

 		checkwidth(ss.Type)
-		addmethod(s4, ss.Type, p.structpkg, false, false)
+		addmethod(s4, ss.Type, p.structpkg, false, p.pragma&Nointerface != 0)
+		p.pragma = 0
 		funchdr(ss)

 		// inl.C's inlnode in on a dotmeth node expects to find the inlineable body as
--- a/src/cmd/compile/internal/gc/plive.go
+++ b/src/cmd/compile/internal/gc/plive.go
@@ -1175,6 +1175,19 @@ func livenessepilogue(lv *Liveness) {
 	all := bvalloc(nvars)
 	ambig := bvalloc(localswords())

+	// Set ambig bit for the pointers to heap-allocated pparamout variables.
+	// These are implicitly read by post-deferreturn code and thus must be
+	// kept live throughout the function (if there is any defer that recovers).
+	if hasdefer {
+		for _, n := range lv.vars {
+			if n.IsOutputParamHeapAddr() {
+				n.Name.Needzero = true
+				xoffset := n.Xoffset + stkptrsize
+				onebitwalktype1(n.Type, &xoffset, ambig)
+			}
+		}
+	}
+
 	for _, bb := range lv.cfg {
 		// Compute avarinitany and avarinitall for entry to block.
 		// This duplicates information known during livenesssolve
--- a/src/cmd/compile/internal/gc/reflect.go
+++ b/src/cmd/compile/internal/gc/reflect.go
@@ -799,6 +799,7 @@ func typeptrdata(t *Type) int64 {
 const (
 	tflagUncommon  = 1 << 0
 	tflagExtraStar = 1 << 1
+	tflagNamed     = 1 << 2
 )

 var dcommontype_algarray *Sym
@@ -820,14 +821,10 @@ func dcommontype(s *Sym, ot int, t *Type) int {
 		algsym = dalgsym(t)
 	}

+	var sptr *Sym
 	tptr := Ptrto(t)
 	if !t.IsPtr() && (t.Sym != nil || methods(tptr) != nil) {
-		sptr := dtypesym(tptr)
-		r := obj.Addrel(Linksym(s))
-		r.Off = 0
-		r.Siz = 0
-		r.Sym = sptr.Lsym
-		r.Type = obj.R_USETYPE
+		sptr = dtypesym(tptr)
 	}

 	gcsym, useGCProg, ptrdata := dgcsym(t)
@@ -845,7 +842,7 @@ func dcommontype(s *Sym, ot int, t *Type) int {
 	//		alg           *typeAlg
 	//		gcdata        *byte
 	//		str           nameOff
-	//		_             int32
+	//		ptrToThis     typeOff
 	//	}
 	ot = duintptr(s, ot, uint64(t.Width))
 	ot = duintptr(s, ot, uint64(ptrdata))
@@ -856,6 +853,9 @@ func dcommontype(s *Sym, ot int, t *Type) int {
 	if uncommonSize(t) != 0 {
 		tflag |= tflagUncommon
 	}
+	if t.Sym != nil && t.Sym.Name != "" {
+		tflag |= tflagNamed
+	}

 	exported := false
 	p := Tconv(t, FmtLeft|FmtUnsigned)
@@ -909,8 +909,12 @@ func dcommontype(s *Sym, ot int, t *Type) int {
 	ot = dsymptr(s, ot, gcsym, 0) // gcdata

 	nsym := dname(p, "", nil, exported)
-	ot = dsymptrOffLSym(Linksym(s), ot, nsym, 0)
-	ot = duint32(s, ot, 0)
+	ot = dsymptrOffLSym(Linksym(s), ot, nsym, 0) // str
+	if sptr == nil {
+		ot = duint32(s, ot, 0)
+	} else {
+		ot = dsymptrOffLSym(Linksym(s), ot, Linksym(sptr), 0) // ptrToThis
+	}

 	return ot
 }
--- a/src/cmd/compile/internal/gc/sinit.go
+++ b/src/cmd/compile/internal/gc/sinit.go
@@ -866,7 +866,7 @@ func maplit(ctxt int, n *Node, var_ *Node, init *Nodes) {
 	nerr := nerrors

 	a := Nod(OMAKE, nil, nil)
-	a.List.Set1(typenod(n.Type))
+	a.List.Set2(typenod(n.Type), Nodintconst(int64(len(n.List.Slice()))))
 	litas(var_, a, init)

 	// count the initializers
--- a/src/cmd/compile/internal/gc/sizeof_test.go
+++ b/src/cmd/compile/internal/gc/sizeof_test.go
@@ -23,7 +23,7 @@ func TestSizeof(t *testing.T) {
 		_64bit uintptr     // size on 64bit platforms
 	}{
 		{Flow{}, 52, 88},
-		{Func{}, 92, 160},
+		{Func{}, 96, 168},
 		{Name{}, 52, 80},
 		{Node{}, 92, 144},
 		{Sym{}, 60, 112},
--- a/src/cmd/compile/internal/gc/sparselocatephifunctions.go
+++ b/src/cmd/compile/internal/gc/sparselocatephifunctions.go
@@ -153,10 +153,13 @@ func (s *state) locatePotentialPhiFunctions(fn *Node) *sparseDefState {
 					p := e.Block()
 					dm.Use(t, p)                                // always count phi pred as "use"; no-op except for loop edges, which matter.
 					x := t.stm.Find(p, ssa.AdjustAfter, helper) // Look for defs reaching or within predecessors.
+					if x == nil {                               // nil def from a predecessor means a backedge that will be visited soon.
+						continue
+					}
 					if defseen == nil {
 						defseen = x
 					}
-					if defseen != x || x == nil { // TODO: too conservative at loops, does better if x == nil -> continue
+					if defseen != x {
 						// Need to insert a phi function here because predecessors's definitions differ.
 						change = true
 						// Phi insertion is at AdjustBefore, visible with find in same block at AdjustWithin or AdjustAfter.
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -26,6 +26,9 @@ func initssa() *ssa.Config {
 	ssaExp.mustImplement = true
 	if ssaConfig == nil {
 		ssaConfig = ssa.NewConfig(Thearch.LinkArch.Name, &ssaExp, Ctxt, Debug['N'] == 0)
+		if Thearch.LinkArch.Name == "386" {
+			ssaConfig.Set387(Thearch.Use387)
+		}
 	}
 	return ssaConfig
 }
@@ -37,8 +40,8 @@ func shouldssa(fn *Node) bool {
 		if os.Getenv("SSATEST") == "" {
 			return false
 		}
+	case "amd64", "amd64p32", "arm", "386", "arm64":
 		// Generally available.
-	case "amd64":
 	}
 	if !ssaEnabled {
 		return false
@@ -384,6 +387,14 @@ func (s *state) endBlock() *ssa.Block {

 // pushLine pushes a line number on the line number stack.
 func (s *state) pushLine(line int32) {
+	if line == 0 {
+		// the frontend may emit node with line number missing,
+		// use the parent line number in this case.
+		line = s.peekLine()
+		if Debug['K'] != 0 {
+			Warn("buildssa: line 0")
+		}
+	}
 	s.line = append(s.line, line)
 }

@@ -1138,6 +1149,7 @@ var opToSSA = map[opAndType]ssa.Op{
 	opAndType{OEQ, TFUNC}:      ssa.OpEqPtr,
 	opAndType{OEQ, TMAP}:       ssa.OpEqPtr,
 	opAndType{OEQ, TCHAN}:      ssa.OpEqPtr,
+	opAndType{OEQ, TPTR32}:     ssa.OpEqPtr,
 	opAndType{OEQ, TPTR64}:     ssa.OpEqPtr,
 	opAndType{OEQ, TUINTPTR}:   ssa.OpEqPtr,
 	opAndType{OEQ, TUNSAFEPTR}: ssa.OpEqPtr,
@@ -1158,6 +1170,7 @@ var opToSSA = map[opAndType]ssa.Op{
 	opAndType{ONE, TFUNC}:      ssa.OpNeqPtr,
 	opAndType{ONE, TMAP}:       ssa.OpNeqPtr,
 	opAndType{ONE, TCHAN}:      ssa.OpNeqPtr,
+	opAndType{ONE, TPTR32}:     ssa.OpNeqPtr,
 	opAndType{ONE, TPTR64}:     ssa.OpNeqPtr,
 	opAndType{ONE, TUINTPTR}:   ssa.OpNeqPtr,
 	opAndType{ONE, TUNSAFEPTR}: ssa.OpNeqPtr,
@@ -1322,6 +1335,15 @@ var fpConvOpToSSA = map[twoTypes]twoOpsAndType{
 	twoTypes{TFLOAT32, TFLOAT64}: twoOpsAndType{ssa.OpCvt32Fto64F, ssa.OpCopy, TFLOAT64},
 }

+// this map is used only for 32-bit arch, and only includes the difference
+// on 32-bit arch, don't use int64<->float conversion for uint32
+var fpConvOpToSSA32 = map[twoTypes]twoOpsAndType{
+	twoTypes{TUINT32, TFLOAT32}: twoOpsAndType{ssa.OpCopy, ssa.OpCvt32Uto32F, TUINT32},
+	twoTypes{TUINT32, TFLOAT64}: twoOpsAndType{ssa.OpCopy, ssa.OpCvt32Uto64F, TUINT32},
+	twoTypes{TFLOAT32, TUINT32}: twoOpsAndType{ssa.OpCvt32Fto32U, ssa.OpCopy, TUINT32},
+	twoTypes{TFLOAT64, TUINT32}: twoOpsAndType{ssa.OpCvt64Fto32U, ssa.OpCopy, TUINT32},
+}
+
 var shiftOpToSSA = map[opAndTwoTypes]ssa.Op{
 	opAndTwoTypes{OLSH, TINT8, TUINT8}:   ssa.OpLsh8x8,
 	opAndTwoTypes{OLSH, TUINT8, TUINT8}:  ssa.OpLsh8x8,
@@ -1638,6 +1660,11 @@ func (s *state) expr(n *Node) *ssa.Value {

 		if ft.IsFloat() || tt.IsFloat() {
 			conv, ok := fpConvOpToSSA[twoTypes{s.concreteEtype(ft), s.concreteEtype(tt)}]
+			if s.config.IntSize == 4 && Thearch.LinkArch.Name != "amd64p32" {
+				if conv1, ok1 := fpConvOpToSSA32[twoTypes{s.concreteEtype(ft), s.concreteEtype(tt)}]; ok1 {
+					conv = conv1
+				}
+			}
 			if !ok {
 				s.Fatalf("weird float conversion %s -> %s", ft, tt)
 			}
@@ -1731,7 +1758,7 @@ func (s *state) expr(n *Node) *ssa.Value {
 			addop := ssa.OpAdd64F
 			subop := ssa.OpSub64F
 			pt := floatForComplex(n.Type) // Could be Float32 or Float64
-			wt := Types[TFLOAT64]         // Compute in Float64 to minimize cancellation error
+			wt := Types[TFLOAT64]         // Compute in Float64 to minimize cancelation error

 			areal := s.newValue1(ssa.OpComplexReal, pt, a)
 			breal := s.newValue1(ssa.OpComplexReal, pt, b)
@@ -1769,7 +1796,7 @@ func (s *state) expr(n *Node) *ssa.Value {
 			subop := ssa.OpSub64F
 			divop := ssa.OpDiv64F
 			pt := floatForComplex(n.Type) // Could be Float32 or Float64
-			wt := Types[TFLOAT64]         // Compute in Float64 to minimize cancellation error
+			wt := Types[TFLOAT64]         // Compute in Float64 to minimize cancelation error

 			areal := s.newValue1(ssa.OpComplexReal, pt, a)
 			breal := s.newValue1(ssa.OpComplexReal, pt, b)
@@ -1946,7 +1973,7 @@ func (s *state) expr(n *Node) *ssa.Value {
 		case n.Left.Type.IsString():
 			a := s.expr(n.Left)
 			i := s.expr(n.Right)
-			i = s.extendIndex(i)
+			i = s.extendIndex(i, Panicindex)
 			if !n.Bounded {
 				len := s.newValue1(ssa.OpStringLen, Types[TINT], a)
 				s.boundsCheck(i, len)
@@ -2035,13 +2062,13 @@ func (s *state) expr(n *Node) *ssa.Value {
 		var i, j, k *ssa.Value
 		low, high, max := n.SliceBounds()
 		if low != nil {
-			i = s.extendIndex(s.expr(low))
+			i = s.extendIndex(s.expr(low), panicslice)
 		}
 		if high != nil {
-			j = s.extendIndex(s.expr(high))
+			j = s.extendIndex(s.expr(high), panicslice)
 		}
 		if max != nil {
-			k = s.extendIndex(s.expr(max))
+			k = s.extendIndex(s.expr(max), panicslice)
 		}
 		p, l, c := s.slice(n.Left.Type, v, i, j, k)
 		return s.newValue3(ssa.OpSliceMake, n.Type, p, l, c)
@@ -2051,10 +2078,10 @@ func (s *state) expr(n *Node) *ssa.Value {
 		var i, j *ssa.Value
 		low, high, _ := n.SliceBounds()
 		if low != nil {
-			i = s.extendIndex(s.expr(low))
+			i = s.extendIndex(s.expr(low), panicslice)
 		}
 		if high != nil {
-			j = s.extendIndex(s.expr(high))
+			j = s.extendIndex(s.expr(high), panicslice)
 		}
 		p, l, _ := s.slice(n.Left.Type, v, i, j, nil)
 		return s.newValue2(ssa.OpStringMake, n.Type, p, l)
@@ -2238,7 +2265,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
 			if haspointers(et) {
 				s.insertWBmove(et, addr, arg.v, n.Lineno, arg.isVolatile)
 			} else {
-				s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, et.Size(), addr, arg.v, s.mem())
+				s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(et), addr, arg.v, s.mem())
 			}
 		}
 	}
@@ -2371,14 +2398,14 @@ func (s *state) assign(left *Node, right *ssa.Value, wb, deref bool, line int32,
 	if deref {
 		// Treat as a mem->mem move.
 		if right == nil {
-			s.vars[&memVar] = s.newValue2I(ssa.OpZero, ssa.TypeMem, t.Size(), addr, s.mem())
+			s.vars[&memVar] = s.newValue2I(ssa.OpZero, ssa.TypeMem, SizeAlignAuxInt(t), addr, s.mem())
 			return
 		}
 		if wb {
 			s.insertWBmove(t, addr, right, line, rightIsVolatile)
 			return
 		}
-		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, t.Size(), addr, right, s.mem())
+		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(t), addr, right, s.mem())
 		return
 	}
 	// Treat as a store.
@@ -2577,7 +2604,7 @@ func (s *state) call(n *Node, k callKind) *ssa.Value {
 			s.nilCheck(itab)
 		}
 		itabidx := fn.Xoffset + 3*int64(Widthptr) + 8 // offset of fun field in runtime.itab
-		itab = s.newValue1I(ssa.OpOffPtr, Types[TUINTPTR], itabidx, itab)
+		itab = s.newValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), itabidx, itab)
 		if k == callNormal {
 			codeptr = s.newValue2(ssa.OpLoad, Types[TUINTPTR], itab, s.mem())
 		} else {
@@ -2600,16 +2627,18 @@ func (s *state) call(n *Node, k callKind) *ssa.Value {
 		if k != callNormal {
 			argStart += int64(2 * Widthptr)
 		}
-		addr := s.entryNewValue1I(ssa.OpOffPtr, Types[TUINTPTR], argStart, s.sp)
+		addr := s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), argStart, s.sp)
 		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, int64(Widthptr), addr, rcvr, s.mem())
 	}

 	// Defer/go args
 	if k != callNormal {
 		// Write argsize and closure (args to Newproc/Deferproc).
+		argStart := Ctxt.FixedFrameSize()
 		argsize := s.constInt32(Types[TUINT32], int32(stksize))
-		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, 4, s.sp, argsize, s.mem())
-		addr := s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), int64(Widthptr), s.sp)
+		addr := s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINT32]), argStart, s.sp)
+		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, 4, addr, argsize, s.mem())
+		addr = s.entryNewValue1I(ssa.OpOffPtr, Ptrto(Types[TUINTPTR]), argStart+int64(Widthptr), s.sp)
 		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, int64(Widthptr), addr, closure, s.mem())
 		stksize += 2 * int64(Widthptr)
 	}
@@ -2754,7 +2783,7 @@ func (s *state) addr(n *Node, bounded bool) (*ssa.Value, bool) {
 		if n.Left.Type.IsSlice() {
 			a := s.expr(n.Left)
 			i := s.expr(n.Right)
-			i = s.extendIndex(i)
+			i = s.extendIndex(i, Panicindex)
 			len := s.newValue1(ssa.OpSliceLen, Types[TINT], a)
 			if !n.Bounded {
 				s.boundsCheck(i, len)
@@ -2764,7 +2793,7 @@ func (s *state) addr(n *Node, bounded bool) (*ssa.Value, bool) {
 		} else { // array
 			a, isVolatile := s.addr(n.Left, bounded)
 			i := s.expr(n.Right)
-			i = s.extendIndex(i)
+			i = s.extendIndex(i, Panicindex)
 			len := s.constInt(Types[TINT], n.Left.Type.NumElem())
 			if !n.Bounded {
 				s.boundsCheck(i, len)
@@ -2905,12 +2934,11 @@ func (s *state) nilCheck(ptr *ssa.Value) {

 // boundsCheck generates bounds checking code. Checks if 0 <= idx < len, branches to exit if not.
 // Starts a new block on return.
+// idx is already converted to full int width.
 func (s *state) boundsCheck(idx, len *ssa.Value) {
 	if Debug['B'] != 0 {
 		return
 	}
-	// TODO: convert index to full width?
-	// TODO: if index is 64-bit and we're compiling to 32-bit, check that high 32 bits are zero.

 	// bounds check
 	cmp := s.newValue2(ssa.OpIsInBounds, Types[TBOOL], idx, len)
@@ -2919,19 +2947,18 @@ func (s *state) boundsCheck(idx, len *ssa.Value) {

 // sliceBoundsCheck generates slice bounds checking code. Checks if 0 <= idx <= len, branches to exit if not.
 // Starts a new block on return.
+// idx and len are already converted to full int width.
 func (s *state) sliceBoundsCheck(idx, len *ssa.Value) {
 	if Debug['B'] != 0 {
 		return
 	}
-	// TODO: convert index to full width?
-	// TODO: if index is 64-bit and we're compiling to 32-bit, check that high 32 bits are zero.

 	// bounds check
 	cmp := s.newValue2(ssa.OpIsSliceInBounds, Types[TBOOL], idx, len)
 	s.check(cmp, panicslice)
 }

-// If cmp (a bool) is true, panic using the given function.
+// If cmp (a bool) is false, panic using the given function.
 func (s *state) check(cmp *ssa.Value, fn *Node) {
 	b := s.endBlock()
 	b.Kind = ssa.BlockIf
@@ -2961,19 +2988,23 @@ func (s *state) check(cmp *ssa.Value, fn *Node) {
 // is started to load the return values.
 func (s *state) rtcall(fn *Node, returns bool, results []*Type, args ...*ssa.Value) []*ssa.Value {
 	// Write args to the stack
-	var off int64 // TODO: arch-dependent starting offset?
+	off := Ctxt.FixedFrameSize()
 	for _, arg := range args {
 		t := arg.Type
 		off = Rnd(off, t.Alignment())
 		ptr := s.sp
 		if off != 0 {
-			ptr = s.newValue1I(ssa.OpOffPtr, Types[TUINTPTR], off, s.sp)
+			ptr = s.newValue1I(ssa.OpOffPtr, t.PtrTo(), off, s.sp)
 		}
 		size := t.Size()
 		s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, size, ptr, arg, s.mem())
 		off += size
 	}
 	off = Rnd(off, int64(Widthptr))
+	if Thearch.LinkArch.Name == "amd64p32" {
+		// amd64p32 wants 8-byte alignment of the start of the return values.
+		off = Rnd(off, 8)
+	}

 	// Issue call
 	call := s.newValue1A(ssa.OpStaticCall, ssa.TypeMem, fn.Sym, s.mem())
@@ -2984,7 +3015,7 @@ func (s *state) rtcall(fn *Node, returns bool, results []*Type, args ...*ssa.Val
 	if !returns {
 		b.Kind = ssa.BlockExit
 		b.SetControl(call)
-		call.AuxInt = off
+		call.AuxInt = off - Ctxt.FixedFrameSize()
 		if len(results) > 0 {
 			Fatalf("panic call can't have results")
 		}
@@ -3007,7 +3038,7 @@ func (s *state) rtcall(fn *Node, returns bool, results []*Type, args ...*ssa.Val
 		off = Rnd(off, t.Alignment())
 		ptr := s.sp
 		if off != 0 {
-			ptr = s.newValue1I(ssa.OpOffPtr, Types[TUINTPTR], off, s.sp)
+			ptr = s.newValue1I(ssa.OpOffPtr, Ptrto(t), off, s.sp)
 		}
 		res[i] = s.newValue2(ssa.OpLoad, t, ptr, s.mem())
 		off += t.Size()
@@ -3041,10 +3072,9 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, line int32, rightI

 	aux := &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: syslook("writeBarrier").Sym}
 	flagaddr := s.newValue1A(ssa.OpAddr, Ptrto(Types[TUINT32]), aux, s.sb)
-	// TODO: select the .enabled field. It is currently first, so not needed for now.
-	// Load word, test byte, avoiding partial register write from load byte.
+	// Load word, test word, avoiding partial register write from load byte.
 	flag := s.newValue2(ssa.OpLoad, Types[TUINT32], flagaddr, s.mem())
-	flag = s.newValue1(ssa.OpTrunc64to8, Types[TBOOL], flag)
+	flag = s.newValue2(ssa.OpNeq32, Types[TBOOL], flag, s.constInt32(Types[TUINT32], 0))
 	b := s.endBlock()
 	b.Kind = ssa.BlockIf
 	b.Likely = ssa.BranchUnlikely
@@ -3065,7 +3095,7 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, line int32, rightI
 		tmp := temp(t)
 		s.vars[&memVar] = s.newValue1A(ssa.OpVarDef, ssa.TypeMem, tmp, s.mem())
 		tmpaddr, _ := s.addr(tmp, true)
-		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, t.Size(), tmpaddr, right, s.mem())
+		s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(t), tmpaddr, right, s.mem())
 		// Issue typedmemmove call.
 		taddr := s.newValue1A(ssa.OpAddr, Types[TUINTPTR], &ssa.ExternSymbol{Typ: Types[TUINTPTR], Sym: typenamesym(t)}, s.sb)
 		s.rtcall(typedmemmove, true, nil, taddr, left, tmpaddr)
@@ -3075,7 +3105,7 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, line int32, rightI
 	s.endBlock().AddEdgeTo(bEnd)

 	s.startBlock(bElse)
-	s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, t.Size(), left, right, s.mem())
+	s.vars[&memVar] = s.newValue3I(ssa.OpMove, ssa.TypeMem, SizeAlignAuxInt(t), left, right, s.mem())
 	s.endBlock().AddEdgeTo(bEnd)

 	s.startBlock(bEnd)
@@ -3109,10 +3139,9 @@ func (s *state) insertWBstore(t *Type, left, right *ssa.Value, line int32, skip

 	aux := &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: syslook("writeBarrier").Sym}
 	flagaddr := s.newValue1A(ssa.OpAddr, Ptrto(Types[TUINT32]), aux, s.sb)
-	// TODO: select the .enabled field. It is currently first, so not needed for now.
-	// Load word, test byte, avoiding partial register write from load byte.
+	// Load word, test word, avoiding partial register write from load byte.
 	flag := s.newValue2(ssa.OpLoad, Types[TUINT32], flagaddr, s.mem())
-	flag = s.newValue1(ssa.OpTrunc64to8, Types[TBOOL], flag)
+	flag = s.newValue2(ssa.OpNeq32, Types[TBOOL], flag, s.constInt32(Types[TUINT32], 0))
 	b := s.endBlock()
 	b.Kind = ssa.BlockIf
 	b.Likely = ssa.BranchUnlikely
@@ -3922,6 +3951,11 @@ type SSAGenState struct {

 	// bstart remembers where each block starts (indexed by block ID)
 	bstart []*obj.Prog
+
+	// 387 port: maps from SSE registers (REG_X?) to 387 registers (REG_F?)
+	SSEto387 map[int16]int16
+	// Some architectures require a 64-bit temporary for FP-related register shuffling. Examples include x86-387, PPC, and Sparc V8.
+	ScratchFpMem *Node
 }

 // Pc returns the current Prog.
@@ -3958,6 +3992,13 @@ func genssa(f *ssa.Func, ptxt *obj.Prog, gcargs, gclocals *Sym) {
 		blockProgs[Pc] = f.Blocks[0]
 	}

+	if Thearch.Use387 {
+		s.SSEto387 = map[int16]int16{}
+	}
+	if f.Config.NeedsFpScratch {
+		s.ScratchFpMem = temp(Types[TUINT64])
+	}
+
 	// Emit basic blocks
 	for i, b := range f.Blocks {
 		s.bstart[b.ID] = Pc
@@ -4120,12 +4161,25 @@ func SSAGenFPJump(s *SSAGenState, b, next *ssa.Block, jumps *[2][2]FloatingEQNEJ
 	}
 }

+func AuxOffset(v *ssa.Value) (offset int64) {
+	if v.Aux == nil {
+		return 0
+	}
+	switch sym := v.Aux.(type) {
+
+	case *ssa.AutoSymbol:
+		n := sym.Node.(*Node)
+		return n.Xoffset
+	}
+	return 0
+}
+
 // AddAux adds the offset in the aux fields (AuxInt and Aux) of v to a.
 func AddAux(a *obj.Addr, v *ssa.Value) {
 	AddAux2(a, v, v.AuxInt)
 }
 func AddAux2(a *obj.Addr, v *ssa.Value, offset int64) {
-	if a.Type != obj.TYPE_MEM {
+	if a.Type != obj.TYPE_MEM && a.Type != obj.TYPE_ADDR {
 		v.Fatalf("bad AddAux addr %v", a)
 	}
 	// add integer offset
@@ -4163,17 +4217,27 @@ func AddAux2(a *obj.Addr, v *ssa.Value, offset int64) {
 	}
 }

+// SizeAlignAuxInt returns an AuxInt encoding the size and alignment of type t.
+func SizeAlignAuxInt(t *Type) int64 {
+	return ssa.MakeSizeAndAlign(t.Size(), t.Alignment()).Int64()
+}
+
 // extendIndex extends v to a full int width.
-func (s *state) extendIndex(v *ssa.Value) *ssa.Value {
+// panic using the given function if v does not fit in an int (only on 32-bit archs).
+func (s *state) extendIndex(v *ssa.Value, panicfn *Node) *ssa.Value {
 	size := v.Type.Size()
 	if size == s.config.IntSize {
 		return v
 	}
 	if size > s.config.IntSize {
-		// TODO: truncate 64-bit indexes on 32-bit pointer archs. We'd need to test
-		// the high word and branch to out-of-bounds failure if it is not 0.
-		s.Unimplementedf("64->32 index truncation not implemented")
-		return v
+		// truncate 64-bit indexes on 32-bit pointer archs. Test the
+		// high word and branch to out-of-bounds failure if it is not 0.
+		if Debug['B'] == 0 {
+			hi := s.newValue1(ssa.OpInt64Hi, Types[TUINT32], v)
+			cmp := s.newValue2(ssa.OpEq32, Types[TBOOL], hi, s.constInt32(Types[TUINT32], 0))
+			s.check(cmp, panicfn)
+		}
+		return s.newValue1(ssa.OpTrunc64to32, Types[TINT], v)
 	}

 	// Extend value to the required size
@@ -4212,17 +4276,74 @@ func (s *state) extendIndex(v *ssa.Value) *ssa.Value {
 	return s.newValue1(op, Types[TINT], v)
 }

-// SSARegNum returns the register (in cmd/internal/obj numbering) to
-// which v has been allocated. Panics if v is not assigned to a
-// register.
-// TODO: Make this panic again once it stops happening routinely.
-func SSARegNum(v *ssa.Value) int16 {
+// SSAReg returns the register to which v has been allocated.
+func SSAReg(v *ssa.Value) *ssa.Register {
 	reg := v.Block.Func.RegAlloc[v.ID]
 	if reg == nil {
-		v.Unimplementedf("nil regnum for value: %s\n%s\n", v.LongString(), v.Block.Func)
-		return 0
+		v.Fatalf("nil register for value: %s\n%s\n", v.LongString(), v.Block.Func)
+	}
+	return reg.(*ssa.Register)
+}
+
+// SSAReg0 returns the register to which the first output of v has been allocated.
+func SSAReg0(v *ssa.Value) *ssa.Register {
+	reg := v.Block.Func.RegAlloc[v.ID].(ssa.LocPair)[0]
+	if reg == nil {
+		v.Fatalf("nil first register for value: %s\n%s\n", v.LongString(), v.Block.Func)
+	}
+	return reg.(*ssa.Register)
+}
+
+// SSAReg1 returns the register to which the second output of v has been allocated.
+func SSAReg1(v *ssa.Value) *ssa.Register {
+	reg := v.Block.Func.RegAlloc[v.ID].(ssa.LocPair)[1]
+	if reg == nil {
+		v.Fatalf("nil second register for value: %s\n%s\n", v.LongString(), v.Block.Func)
+	}
+	return reg.(*ssa.Register)
+}
+
+// SSARegNum returns the register number (in cmd/internal/obj numbering) to which v has been allocated.
+func SSARegNum(v *ssa.Value) int16 {
+	return Thearch.SSARegToReg[SSAReg(v).Num]
+}
+
+// SSARegNum0 returns the register number (in cmd/internal/obj numbering) to which the first output of v has been allocated.
+func SSARegNum0(v *ssa.Value) int16 {
+	return Thearch.SSARegToReg[SSAReg0(v).Num]
+}
+
+// SSARegNum1 returns the register number (in cmd/internal/obj numbering) to which the second output of v has been allocated.
+func SSARegNum1(v *ssa.Value) int16 {
+	return Thearch.SSARegToReg[SSAReg1(v).Num]
+}
+
+// CheckLoweredPhi checks that regalloc and stackalloc correctly handled phi values.
+// Called during ssaGenValue.
+func CheckLoweredPhi(v *ssa.Value) {
+	if v.Op != ssa.OpPhi {
+		v.Fatalf("CheckLoweredPhi called with non-phi value: %v", v.LongString())
+	}
+	if v.Type.IsMemory() {
+		return
+	}
+	f := v.Block.Func
+	loc := f.RegAlloc[v.ID]
+	for _, a := range v.Args {
+		if aloc := f.RegAlloc[a.ID]; aloc != loc { // TODO: .Equal() instead?
+			v.Fatalf("phi arg at different location than phi: %v @ %v, but arg %v @ %v\n%s\n", v, loc, a, aloc, v.Block.Func)
+		}
+	}
+}
+
+// CheckLoweredGetClosurePtr checks that v is the first instruction in the function's entry block.
+// The output of LoweredGetClosurePtr is generally hardwired to the correct register.
+// That register contains the closure pointer on closure entry.
+func CheckLoweredGetClosurePtr(v *ssa.Value) {
+	entry := v.Block.Func.Entry
+	if entry != v.Block || entry.Values[0] != v {
+		Fatalf("in %s, badly placed LoweredGetClosurePtr: %v %v", v.Block.Func.Name, v.Block, v)
 	}
-	return Thearch.SSARegToReg[reg.(*ssa.Register).Num]
 }

 // AutoVar returns a *Node and int64 representing the auto variable and offset within it
@@ -4364,6 +4485,25 @@ func (e *ssaExport) SplitComplex(name ssa.LocalSlot) (ssa.LocalSlot, ssa.LocalSl
 	return ssa.LocalSlot{N: n, Type: t, Off: name.Off}, ssa.LocalSlot{N: n, Type: t, Off: name.Off + s}
 }

+func (e *ssaExport) SplitInt64(name ssa.LocalSlot) (ssa.LocalSlot, ssa.LocalSlot) {
+	n := name.N.(*Node)
+	var t *Type
+	if name.Type.IsSigned() {
+		t = Types[TINT32]
+	} else {
+		t = Types[TUINT32]
+	}
+	if n.Class == PAUTO && !n.Addrtaken {
+		// Split this int64 up into two separate variables.
+		h := e.namedAuto(n.Sym.Name+".hi", t)
+		l := e.namedAuto(n.Sym.Name+".lo", Types[TUINT32])
+		return ssa.LocalSlot{N: h, Type: t, Off: 0}, ssa.LocalSlot{N: l, Type: Types[TUINT32], Off: 0}
+	}
+	// Return the two parts of the larger variable.
+	// Assuming little endian (we don't support big endian 32-bit architecture yet)
+	return ssa.LocalSlot{N: n, Type: t, Off: name.Off + 4}, ssa.LocalSlot{N: n, Type: Types[TUINT32], Off: name.Off}
+}
+
 func (e *ssaExport) SplitStruct(name ssa.LocalSlot, i int) ssa.LocalSlot {
 	n := name.N.(*Node)
 	st := name.Type
@@ -4381,7 +4521,7 @@ func (e *ssaExport) SplitStruct(name ssa.LocalSlot, i int) ssa.LocalSlot {
 // namedAuto returns a new AUTO variable with the given name and type.
 func (e *ssaExport) namedAuto(name string, typ ssa.Type) ssa.GCNode {
 	t := typ.(*Type)
-	s := &Sym{Name: name, Pkg: autopkg}
+	s := &Sym{Name: name, Pkg: localpkg}
 	n := Nod(ONAME, nil, nil)
 	s.Def = n
 	s.Def.Used = true
--- a/src/cmd/compile/internal/gc/subr.go
+++ b/src/cmd/compile/internal/gc/subr.go
@@ -246,6 +246,25 @@ func LookupN(prefix string, n int) *Sym {
 	return LookupBytes(b)
 }

+// autolabel generates a new Name node for use with
+// an automatically generated label.
+// prefix is a short mnemonic (e.g. ".s" for switch)
+// to help with debugging.
+// It should begin with "." to avoid conflicts with
+// user labels.
+func autolabel(prefix string) *Node {
+	if prefix[0] != '.' {
+		Fatalf("autolabel prefix must start with '.', have %q", prefix)
+	}
+	fn := Curfn
+	if Curfn == nil {
+		Fatalf("autolabel outside function")
+	}
+	n := fn.Func.Label
+	fn.Func.Label++
+	return newname(LookupN(prefix, int(n)))
+}
+
 var initSyms []*Sym

 var nopkg = &Pkg{
@@ -2298,6 +2317,16 @@ func isdirectiface(t *Type) bool {
 	return false
 }

+// itabType loads the _type field from a runtime.itab struct.
+func itabType(itab *Node) *Node {
+	typ := NodSym(ODOTPTR, itab, nil)
+	typ.Type = Ptrto(Types[TUINT8])
+	typ.Typecheck = 1
+	typ.Xoffset = int64(Widthptr) // offset of _type in runtime.itab
+	typ.Bounded = true            // guaranteed not to fault
+	return typ
+}
+
 // iet returns 'T' if t is a concrete type,
 // 'I' if t is an interface type, and 'E' if t is an empty interface type.
 // It is used to build calls to the conv* and assert* runtime routines.
--- a/src/cmd/compile/internal/gc/swt.go
+++ b/src/cmd/compile/internal/gc/swt.go
@@ -4,10 +4,7 @@

 package gc

-import (
-	"sort"
-	"strconv"
-)
+import "sort"

 const (
 	// expression switch
@@ -361,7 +358,7 @@ func casebody(sw *Node, typeswvar *Node) {
 		n.Op = OCASE
 		needvar := n.List.Len() != 1 || n.List.First().Op == OLITERAL

-		jmp := Nod(OGOTO, newCaseLabel(), nil)
+		jmp := Nod(OGOTO, autolabel(".s"), nil)
 		if n.List.Len() == 0 {
 			if def != nil {
 				Yyerror("more than one default case")
@@ -424,16 +421,6 @@ func casebody(sw *Node, typeswvar *Node) {
 	lineno = lno
 }

-// nSwitchLabel is the number of switch labels generated.
-// This should be per-function, but it is a global counter for now.
-var nSwitchLabel int
-
-func newCaseLabel() *Node {
-	label := strconv.Itoa(nSwitchLabel)
-	nSwitchLabel++
-	return newname(Lookup(label))
-}
-
 // caseClauses generates a slice of caseClauses
 // corresponding to the clauses in the switch statement sw.
 // Kind is the kind of switch statement.
@@ -590,7 +577,7 @@ func (s *typeSwitch) walk(sw *Node) {
 		i.Nbody.Set1(typenil)
 	} else {
 		// Jump to default case.
-		lbl := newCaseLabel()
+		lbl := autolabel(".s")
 		i.Nbody.Set1(Nod(OGOTO, lbl, nil))
 		// Wrap default case with label.
 		blk := Nod(OBLOCK, nil, nil)
@@ -602,11 +589,7 @@ func (s *typeSwitch) walk(sw *Node) {

 	if !cond.Right.Type.IsEmptyInterface() {
 		// Load type from itab.
-		typ = NodSym(ODOTPTR, typ, nil)
-		typ.Type = Ptrto(Types[TUINT8])
-		typ.Typecheck = 1
-		typ.Xoffset = int64(Widthptr) // offset of _type in runtime.itab
-		typ.Bounded = true            // guaranteed not to fault
+		typ = itabType(typ)
 	}
 	// Load hash from type.
 	h := NodSym(ODOTPTR, typ, nil)
--- a/src/cmd/compile/internal/gc/syntax.go
+++ b/src/cmd/compile/internal/gc/syntax.go
@@ -79,6 +79,7 @@ const (
 	hasBreak = 1 << iota
 	notLiveAtEnd
 	isClosureVar
+	isOutputParamHeapAddr
 )

 func (n *Node) HasBreak() bool {
@@ -112,6 +113,17 @@ func (n *Node) setIsClosureVar(b bool) {
 	}
 }

+func (n *Node) IsOutputParamHeapAddr() bool {
+	return n.flags&isOutputParamHeapAddr != 0
+}
+func (n *Node) setIsOutputParamHeapAddr(b bool) {
+	if b {
+		n.flags |= isOutputParamHeapAddr
+	} else {
+		n.flags &^= isOutputParamHeapAddr
+	}
+}
+
 // Val returns the Val for the node.
 func (n *Node) Val() Val {
 	if n.hasVal != +1 {
@@ -278,6 +290,8 @@ type Func struct {
 	InlCost int32
 	Depth   int32

+	Label int32 // largest auto-generated label in this function
+
 	Endlineno int32
 	WBLineno  int32 // line number of first write barrier

@@ -531,6 +545,11 @@ func (n *Nodes) Set1(node *Node) {
 	n.slice = &[]*Node{node}
 }

+// Set2 sets n to a slice containing two nodes.
+func (n *Nodes) Set2(n1, n2 *Node) {
+	n.slice = &[]*Node{n1, n2}
+}
+
 // MoveNodes sets n to the contents of n2, then clears n2.
 func (n *Nodes) MoveNodes(n2 *Nodes) {
 	n.slice = n2.slice
--- a/src/cmd/compile/internal/gc/testdata/arith_ssa.go
+++ b/src/cmd/compile/internal/gc/testdata/arith_ssa.go
@@ -553,6 +553,445 @@ func testOrPhi() {
 	}
 }

+//go:noinline
+func addshiftLL_ssa(a, b uint32) uint32 {
+	return a + b<<3
+}
+
+//go:noinline
+func subshiftLL_ssa(a, b uint32) uint32 {
+	return a - b<<3
+}
+
+//go:noinline
+func rsbshiftLL_ssa(a, b uint32) uint32 {
+	return a<<3 - b
+}
+
+//go:noinline
+func andshiftLL_ssa(a, b uint32) uint32 {
+	return a & (b << 3)
+}
+
+//go:noinline
+func orshiftLL_ssa(a, b uint32) uint32 {
+	return a | b<<3
+}
+
+//go:noinline
+func xorshiftLL_ssa(a, b uint32) uint32 {
+	return a ^ b<<3
+}
+
+//go:noinline
+func bicshiftLL_ssa(a, b uint32) uint32 {
+	return a &^ (b << 3)
+}
+
+//go:noinline
+func notshiftLL_ssa(a uint32) uint32 {
+	return ^(a << 3)
+}
+
+//go:noinline
+func addshiftRL_ssa(a, b uint32) uint32 {
+	return a + b>>3
+}
+
+//go:noinline
+func subshiftRL_ssa(a, b uint32) uint32 {
+	return a - b>>3
+}
+
+//go:noinline
+func rsbshiftRL_ssa(a, b uint32) uint32 {
+	return a>>3 - b
+}
+
+//go:noinline
+func andshiftRL_ssa(a, b uint32) uint32 {
+	return a & (b >> 3)
+}
+
+//go:noinline
+func orshiftRL_ssa(a, b uint32) uint32 {
+	return a | b>>3
+}
+
+//go:noinline
+func xorshiftRL_ssa(a, b uint32) uint32 {
+	return a ^ b>>3
+}
+
+//go:noinline
+func bicshiftRL_ssa(a, b uint32) uint32 {
+	return a &^ (b >> 3)
+}
+
+//go:noinline
+func notshiftRL_ssa(a uint32) uint32 {
+	return ^(a >> 3)
+}
+
+//go:noinline
+func addshiftRA_ssa(a, b int32) int32 {
+	return a + b>>3
+}
+
+//go:noinline
+func subshiftRA_ssa(a, b int32) int32 {
+	return a - b>>3
+}
+
+//go:noinline
+func rsbshiftRA_ssa(a, b int32) int32 {
+	return a>>3 - b
+}
+
+//go:noinline
+func andshiftRA_ssa(a, b int32) int32 {
+	return a & (b >> 3)
+}
+
+//go:noinline
+func orshiftRA_ssa(a, b int32) int32 {
+	return a | b>>3
+}
+
+//go:noinline
+func xorshiftRA_ssa(a, b int32) int32 {
+	return a ^ b>>3
+}
+
+//go:noinline
+func bicshiftRA_ssa(a, b int32) int32 {
+	return a &^ (b >> 3)
+}
+
+//go:noinline
+func notshiftRA_ssa(a int32) int32 {
+	return ^(a >> 3)
+}
+
+//go:noinline
+func addshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a + b<<s
+}
+
+//go:noinline
+func subshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a - b<<s
+}
+
+//go:noinline
+func rsbshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a<<s - b
+}
+
+//go:noinline
+func andshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a & (b << s)
+}
+
+//go:noinline
+func orshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a | b<<s
+}
+
+//go:noinline
+func xorshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a ^ b<<s
+}
+
+//go:noinline
+func bicshiftLLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a &^ (b << s)
+}
+
+//go:noinline
+func notshiftLLreg_ssa(a uint32, s uint8) uint32 {
+	return ^(a << s)
+}
+
+//go:noinline
+func addshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a + b>>s
+}
+
+//go:noinline
+func subshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a - b>>s
+}
+
+//go:noinline
+func rsbshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a>>s - b
+}
+
+//go:noinline
+func andshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a & (b >> s)
+}
+
+//go:noinline
+func orshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a | b>>s
+}
+
+//go:noinline
+func xorshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a ^ b>>s
+}
+
+//go:noinline
+func bicshiftRLreg_ssa(a, b uint32, s uint8) uint32 {
+	return a &^ (b >> s)
+}
+
+//go:noinline
+func notshiftRLreg_ssa(a uint32, s uint8) uint32 {
+	return ^(a >> s)
+}
+
+//go:noinline
+func addshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a + b>>s
+}
+
+//go:noinline
+func subshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a - b>>s
+}
+
+//go:noinline
+func rsbshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a>>s - b
+}
+
+//go:noinline
+func andshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a & (b >> s)
+}
+
+//go:noinline
+func orshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a | b>>s
+}
+
+//go:noinline
+func xorshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a ^ b>>s
+}
+
+//go:noinline
+func bicshiftRAreg_ssa(a, b int32, s uint8) int32 {
+	return a &^ (b >> s)
+}
+
+//go:noinline
+func notshiftRAreg_ssa(a int32, s uint8) int32 {
+	return ^(a >> s)
+}
+
+// test ARM shifted ops
+func testShiftedOps() {
+	a, b := uint32(10), uint32(42)
+	if want, got := a+b<<3, addshiftLL_ssa(a, b); got != want {
+		println("addshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b<<3, subshiftLL_ssa(a, b); got != want {
+		println("subshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a<<3-b, rsbshiftLL_ssa(a, b); got != want {
+		println("rsbshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b<<3), andshiftLL_ssa(a, b); got != want {
+		println("andshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b<<3, orshiftLL_ssa(a, b); got != want {
+		println("orshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b<<3, xorshiftLL_ssa(a, b); got != want {
+		println("xorshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b<<3), bicshiftLL_ssa(a, b); got != want {
+		println("bicshiftLL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a << 3), notshiftLL_ssa(a); got != want {
+		println("notshiftLL_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a+b>>3, addshiftRL_ssa(a, b); got != want {
+		println("addshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b>>3, subshiftRL_ssa(a, b); got != want {
+		println("subshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a>>3-b, rsbshiftRL_ssa(a, b); got != want {
+		println("rsbshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b>>3), andshiftRL_ssa(a, b); got != want {
+		println("andshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b>>3, orshiftRL_ssa(a, b); got != want {
+		println("orshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b>>3, xorshiftRL_ssa(a, b); got != want {
+		println("xorshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b>>3), bicshiftRL_ssa(a, b); got != want {
+		println("bicshiftRL_ssa(10, 42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a >> 3), notshiftRL_ssa(a); got != want {
+		println("notshiftRL_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	c, d := int32(10), int32(-42)
+	if want, got := c+d>>3, addshiftRA_ssa(c, d); got != want {
+		println("addshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c-d>>3, subshiftRA_ssa(c, d); got != want {
+		println("subshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c>>3-d, rsbshiftRA_ssa(c, d); got != want {
+		println("rsbshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&(d>>3), andshiftRA_ssa(c, d); got != want {
+		println("andshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c|d>>3, orshiftRA_ssa(c, d); got != want {
+		println("orshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c^d>>3, xorshiftRA_ssa(c, d); got != want {
+		println("xorshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&^(d>>3), bicshiftRA_ssa(c, d); got != want {
+		println("bicshiftRA_ssa(10, -42) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(d >> 3), notshiftRA_ssa(d); got != want {
+		println("notshiftRA_ssa(-42) =", got, " want ", want)
+		failed = true
+	}
+	s := uint8(3)
+	if want, got := a+b<<s, addshiftLLreg_ssa(a, b, s); got != want {
+		println("addshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b<<s, subshiftLLreg_ssa(a, b, s); got != want {
+		println("subshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a<<s-b, rsbshiftLLreg_ssa(a, b, s); got != want {
+		println("rsbshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b<<s), andshiftLLreg_ssa(a, b, s); got != want {
+		println("andshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b<<s, orshiftLLreg_ssa(a, b, s); got != want {
+		println("orshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b<<s, xorshiftLLreg_ssa(a, b, s); got != want {
+		println("xorshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b<<s), bicshiftLLreg_ssa(a, b, s); got != want {
+		println("bicshiftLLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a << s), notshiftLLreg_ssa(a, s); got != want {
+		println("notshiftLLreg_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a+b>>s, addshiftRLreg_ssa(a, b, s); got != want {
+		println("addshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a-b>>s, subshiftRLreg_ssa(a, b, s); got != want {
+		println("subshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a>>s-b, rsbshiftRLreg_ssa(a, b, s); got != want {
+		println("rsbshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&(b>>s), andshiftRLreg_ssa(a, b, s); got != want {
+		println("andshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a|b>>s, orshiftRLreg_ssa(a, b, s); got != want {
+		println("orshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a^b>>s, xorshiftRLreg_ssa(a, b, s); got != want {
+		println("xorshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := a&^(b>>s), bicshiftRLreg_ssa(a, b, s); got != want {
+		println("bicshiftRLreg_ssa(10, 42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(a >> s), notshiftRLreg_ssa(a, s); got != want {
+		println("notshiftRLreg_ssa(10) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c+d>>s, addshiftRAreg_ssa(c, d, s); got != want {
+		println("addshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c-d>>s, subshiftRAreg_ssa(c, d, s); got != want {
+		println("subshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c>>s-d, rsbshiftRAreg_ssa(c, d, s); got != want {
+		println("rsbshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&(d>>s), andshiftRAreg_ssa(c, d, s); got != want {
+		println("andshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c|d>>s, orshiftRAreg_ssa(c, d, s); got != want {
+		println("orshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c^d>>s, xorshiftRAreg_ssa(c, d, s); got != want {
+		println("xorshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := c&^(d>>s), bicshiftRAreg_ssa(c, d, s); got != want {
+		println("bicshiftRAreg_ssa(10, -42, 3) =", got, " want ", want)
+		failed = true
+	}
+	if want, got := ^(d >> s), notshiftRAreg_ssa(d, s); got != want {
+		println("notshiftRAreg_ssa(-42, 3) =", got, " want ", want)
+		failed = true
+	}
+}
+
 var failed = false

 func main() {
@@ -573,6 +1012,7 @@ func main() {
 	testLoadCombine()
 	testLoadSymCombine()
 	testShiftRemoval()
+	testShiftedOps()

 	if failed {
 		panic("failed")
--- a/src/cmd/compile/internal/gc/testdata/gen/constFoldGen.go
+++ b/src/cmd/compile/internal/gc/testdata/gen/constFoldGen.go
@@ -144,7 +144,7 @@ func main() {
 					fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 					want := ansU(c, d, s.name, o.symbol)
 					fmt.Fprintf(w, "\tif r != %s {\n", want)
-					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 					fmt.Fprintf(w, "\t}\n")
 				}
 			}
@@ -159,7 +159,7 @@ func main() {
 					fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 					want := ansS(c, d, s.name, o.symbol)
 					fmt.Fprintf(w, "\tif r != %s {\n", want)
-					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+					fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 					fmt.Fprintf(w, "\t}\n")
 				}
 			}
@@ -188,7 +188,7 @@ func main() {
 						fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 						want := ansU(c, d, ls.name, o.symbol)
 						fmt.Fprintf(w, "\tif r != %s {\n", want)
-						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 						fmt.Fprintf(w, "\t}\n")
 					}
 				}
@@ -200,7 +200,7 @@ func main() {
 						fmt.Fprintf(w, "\tr = x %s y\n", o.symbol)
 						want := ansS(c, int64(d), ls.name, o.symbol)
 						fmt.Fprintf(w, "\tif r != %s {\n", want)
-						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %s %d = %%d, want %s\", r)\n", c, o.symbol, d, want)
+						fmt.Fprintf(w, "\t\tt.Errorf(\"%d %%s %d = %%d, want %s\", %q, r)\n", c, d, want, o.symbol)
 						fmt.Fprintf(w, "\t}\n")
 					}
 				}
--- a/src/cmd/compile/internal/gc/testdata/string_ssa.go
+++ b/src/cmd/compile/internal/gc/testdata/string_ssa.go
@@ -110,6 +110,67 @@ func testSmallIndexType() {
 	}
 }

+//go:noinline
+func testInt64Index_ssa(s string, i int64) byte {
+	return s[i]
+}
+
+//go:noinline
+func testInt64Slice_ssa(s string, i, j int64) string {
+	return s[i:j]
+}
+
+func testInt64Index() {
+	tests := []struct {
+		i int64
+		j int64
+		b byte
+		s string
+	}{
+		{0, 5, 'B', "Below"},
+		{5, 10, 'E', "Exact"},
+		{10, 15, 'A', "Above"},
+	}
+
+	str := "BelowExactAbove"
+	for i, t := range tests {
+		if got := testInt64Index_ssa(str, t.i); got != t.b {
+			println("#", i, "got ", got, ", wanted", t.b)
+			failed = true
+		}
+		if got := testInt64Slice_ssa(str, t.i, t.j); got != t.s {
+			println("#", i, "got ", got, ", wanted", t.s)
+			failed = true
+		}
+	}
+}
+
+func testInt64IndexPanic() {
+	defer func() {
+		if r := recover(); r != nil {
+			println("paniced as expected")
+		}
+	}()
+
+	str := "foobar"
+	println("got ", testInt64Index_ssa(str, 1<<32+1))
+	println("expected to panic, but didn't")
+	failed = true
+}
+
+func testInt64SlicePanic() {
+	defer func() {
+		if r := recover(); r != nil {
+			println("paniced as expected")
+		}
+	}()
+
+	str := "foobar"
+	println("got ", testInt64Slice_ssa(str, 1<<32, 1<<32+1))
+	println("expected to panic, but didn't")
+	failed = true
+}
+
 //go:noinline
 func testStringElem_ssa(s string, i int) byte {
 	return s[i]
@@ -153,6 +214,9 @@ func main() {
 	testSmallIndexType()
 	testStringElem()
 	testStringElemConst()
+	testInt64Index()
+	testInt64IndexPanic()
+	testInt64SlicePanic()

 	if failed {
 		panic("failed")
--- a/src/cmd/compile/internal/gc/type.go
+++ b/src/cmd/compile/internal/gc/type.go
@@ -1207,6 +1207,7 @@ func (t *Type) ChanDir() ChanDir {
 func (t *Type) IsMemory() bool { return false }
 func (t *Type) IsFlags() bool  { return false }
 func (t *Type) IsVoid() bool   { return false }
+func (t *Type) IsTuple() bool  { return false }

 // IsUntyped reports whether t is an untyped type.
 func (t *Type) IsUntyped() bool {
--- a/src/cmd/compile/internal/gc/typecheck.go
+++ b/src/cmd/compile/internal/gc/typecheck.go
@@ -3185,6 +3185,9 @@ func samesafeexpr(l *Node, r *Node) bool {

 	case OINDEX:
 		return samesafeexpr(l.Left, r.Left) && samesafeexpr(l.Right, r.Right)
+
+	case OLITERAL:
+		return eqval(l.Val(), r.Val())
 	}

 	return false
--- a/src/cmd/compile/internal/gc/walk.go
+++ b/src/cmd/compile/internal/gc/walk.go
@@ -506,7 +506,9 @@ func walkexpr(n *Node, init *Nodes) *Node {
 	if n.Op == ONAME && n.Class == PAUTOHEAP {
 		nn := Nod(OIND, n.Name.Heapaddr, nil)
 		nn = typecheck(nn, Erv)
-		return walkexpr(nn, init)
+		nn = walkexpr(nn, init)
+		nn.Left.NonNil = true
+		return nn
 	}

 opswitch:
@@ -958,19 +960,27 @@ opswitch:
 		fromKind := from.Type.iet()
 		toKind := t.iet()

+		res := n.List.First()
+
 		// Avoid runtime calls in a few cases of the form _, ok := i.(T).
 		// This is faster and shorter and allows the corresponding assertX2X2
 		// routines to skip nil checks on their last argument.
-		if isblank(n.List.First()) {
+		if isblank(res) {
 			var fast *Node
-			switch {
-			case fromKind == 'E' && toKind == 'T':
-				tab := Nod(OITAB, from, nil) // type:eface::tab:iface
-				typ := Nod(OCONVNOP, typename(t), nil)
-				typ.Type = Ptrto(Types[TUINTPTR])
-				fast = Nod(OEQ, tab, typ)
-			case fromKind == 'I' && toKind == 'E',
-				fromKind == 'E' && toKind == 'E':
+			switch toKind {
+			case 'T':
+				tab := Nod(OITAB, from, nil)
+				if fromKind == 'E' {
+					typ := Nod(OCONVNOP, typename(t), nil)
+					typ.Type = Ptrto(Types[TUINTPTR])
+					fast = Nod(OEQ, tab, typ)
+					break
+				}
+				fast = Nod(OANDAND,
+					Nod(ONE, nodnil(), tab),
+					Nod(OEQ, itabType(tab), typename(t)),
+				)
+			case 'E':
 				tab := Nod(OITAB, from, nil)
 				fast = Nod(ONE, nodnil(), tab)
 			}
@@ -985,10 +995,10 @@ opswitch:
 		}

 		var resptr *Node // &res
-		if isblank(n.List.First()) {
+		if isblank(res) {
 			resptr = nodnil()
 		} else {
-			resptr = Nod(OADDR, n.List.First(), nil)
+			resptr = Nod(OADDR, res, nil)
 		}
 		resptr.Etype = 1 // addr does not escape

@@ -1094,12 +1104,45 @@ opswitch:

 			if n.Type.IsFloat() {
 				if n.Left.Type.Etype == TINT64 {
-					n = mkcall("int64tofloat64", n.Type, init, conv(n.Left, Types[TINT64]))
+					n = conv(mkcall("int64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TINT64])), n.Type)
 					break
 				}

 				if n.Left.Type.Etype == TUINT64 {
-					n = mkcall("uint64tofloat64", n.Type, init, conv(n.Left, Types[TUINT64]))
+					n = conv(mkcall("uint64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TUINT64])), n.Type)
+					break
+				}
+			}
+		}
+
+		if Thearch.LinkArch.Family == sys.I386 {
+			if n.Left.Type.IsFloat() {
+				if n.Type.Etype == TINT64 {
+					n = mkcall("float64toint64", n.Type, init, conv(n.Left, Types[TFLOAT64]))
+					break
+				}
+
+				if n.Type.Etype == TUINT64 {
+					n = mkcall("float64touint64", n.Type, init, conv(n.Left, Types[TFLOAT64]))
+					break
+				}
+				if n.Type.Etype == TUINT32 || n.Type.Etype == TUINT || n.Type.Etype == TUINTPTR {
+					n = mkcall("float64touint32", n.Type, init, conv(n.Left, Types[TFLOAT64]))
+					break
+				}
+			}
+			if n.Type.IsFloat() {
+				if n.Left.Type.Etype == TINT64 {
+					n = conv(mkcall("int64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TINT64])), n.Type)
+					break
+				}
+
+				if n.Left.Type.Etype == TUINT64 {
+					n = conv(mkcall("uint64tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TUINT64])), n.Type)
+					break
+				}
+				if n.Left.Type.Etype == TUINT32 || n.Left.Type.Etype == TUINT || n.Left.Type.Etype == TUINTPTR {
+					n = conv(mkcall("uint32tofloat64", Types[TFLOAT64], init, conv(n.Left, Types[TUINT32])), n.Type)
 					break
 				}
 			}
@@ -3303,6 +3346,7 @@ func samecheap(a *Node, b *Node) bool {
 // The result of walkrotate MUST be assigned back to n, e.g.
 // 	n.Left = walkrotate(n.Left)
 func walkrotate(n *Node) *Node {
+	//TODO: enable LROT on ARM64 once the old backend is gone
 	if Thearch.LinkArch.InFamily(sys.MIPS64, sys.ARM64, sys.PPC64) {
 		return n
 	}
@@ -3496,16 +3540,6 @@ func walkdiv(n *Node, init *Nodes) *Node {
 			goto ret
 		}

-		// TODO(zhongwei) Test shows that TUINT8, TINT8, TUINT16 and TINT16's "quick division" method
-		// on current arm64 backend is slower than hardware div instruction on ARM64 due to unnecessary
-		// data movement between registers. It could be enabled when generated code is good enough.
-		if Thearch.LinkArch.Family == sys.ARM64 {
-			switch Simtype[nl.Type.Etype] {
-			case TUINT8, TINT8, TUINT16, TINT16:
-				return n
-			}
-		}
-
 		switch Simtype[nl.Type.Etype] {
 		default:
 			return n
--- a/src/cmd/compile/internal/mips64/peep.go
+++ b/src/cmd/compile/internal/mips64/peep.go
@@ -402,7 +402,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case obj.ANOP, /* read p->from, write p->to */
--- a/src/cmd/compile/internal/ppc64/galign.go
+++ b/src/cmd/compile/internal/ppc64/galign.go
@@ -66,6 +66,11 @@ func Main() {
 	gc.Thearch.Doregbits = doregbits
 	gc.Thearch.Regnames = regnames

+	gc.Thearch.SSARegToReg = ssaRegToReg
+	gc.Thearch.SSAMarkMoves = ssaMarkMoves
+	gc.Thearch.SSAGenValue = ssaGenValue
+	gc.Thearch.SSAGenBlock = ssaGenBlock
+
 	initvariants()
 	initproginfo()

--- a/src/cmd/compile/internal/ppc64/gsubr.go
+++ b/src/cmd/compile/internal/ppc64/gsubr.go
@@ -441,9 +441,8 @@ func gmove(f *gc.Node, t *gc.Node) {
 		gc.Regfree(&r3)
 		return

-		//warn("gmove: convert int to float not implemented: %N -> %N\n", f, t);
 	/*
-	 * signed integer to float
+	 * integer to float
 	 */
 	case gc.TINT32<<16 | gc.TFLOAT32,
 		gc.TINT32<<16 | gc.TFLOAT64,
@@ -452,10 +451,42 @@ func gmove(f *gc.Node, t *gc.Node) {
 		gc.TINT16<<16 | gc.TFLOAT32,
 		gc.TINT16<<16 | gc.TFLOAT64,
 		gc.TINT8<<16 | gc.TFLOAT32,
-		gc.TINT8<<16 | gc.TFLOAT64:
+		gc.TINT8<<16 | gc.TFLOAT64,
+		gc.TUINT16<<16 | gc.TFLOAT32,
+		gc.TUINT16<<16 | gc.TFLOAT64,
+		gc.TUINT8<<16 | gc.TFLOAT32,
+		gc.TUINT8<<16 | gc.TFLOAT64,
+		gc.TUINT32<<16 | gc.TFLOAT32,
+		gc.TUINT32<<16 | gc.TFLOAT64,
+		gc.TUINT64<<16 | gc.TFLOAT32,
+		gc.TUINT64<<16 | gc.TFLOAT64:
+		bignodes()
+
+		// The algorithm is:
+		//	if small enough, use native int64 -> float64 conversion,
+		//	otherwise halve (x -> (x>>1)|(x&1)), convert, and double.
+		// Note: could use FCFIDU instead if target supports it.
 		var r1 gc.Node
 		gc.Regalloc(&r1, gc.Types[gc.TINT64], nil)
 		gmove(f, &r1)
+		if ft == gc.TUINT64 {
+			gc.Nodreg(&r2, gc.Types[gc.TUINT64], ppc64.REGTMP)
+			gmove(&bigi, &r2)
+			gins(ppc64.ACMPU, &r1, &r2)
+			p1 := gc.Gbranch(optoas(gc.OLT, gc.Types[gc.TUINT64]), nil, +1)
+			var r3 gc.Node
+			gc.Regalloc(&r3, gc.Types[gc.TUINT64], nil)
+			p2 := gins(ppc64.AANDCC, nil, &r3) // andi.
+			p2.Reg = r1.Reg
+			p2.From.Type = obj.TYPE_CONST
+			p2.From.Offset = 1
+			p3 := gins(ppc64.ASRD, nil, &r1)
+			p3.From.Type = obj.TYPE_CONST
+			p3.From.Offset = 1
+			gins(ppc64.AOR, &r3, &r1)
+			gc.Regfree(&r3)
+			gc.Patch(p1, gc.Pc)
+		}
 		gc.Regalloc(&r2, gc.Types[gc.TFLOAT64], t)
 		p1 := gins(ppc64.AMOVD, &r1, nil)
 		p1.To.Type = obj.TYPE_MEM
@@ -467,36 +498,12 @@ func gmove(f *gc.Node, t *gc.Node) {
 		p1.From.Offset = -8
 		gins(ppc64.AFCFID, &r2, &r2)
 		gc.Regfree(&r1)
-		gmove(&r2, t)
-		gc.Regfree(&r2)
-		return
-
-	/*
-	 * unsigned integer to float
-	 */
-	case gc.TUINT16<<16 | gc.TFLOAT32,
-		gc.TUINT16<<16 | gc.TFLOAT64,
-		gc.TUINT8<<16 | gc.TFLOAT32,
-		gc.TUINT8<<16 | gc.TFLOAT64,
-		gc.TUINT32<<16 | gc.TFLOAT32,
-		gc.TUINT32<<16 | gc.TFLOAT64,
-		gc.TUINT64<<16 | gc.TFLOAT32,
-		gc.TUINT64<<16 | gc.TFLOAT64:
-
-		var r1 gc.Node
-		gc.Regalloc(&r1, gc.Types[gc.TUINT64], nil)
-		gmove(f, &r1)
-		gc.Regalloc(&r2, gc.Types[gc.TFLOAT64], t)
-		p1 := gins(ppc64.AMOVD, &r1, nil)
-		p1.To.Type = obj.TYPE_MEM
-		p1.To.Reg = ppc64.REGSP
-		p1.To.Offset = -8
-		p1 = gins(ppc64.AFMOVD, nil, &r2)
-		p1.From.Type = obj.TYPE_MEM
-		p1.From.Reg = ppc64.REGSP
-		p1.From.Offset = -8
-		gins(ppc64.AFCFIDU, &r2, &r2)
-		gc.Regfree(&r1)
+		if ft == gc.TUINT64 {
+			p1 := gc.Gbranch(optoas(gc.OLT, gc.Types[gc.TUINT64]), nil, +1) // use CR0 here again
+			gc.Nodreg(&r1, gc.Types[gc.TFLOAT64], ppc64.FREGTWO)
+			gins(ppc64.AFMUL, &r1, &r2)
+			gc.Patch(p1, gc.Pc)
+		}
 		gmove(&r2, t)
 		gc.Regfree(&r2)
 		return
--- a/src/cmd/compile/internal/ppc64/peep.go
+++ b/src/cmd/compile/internal/ppc64/peep.go
@@ -601,7 +601,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return 2

 	case obj.ANOP, /* read p->from, write p->to */
--- a/src/cmd/compile/internal/ppc64/prog.go
+++ b/src/cmd/compile/internal/ppc64/prog.go
@@ -42,22 +42,34 @@ var progtable = [ppc64.ALAST & obj.AMask]obj.ProgInfo{

 	// Integer
 	ppc64.AADD & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AADDC & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASUB & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AADDME & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ANEG & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AAND & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AANDN & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AOR & obj.AMask:     {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AORN & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AXOR & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AEQV & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULLD & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULLW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULHD & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AMULHDU & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ADIVD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ADIVDU & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ADIVW & obj.AMask:   {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ADIVWU & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASLD & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASRD & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ASRAD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ASLW & obj.AMask:    {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ASRW & obj.AMask:    {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.ASRAW & obj.AMask:   {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.ACMP & obj.AMask:    {Flags: gc.SizeQ | gc.LeftRead | gc.RightRead},
 	ppc64.ACMPU & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightRead},
+	ppc64.ACMPW & obj.AMask:   {Flags: gc.SizeL | gc.LeftRead | gc.RightRead},
+	ppc64.ACMPWU & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RightRead},
 	ppc64.ATD & obj.AMask:     {Flags: gc.SizeQ | gc.RightRead},

 	// Floating point.
@@ -70,11 +82,13 @@ var progtable = [ppc64.ALAST & obj.AMask]obj.ProgInfo{
 	ppc64.AFDIV & obj.AMask:   {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFDIVS & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCTIDZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	ppc64.AFCTIWZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCFID & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCFIDU & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	ppc64.AFCMPU & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RightRead},
 	ppc64.AFRSP & obj.AMask:   {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv},
 	ppc64.AFSQRT & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite},
+	ppc64.AFNEG & obj.AMask:   {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite},

 	// Moves
 	ppc64.AMOVB & obj.AMask:  {Flags: gc.SizeB | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
@@ -91,6 +105,8 @@ var progtable = [ppc64.ALAST & obj.AMask]obj.ProgInfo{
 	ppc64.AMOVD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite | gc.Move},
 	ppc64.AMOVDU & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite | gc.Move | gc.PostInc},
 	ppc64.AFMOVS & obj.AMask:  {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
+	ppc64.AFMOVSX & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
+	ppc64.AFMOVSZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
 	ppc64.AFMOVD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Move},

 	// Jumps
@@ -297,7 +313,7 @@ func as2variant(as obj.As) int {
 			return i
 		}
 	}
-	gc.Fatalf("as2variant: instruction %v is not a variant of itself", obj.Aconv(as&obj.AMask))
+	gc.Fatalf("as2variant: instruction %v is not a variant of itself", as&obj.AMask)
 	return 0
 }

--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
--- a/src/cmd/compile/internal/s390x/peep.go
+++ b/src/cmd/compile/internal/s390x/peep.go
@@ -419,7 +419,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) usage {

 	switch p.As {
 	default:
-		fmt.Printf("copyu: can't find %v\n", obj.Aconv(p.As))
+		fmt.Printf("copyu: can't find %v\n", p.As)
 		return _ReadWriteSame

 	case // read p.From, write p.To
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -270,6 +270,7 @@ var passes = [...]pass{
 	{name: "checkLower", fn: checkLower, required: true},
 	{name: "late phielim", fn: phielim},
 	{name: "late copyelim", fn: copyelim},
+	{name: "phi tighten", fn: phiTighten},
 	{name: "late deadcode", fn: deadcode},
 	{name: "critical", fn: critical, required: true}, // remove critical edges
 	{name: "likelyadjust", fn: likelyadjust},
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@@ -20,11 +20,18 @@ type Config struct {
 	lowerBlock      func(*Block) bool          // lowering function
 	lowerValue      func(*Value, *Config) bool // lowering function
 	registers       []Register                 // machine registers
+	gpRegMask       regMask                    // general purpose integer register mask
+	fpRegMask       regMask                    // floating point register mask
+	FPReg           int8                       // register number of frame pointer, -1 if not used
+	hasGReg         bool                       // has hardware g register
 	fe              Frontend                   // callbacks into compiler frontend
 	HTML            *HTMLWriter                // html writer, for debugging
 	ctxt            *obj.Link                  // Generic arch information
 	optimize        bool                       // Do optimization
 	noDuffDevice    bool                       // Don't use Duff's device
+	nacl            bool                       // GOOS=nacl
+	use387          bool                       // GO386=387
+	NeedsFpScratch  bool                       // No direct move between GP and FP register sets
 	sparsePhiCutoff uint64                     // Sparse phi location algorithm used above this #blocks*#variables score
 	curFunc         *Func

@@ -106,6 +113,7 @@ type Frontend interface {
 	SplitSlice(LocalSlot) (LocalSlot, LocalSlot, LocalSlot)
 	SplitComplex(LocalSlot) (LocalSlot, LocalSlot)
 	SplitStruct(LocalSlot, int) LocalSlot
+	SplitInt64(LocalSlot) (LocalSlot, LocalSlot) // returns (hi, lo)

 	// Line returns a string describing the given line number.
 	Line(int32) string
@@ -128,29 +136,88 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
 		c.lowerBlock = rewriteBlockAMD64
 		c.lowerValue = rewriteValueAMD64
 		c.registers = registersAMD64[:]
-	case "386":
+		c.gpRegMask = gpRegMaskAMD64
+		c.fpRegMask = fpRegMaskAMD64
+		c.FPReg = framepointerRegAMD64
+		c.hasGReg = false
+	case "amd64p32":
 		c.IntSize = 4
 		c.PtrSize = 4
 		c.lowerBlock = rewriteBlockAMD64
-		c.lowerValue = rewriteValueAMD64 // TODO(khr): full 32-bit support
+		c.lowerValue = rewriteValueAMD64
+		c.registers = registersAMD64[:]
+		c.gpRegMask = gpRegMaskAMD64
+		c.fpRegMask = fpRegMaskAMD64
+		c.FPReg = framepointerRegAMD64
+		c.hasGReg = false
+		c.noDuffDevice = true
+	case "386":
+		c.IntSize = 4
+		c.PtrSize = 4
+		c.lowerBlock = rewriteBlock386
+		c.lowerValue = rewriteValue386
+		c.registers = registers386[:]
+		c.gpRegMask = gpRegMask386
+		c.fpRegMask = fpRegMask386
+		c.FPReg = framepointerReg386
+		c.hasGReg = false
 	case "arm":
 		c.IntSize = 4
 		c.PtrSize = 4
 		c.lowerBlock = rewriteBlockARM
 		c.lowerValue = rewriteValueARM
 		c.registers = registersARM[:]
+		c.gpRegMask = gpRegMaskARM
+		c.fpRegMask = fpRegMaskARM
+		c.FPReg = framepointerRegARM
+		c.hasGReg = true
+	case "arm64":
+		c.IntSize = 8
+		c.PtrSize = 8
+		c.lowerBlock = rewriteBlockARM64
+		c.lowerValue = rewriteValueARM64
+		c.registers = registersARM64[:]
+		c.gpRegMask = gpRegMaskARM64
+		c.fpRegMask = fpRegMaskARM64
+		c.FPReg = framepointerRegARM64
+		c.hasGReg = true
+		c.noDuffDevice = obj.Getgoos() == "darwin" // darwin linker cannot handle BR26 reloc with non-zero addend
+	case "ppc64le":
+		c.IntSize = 8
+		c.PtrSize = 8
+		c.lowerBlock = rewriteBlockPPC64
+		c.lowerValue = rewriteValuePPC64
+		c.registers = registersPPC64[:]
+		c.gpRegMask = gpRegMaskPPC64
+		c.fpRegMask = fpRegMaskPPC64
+		c.FPReg = framepointerRegPPC64
+		c.noDuffDevice = true // TODO: Resolve PPC64 DuffDevice (has zero, but not copy)
+		c.NeedsFpScratch = true
+		c.hasGReg = true
 	default:
 		fe.Unimplementedf(0, "arch %s not implemented", arch)
 	}
 	c.ctxt = ctxt
 	c.optimize = optimize
+	c.nacl = obj.Getgoos() == "nacl"

-	// Don't use Duff's device on Plan 9, because floating
+	// Don't use Duff's device on Plan 9 AMD64, because floating
 	// point operations are not allowed in note handler.
-	if obj.Getgoos() == "plan9" {
+	if obj.Getgoos() == "plan9" && arch == "amd64" {
 		c.noDuffDevice = true
 	}

+	if c.nacl {
+		c.noDuffDevice = true // Don't use Duff's device on NaCl
+
+		// ARM assembler rewrites DIV/MOD to runtime calls, which
+		// clobber R12 on nacl
+		opcodeTable[OpARMDIV].reg.clobbers |= 1 << 12  // R12
+		opcodeTable[OpARMDIVU].reg.clobbers |= 1 << 12 // R12
+		opcodeTable[OpARMMOD].reg.clobbers |= 1 << 12  // R12
+		opcodeTable[OpARMMODU].reg.clobbers |= 1 << 12 // R12
+	}
+
 	// Assign IDs to preallocated values/blocks.
 	for i := range c.values {
 		c.values[i].ID = ID(i)
@@ -180,6 +247,11 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
 	return c
 }

+func (c *Config) Set387(b bool) {
+	c.NeedsFpScratch = b
+	c.use387 = b
+}
+
 func (c *Config) Frontend() Frontend      { return c.fe }
 func (c *Config) SparsePhiCutoff() uint64 { return c.sparsePhiCutoff }

--- a/src/cmd/compile/internal/ssa/cse.go
+++ b/src/cmd/compile/internal/ssa/cse.go
@@ -163,6 +163,29 @@ func cse(f *Func) {
 		}
 	}

+	// if we rewrite a tuple generator to a new one in a different block,
+	// copy its selectors to the new generator's block, so tuple generator
+	// and selectors stay together.
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			if rewrite[v.ID] != nil {
+				continue
+			}
+			if v.Op != OpSelect0 && v.Op != OpSelect1 {
+				continue
+			}
+			if !v.Args[0].Type.IsTuple() {
+				f.Fatalf("arg of tuple selector %s is not a tuple: %s", v.String(), v.Args[0].LongString())
+			}
+			t := rewrite[v.Args[0].ID]
+			if t != nil && t.Block != b {
+				// v.Args[0] is tuple generator, CSE'd into a different block as t, v is left behind
+				c := v.copyInto(t.Block)
+				rewrite[v.ID] = c
+			}
+		}
+	}
+
 	rewrites := int64(0)

 	// Apply substitutions
--- a/src/cmd/compile/internal/ssa/deadstore.go
+++ b/src/cmd/compile/internal/ssa/deadstore.go
@@ -89,7 +89,7 @@ func dse(f *Func) {
 				} else {
 					// zero addr mem
 					sz := v.Args[0].Type.ElemType().Size()
-					if v.AuxInt != sz {
+					if SizeAndAlign(v.AuxInt).Size() != sz {
 						f.Fatalf("mismatched zero/store sizes: %d and %d [%s]",
 							v.AuxInt, sz, v.LongString())
 					}
--- a/src/cmd/compile/internal/ssa/decompose.go
+++ b/src/cmd/compile/internal/ssa/decompose.go
@@ -25,6 +25,22 @@ func decomposeBuiltIn(f *Func) {
 	for _, name := range f.Names {
 		t := name.Type
 		switch {
+		case t.IsInteger() && t.Size() == 8 && f.Config.IntSize == 4:
+			var elemType Type
+			if t.IsSigned() {
+				elemType = f.Config.fe.TypeInt32()
+			} else {
+				elemType = f.Config.fe.TypeUInt32()
+			}
+			hiName, loName := f.Config.fe.SplitInt64(name)
+			newNames = append(newNames, hiName, loName)
+			for _, v := range f.NamedValues[name] {
+				hi := v.Block.NewValue1(v.Line, OpInt64Hi, elemType, v)
+				lo := v.Block.NewValue1(v.Line, OpInt64Lo, f.Config.fe.TypeUInt32(), v)
+				f.NamedValues[hiName] = append(f.NamedValues[hiName], hi)
+				f.NamedValues[loName] = append(f.NamedValues[loName], lo)
+			}
+			delete(f.NamedValues, name)
 		case t.IsComplex():
 			var elemType Type
 			if t.Size() == 16 {
@@ -78,6 +94,8 @@ func decomposeBuiltIn(f *Func) {
 				f.NamedValues[dataName] = append(f.NamedValues[dataName], data)
 			}
 			delete(f.NamedValues, name)
+		case t.IsFloat():
+			// floats are never decomposed, even ones bigger than IntSize
 		case t.Size() > f.Config.IntSize:
 			f.Unimplementedf("undecomposed named type %s %s", name, t)
 		default:
@@ -88,8 +106,13 @@ func decomposeBuiltIn(f *Func) {
 }

 func decomposeBuiltInPhi(v *Value) {
-	// TODO: decompose 64-bit ops on 32-bit archs?
 	switch {
+	case v.Type.IsInteger() && v.Type.Size() == 8 && v.Block.Func.Config.IntSize == 4:
+		if v.Block.Func.Config.arch == "amd64p32" {
+			// Even though ints are 32 bits, we have 64-bit ops.
+			break
+		}
+		decomposeInt64Phi(v)
 	case v.Type.IsComplex():
 		decomposeComplexPhi(v)
 	case v.Type.IsString():
@@ -98,6 +121,8 @@ func decomposeBuiltInPhi(v *Value) {
 		decomposeSlicePhi(v)
 	case v.Type.IsInterface():
 		decomposeInterfacePhi(v)
+	case v.Type.IsFloat():
+		// floats are never decomposed, even ones bigger than IntSize
 	case v.Type.Size() > v.Block.Func.Config.IntSize:
 		v.Unimplementedf("undecomposed type %s", v.Type)
 	}
@@ -138,6 +163,26 @@ func decomposeSlicePhi(v *Value) {
 	v.AddArg(cap)
 }

+func decomposeInt64Phi(v *Value) {
+	fe := v.Block.Func.Config.fe
+	var partType Type
+	if v.Type.IsSigned() {
+		partType = fe.TypeInt32()
+	} else {
+		partType = fe.TypeUInt32()
+	}
+
+	hi := v.Block.NewValue0(v.Line, OpPhi, partType)
+	lo := v.Block.NewValue0(v.Line, OpPhi, fe.TypeUInt32())
+	for _, a := range v.Args {
+		hi.AddArg(a.Block.NewValue1(v.Line, OpInt64Hi, partType, a))
+		lo.AddArg(a.Block.NewValue1(v.Line, OpInt64Lo, fe.TypeUInt32(), a))
+	}
+	v.reset(OpInt64Make)
+	v.AddArg(hi)
+	v.AddArg(lo)
+}
+
 func decomposeComplexPhi(v *Value) {
 	fe := v.Block.Func.Config.fe
 	var partType Type
--- a/src/cmd/compile/internal/ssa/export_test.go
+++ b/src/cmd/compile/internal/ssa/export_test.go
@@ -49,6 +49,12 @@ func (d DummyFrontend) SplitComplex(s LocalSlot) (LocalSlot, LocalSlot) {
 	}
 	return LocalSlot{s.N, d.TypeFloat32(), s.Off}, LocalSlot{s.N, d.TypeFloat32(), s.Off + 4}
 }
+func (d DummyFrontend) SplitInt64(s LocalSlot) (LocalSlot, LocalSlot) {
+	if s.Type.IsSigned() {
+		return LocalSlot{s.N, d.TypeInt32(), s.Off + 4}, LocalSlot{s.N, d.TypeUInt32(), s.Off}
+	}
+	return LocalSlot{s.N, d.TypeUInt32(), s.Off + 4}, LocalSlot{s.N, d.TypeUInt32(), s.Off}
+}
 func (d DummyFrontend) SplitStruct(s LocalSlot, i int) LocalSlot {
 	return LocalSlot{s.N, s.Type.FieldType(i), s.Off + s.Type.FieldOff(i)}
 }
--- a/src/cmd/compile/internal/ssa/flagalloc.go
+++ b/src/cmd/compile/internal/ssa/flagalloc.go
@@ -4,8 +4,6 @@

 package ssa

-const flagRegMask = regMask(1) << 33 // TODO: arch-specific
-
 // flagalloc allocates the flag register among all the flag-generating
 // instructions. Flag values are recomputed if they need to be
 // spilled/restored.
@@ -33,7 +31,7 @@ func flagalloc(f *Func) {
 				if v == flag {
 					flag = nil
 				}
-				if opcodeTable[v.Op].reg.clobbers&flagRegMask != 0 {
+				if opcodeTable[v.Op].clobberFlags {
 					flag = nil
 				}
 				for _, a := range v.Args {
@@ -97,7 +95,7 @@ func flagalloc(f *Func) {
 					continue
 				}
 				// Recalculate a
-				c := a.copyInto(b)
+				c := copyFlags(a, b)
 				// Update v.
 				v.SetArg(i, c)
 				// Remember the most-recently computed flag value.
@@ -105,7 +103,7 @@ func flagalloc(f *Func) {
 			}
 			// Issue v.
 			b.Values = append(b.Values, v)
-			if opcodeTable[v.Op].reg.clobbers&flagRegMask != 0 {
+			if opcodeTable[v.Op].clobberFlags {
 				flag = nil
 			}
 			if v.Type.IsFlags() {
@@ -121,7 +119,7 @@ func flagalloc(f *Func) {
 		if v := end[b.ID]; v != nil && v != flag {
 			// Need to reissue flag generator for use by
 			// subsequent blocks.
-			_ = v.copyInto(b)
+			copyFlags(v, b)
 			// Note: this flag generator is not properly linked up
 			// with the flag users. This breaks the SSA representation.
 			// We could fix up the users with another pass, but for now
@@ -135,3 +133,19 @@ func flagalloc(f *Func) {
 		b.FlagsLiveAtEnd = end[b.ID] != nil
 	}
 }
+
+// copyFlags copies v (flag generator) into b, returns the copy.
+// If v's arg is also flags, copy recursively.
+func copyFlags(v *Value, b *Block) *Value {
+	flagsArgs := make(map[int]*Value)
+	for i, a := range v.Args {
+		if a.Type.IsFlags() || a.Type.IsTuple() {
+			flagsArgs[i] = copyFlags(a, b)
+		}
+	}
+	c := v.copyInto(b)
+	for i, a := range flagsArgs {
+		c.SetArg(i, a)
+	}
+	return c
+}
--- a/src/cmd/compile/internal/ssa/gen/386.rules
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
--- a/src/cmd/compile/internal/ssa/gen/386Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/386Ops.go
@@ -0,0 +1,508 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+//  - Integer types live in the low portion of registers. Upper portions are junk.
+//  - Boolean types use the low-order byte of a register. 0=false, 1=true.
+//    Upper bytes are junk.
+//  - Floating-point types live in the low natural slot of an sse2 register.
+//    Unused portions are junk.
+//  - We do not use AH,BH,CH,DH registers.
+//  - When doing sub-register operations, we try to write the whole
+//    destination register to avoid a partial-register write.
+//  - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+//    filled by sign-extending the used portion.  Users of AuxInt which interpret
+//    AuxInt as unsigned (e.g. shifts) must be careful.
+
+// Suffixes encode the bit width of various instructions.
+// L (long word) = 32 bit
+// W (word)      = 16 bit
+// B (byte)      = 8 bit
+
+// copied from ../../x86/reg.go
+var regNames386 = []string{
+	"AX",
+	"CX",
+	"DX",
+	"BX",
+	"SP",
+	"BP",
+	"SI",
+	"DI",
+	"X0",
+	"X1",
+	"X2",
+	"X3",
+	"X4",
+	"X5",
+	"X6",
+	"X7",
+
+	// pseudo-registers
+	"SB",
+}
+
+// Notes on 387 support.
+//  - The 387 has a weird stack-register setup for floating-point registers.
+//    We use these registers when SSE registers are not available (when GO386=387).
+//  - We use the same register names (X0-X7) but they refer to the 387
+//    floating-point registers. That way, most of the SSA backend is unchanged.
+//  - The instruction generation pass maintains an SSE->387 register mapping.
+//    This mapping is updated whenever the FP stack is pushed or popped so that
+//    we can always find a given SSE register even when the TOS pointer has changed.
+//  - To facilitate the mapping from SSE to 387, we enforce that
+//    every basic block starts and ends with an empty floating-point stack.
+
+func init() {
+	// Make map from reg names to reg integers.
+	if len(regNames386) > 64 {
+		panic("too many registers")
+	}
+	num := map[string]int{}
+	for i, name := range regNames386 {
+		num[name] = i
+	}
+	buildReg := func(s string) regMask {
+		m := regMask(0)
+		for _, r := range strings.Split(s, " ") {
+			if n, ok := num[r]; ok {
+				m |= regMask(1) << uint(n)
+				continue
+			}
+			panic("register " + r + " not found")
+		}
+		return m
+	}
+
+	// Common individual register masks
+	var (
+		ax         = buildReg("AX")
+		cx         = buildReg("CX")
+		dx         = buildReg("DX")
+		gp         = buildReg("AX CX DX BX BP SI DI")
+		fp         = buildReg("X0 X1 X2 X3 X4 X5 X6 X7")
+		x7         = buildReg("X7")
+		gpsp       = gp | buildReg("SP")
+		gpspsb     = gpsp | buildReg("SB")
+		callerSave = gp | fp
+	)
+	// Common slices of register masks
+	var (
+		gponly = []regMask{gp}
+		fponly = []regMask{fp}
+	)
+
+	// Common regInfo
+	var (
+		gp01      = regInfo{inputs: nil, outputs: gponly}
+		gp11      = regInfo{inputs: []regMask{gp}, outputs: gponly}
+		gp11sp    = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+		gp11sb    = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
+		gp21      = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+		gp11carry = regInfo{inputs: []regMask{gp}, outputs: []regMask{0, gp}}
+		gp21carry = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{0, gp}}
+		gp1carry1 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+		gp2carry1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+		gp21sp    = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+		gp21sb    = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
+		gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+		gp11div   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax}, clobbers: dx}
+		gp21hmul  = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
+		gp11mod   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx}, clobbers: ax}
+		gp21mul   = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}
+
+		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+		gp1flags = regInfo{inputs: []regMask{gpsp}}
+		flagsgp  = regInfo{inputs: nil, outputs: gponly}
+
+		readflags = regInfo{inputs: nil, outputs: gponly}
+		flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
+
+		gpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
+		gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
+
+		gpstore         = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+		gpstoreconst    = regInfo{inputs: []regMask{gpspsb, 0}}
+		gpstoreidx      = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
+		gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+
+		fp01   = regInfo{inputs: nil, outputs: fponly}
+		fp21   = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+		fp21x7 = regInfo{inputs: []regMask{fp &^ x7, fp &^ x7},
+			clobbers: x7, outputs: []regMask{fp &^ x7}}
+		fpgp     = regInfo{inputs: fponly, outputs: gponly}
+		gpfp     = regInfo{inputs: gponly, outputs: fponly}
+		fp11     = regInfo{inputs: fponly, outputs: fponly}
+		fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+		fpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
+		fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
+
+		fpstore    = regInfo{inputs: []regMask{gpspsb, fp, 0}}
+		fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
+	)
+
+	var _386ops = []opData{
+		// fp ops
+		{name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, // fp32 add
+		{name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add
+		{name: "SUBSS", argLength: 2, reg: fp21x7, asm: "SUBSS", resultInArg0: true},                  // fp32 sub
+		{name: "SUBSD", argLength: 2, reg: fp21x7, asm: "SUBSD", resultInArg0: true},                  // fp64 sub
+		{name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, // fp32 mul
+		{name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul
+		{name: "DIVSS", argLength: 2, reg: fp21x7, asm: "DIVSS", resultInArg0: true},                  // fp32 div
+		{name: "DIVSD", argLength: 2, reg: fp21x7, asm: "DIVSD", resultInArg0: true},                  // fp64 div
+
+		{name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff"},            // fp32 load
+		{name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff"},            // fp64 load
+		{name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant
+		{name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant
+		{name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff"},     // fp32 load indexed by i
+		{name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff"},     // fp32 load indexed by 4*i
+		{name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff"},     // fp64 load indexed by i
+		{name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff"},     // fp64 load indexed by 8*i
+
+		{name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff"},        // fp32 store
+		{name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff"},        // fp64 store
+		{name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff"}, // fp32 indexed by i store
+		{name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff"}, // fp32 indexed by 4i store
+		{name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by i store
+		{name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by 8i store
+
+		// binary ops
+		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true},                // arg0 + arg1
+		{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32", clobberFlags: true}, // arg0 + auxint
+
+		{name: "ADDLcarry", argLength: 2, reg: gp21carry, asm: "ADDL", commutative: true, resultInArg0: true},                // arg0 + arg1, generates <carry,result> pair
+		{name: "ADDLconstcarry", argLength: 1, reg: gp11carry, asm: "ADDL", aux: "Int32", resultInArg0: true},                // arg0 + auxint, generates <carry,result> pair
+		{name: "ADCL", argLength: 3, reg: gp2carry1, asm: "ADCL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0+arg1+carry(arg2), where arg2 is flags
+		{name: "ADCLconst", argLength: 2, reg: gp1carry1, asm: "ADCL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0+auxint+carry(arg1), where arg1 is flags
+
+		{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true},                    // arg0 - arg1
+		{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+
+		{name: "SUBLcarry", argLength: 2, reg: gp21carry, asm: "SUBL", resultInArg0: true},                                   // arg0-arg1, generates <borrow,result> pair
+		{name: "SUBLconstcarry", argLength: 1, reg: gp11carry, asm: "SUBL", aux: "Int32", resultInArg0: true},                // arg0-auxint, generates <borrow,result> pair
+		{name: "SBBL", argLength: 3, reg: gp2carry1, asm: "SBBL", resultInArg0: true, clobberFlags: true},                    // arg0-arg1-borrow(arg2), where arg2 is flags
+		{name: "SBBLconst", argLength: 2, reg: gp1carry1, asm: "SBBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0-auxint-borrow(arg1), where arg1 is flags
+
+		{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+		{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMULL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+
+		{name: "HMULL", argLength: 2, reg: gp21hmul, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULLU", argLength: 2, reg: gp21hmul, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULW", argLength: 2, reg: gp21hmul, asm: "IMULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULB", argLength: 2, reg: gp21hmul, asm: "IMULB", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULWU", argLength: 2, reg: gp21hmul, asm: "MULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULBU", argLength: 2, reg: gp21hmul, asm: "MULB", clobberFlags: true}, // (arg0 * arg1) >> width
+
+		{name: "MULLQU", argLength: 2, reg: gp21mul, asm: "MULL", clobberFlags: true}, // arg0 * arg1, high 32 in result[0], low 32 in result[1]
+
+		{name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL", clobberFlags: true}, // arg0 / arg1
+		{name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW", clobberFlags: true}, // arg0 / arg1
+		{name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL", clobberFlags: true}, // arg0 / arg1
+		{name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW", clobberFlags: true}, // arg0 / arg1
+
+		{name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL", clobberFlags: true}, // arg0 % arg1
+		{name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW", clobberFlags: true}, // arg0 % arg1
+		{name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL", clobberFlags: true}, // arg0 % arg1
+		{name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW", clobberFlags: true}, // arg0 % arg1
+
+		{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+		{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+
+		{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+		{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+
+		{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+		{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+
+		{name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+		{name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint
+		{name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"},  // arg0 compare to auxint
+
+		{name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, // arg0 compare to arg1, f32
+		{name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, // arg0 compare to arg1, f64
+
+		{name: "TESTL", argLength: 2, reg: gp2flags, asm: "TESTL", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTW", argLength: 2, reg: gp2flags, asm: "TESTW", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTB", argLength: 2, reg: gp2flags, asm: "TESTB", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
+		{name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
+		{name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"},  // (arg0 & auxint) compare to 0
+
+		{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true},               // arg0 << arg1, shift amount is mod 32
+		{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
+		// Note: x86 is weird, the 16 and 8 byte shifts still use all 5 bits of shift amount!
+
+		{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // unsigned arg0 >> auxint, shift amount 0-31
+
+		{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // signed arg0 >> auxint, shift amount 0-31
+
+		{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
+		{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
+		{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // arg0 rotate left auxint, rotate amount 0-7
+
+		// unary ops
+		{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
+
+		{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true, clobberFlags: true}, // ^arg0
+
+		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+
+		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+
+		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+
+		{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
+
+		{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
+		// Note: SBBW and SBBB are subsumed by SBBL
+
+		{name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
+		{name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
+		{name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"},  // extract signed < condition from arg0
+		{name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
+		{name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"},  // extract signed > condition from arg0
+		{name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
+		{name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"},  // extract unsigned < condition from arg0
+		{name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
+		{name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"},  // extract unsigned > condition from arg0
+		{name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
+		// Need different opcodes for floating point conditions because
+		// any comparison involving a NaN is always FALSE and thus
+		// the patterns for inverting conditions cannot be used.
+		{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
+		{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
+		{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"},                       // extract "ordered" (No Nan present) condition from arg0
+		{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"},                       // extract "unordered" (Nan present) condition from arg0
+
+		{name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"},  // extract floating > condition from arg0
+		{name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
+
+		{name: "MOVBLSX", argLength: 1, reg: gp11, asm: "MOVBLSX"}, // sign extend arg0 from int8 to int32
+		{name: "MOVBLZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int32
+		{name: "MOVWLSX", argLength: 1, reg: gp11, asm: "MOVWLSX"}, // sign extend arg0 from int16 to int32
+		{name: "MOVWLZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int32
+
+		{name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+
+		{name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32
+		{name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32
+		{name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"},   // convert int32 to float32
+		{name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"},   // convert int32 to float64
+		{name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"},   // convert float64 to float32
+		{name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"},   // convert float32 to float64
+
+		{name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
+
+		{name: "LEAL", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
+		{name: "LEAL1", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + arg1 + auxint + aux
+		{name: "LEAL2", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 2*arg1 + auxint + aux
+		{name: "LEAL4", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 4*arg1 + auxint + aux
+		{name: "LEAL8", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 8*arg1 + auxint + aux
+		// Note: LEAL{1,2,4,8} must not have OpSB as either argument.
+
+		// auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+		{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8"},  // load byte from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVBLSXload", argLength: 2, reg: gpload, asm: "MOVBLSX", aux: "SymOff"},             // ditto, sign extend to int32
+		{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16"}, // load 2 bytes from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVWLSXload", argLength: 2, reg: gpload, asm: "MOVWLSX", aux: "SymOff"},             // ditto, sign extend to int32
+		{name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32"},    // load 4 bytes from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"},     // store byte in arg1 to arg0+auxint+aux. arg2=mem
+		{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"},     // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+		{name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"},     // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+
+		// indexed loads/stores
+		{name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVWloadidx1", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem
+		{name: "MOVLloadidx1", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff"},    // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff"},    // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem
+		// TODO: sign-extending indexed loads
+		{name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVB", aux: "SymOff"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+		{name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+		// TODO: add size-mismatched indexed loads, like MOVBstoreidx4.
+
+		// For storeconst ops, the AuxInt field encodes both
+		// the value to store and an address offset of the store.
+		// Cast AuxInt to a ValAndOff to extract Val and Off fields.
+		{name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux.  arg1=mem
+		{name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ...
+		{name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ...
+
+		{name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVB", aux: "SymValAndOff", typ: "Mem"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux.  arg2=mem
+		{name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ... arg1 ...
+		{name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ... 2*arg1 ...
+		{name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ...
+		{name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ...
+
+		// arg0 = pointer to start of memory to zero
+		// arg1 = value to store (will always be zero)
+		// arg2 = mem
+		// auxint = offset into duffzero code to start executing
+		// returns mem
+		{
+			name:      "DUFFZERO",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("AX")},
+				clobbers: buildReg("DI CX"),
+				// Note: CX is only clobbered when dynamic linking.
+			},
+		},
+
+		// arg0 = address of memory to zero
+		// arg1 = # of 4-byte words to zero
+		// arg2 = value to store (will always be zero)
+		// arg3 = mem
+		// returns mem
+		{
+			name:      "REPSTOSL",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
+				clobbers: buildReg("DI CX"),
+			},
+		},
+
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff", clobberFlags: true},                                             // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "Int64", clobberFlags: true}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                               // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                                  // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64", clobberFlags: true},                        // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+		// arg0 = destination pointer
+		// arg1 = source pointer
+		// arg2 = mem
+		// auxint = offset from duffcopy symbol to call
+		// returns memory
+		{
+			name:      "DUFFCOPY",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("SI")},
+				clobbers: buildReg("DI SI CX"), // uses CX as a temporary
+			},
+			clobberFlags: true,
+		},
+
+		// arg0 = destination pointer
+		// arg1 = source pointer
+		// arg2 = # of 8-byte words to copy
+		// arg3 = mem
+		// returns memory
+		{
+			name:      "REPMOVSL",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
+				clobbers: buildReg("DI SI CX"),
+			},
+		},
+
+		// (InvertFlags (CMPL a b)) == (CMPL b a)
+		// So if we want (SETL (CMPL a b)) but we can't do that because a is a constant,
+		// then we do (SETL (InvertFlags (CMPL b a))) instead.
+		// Rewrites will convert this to (SETG (CMPL b a)).
+		// InvertFlags is a pseudo-op which can't appear in assembly output.
+		{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+		// Pseudo-ops
+		{name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+		// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+		// and sorts it to the very beginning of the block to prevent other
+		// use of DX (the closure pointer)
+		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}},
+		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true},
+
+		// MOVLconvert converts between pointers and integers.
+		// We have a special op for this so as to not confuse GC
+		// (particularly stack maps).  It takes a memory arg so it
+		// gets correctly ordered with respect to GC safepoints.
+		// arg0=ptr/int arg1=mem, output=int/ptr
+		{name: "MOVLconvert", argLength: 2, reg: gp11, asm: "MOVL"},
+
+		// Constant flag values. For any comparison, there are 5 possible
+		// outcomes: the three from the signed total order (<,==,>) and the
+		// three from the unsigned total order. The == cases overlap.
+		// Note: there's a sixth "unordered" outcome for floating-point
+		// comparisons, but we don't use such a beast yet.
+		// These ops are for temporary use by rewrite rules. They
+		// cannot appear in the generated assembly.
+		{name: "FlagEQ"},     // equal
+		{name: "FlagLT_ULT"}, // signed < and unsigned <
+		{name: "FlagLT_UGT"}, // signed < and unsigned >
+		{name: "FlagGT_UGT"}, // signed > and unsigned <
+		{name: "FlagGT_ULT"}, // signed > and unsigned >
+
+		// Special op for -x on 387
+		{name: "FCHS", argLength: 1, reg: fp11},
+
+		// Special ops for PIC floating-point constants.
+		// MOVSXconst1 loads the address of the constant-pool entry into a register.
+		// MOVSXconst2 loads the constant from that address.
+		// MOVSXconst1 returns a pointer, but we type it as uint32 because it can never point to the Go heap.
+		{name: "MOVSSconst1", reg: gp01, typ: "UInt32", aux: "Float32"},
+		{name: "MOVSDconst1", reg: gp01, typ: "UInt32", aux: "Float64"},
+		{name: "MOVSSconst2", argLength: 1, reg: gpfp, asm: "MOVSS"},
+		{name: "MOVSDconst2", argLength: 1, reg: gpfp, asm: "MOVSD"},
+	}
+
+	var _386blocks = []blockData{
+		{name: "EQ"},
+		{name: "NE"},
+		{name: "LT"},
+		{name: "LE"},
+		{name: "GT"},
+		{name: "GE"},
+		{name: "ULT"},
+		{name: "ULE"},
+		{name: "UGT"},
+		{name: "UGE"},
+		{name: "EQF"},
+		{name: "NEF"},
+		{name: "ORD"}, // FP, ordered comparison (parity zero)
+		{name: "NAN"}, // FP, unordered comparison (parity one)
+	}
+
+	archs = append(archs, arch{
+		name:            "386",
+		pkg:             "cmd/internal/obj/x86",
+		genfile:         "../../x86/ssa.go",
+		ops:             _386ops,
+		blocks:          _386blocks,
+		regnames:        regNames386,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		framepointerreg: int8(num["BP"]),
+	})
+}
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -4,7 +4,8 @@

 // Lowering arithmetic
 (Add64  x y) -> (ADDQ  x y)
-(AddPtr x y) -> (ADDQ  x y)
+(AddPtr x y) && config.PtrSize == 8 -> (ADDQ x y)
+(AddPtr x y) && config.PtrSize == 4 -> (ADDL x y)
 (Add32  x y) -> (ADDL  x y)
 (Add16  x y) -> (ADDL  x y)
 (Add8   x y) -> (ADDL  x y)
@@ -12,7 +13,8 @@
 (Add64F x y) -> (ADDSD x y)

 (Sub64  x y) -> (SUBQ  x y)
-(SubPtr x y) -> (SUBQ  x y)
+(SubPtr x y) && config.PtrSize == 8 -> (SUBQ x y)
+(SubPtr x y) && config.PtrSize == 4 -> (SUBL x y)
 (Sub32  x y) -> (SUBL  x y)
 (Sub16  x y) -> (SUBL  x y)
 (Sub8   x y) -> (SUBL  x y)
@@ -29,14 +31,14 @@
 (Div32F x y) -> (DIVSS x y)
 (Div64F x y) -> (DIVSD x y)

-(Div64  x y) -> (DIVQ  x y)
-(Div64u x y) -> (DIVQU x y)
-(Div32  x y) -> (DIVL  x y)
-(Div32u x y) -> (DIVLU x y)
-(Div16  x y) -> (DIVW  x y)
-(Div16u x y) -> (DIVWU x y)
-(Div8   x y) -> (DIVW  (SignExt8to16 x) (SignExt8to16 y))
-(Div8u  x y) -> (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+(Div64  x y) -> (Select0 (DIVQ  x y))
+(Div64u x y) -> (Select0 (DIVQU x y))
+(Div32  x y) -> (Select0 (DIVL  x y))
+(Div32u x y) -> (Select0 (DIVLU x y))
+(Div16  x y) -> (Select0 (DIVW  x y))
+(Div16u x y) -> (Select0 (DIVWU x y))
+(Div8   x y) -> (Select0 (DIVW  (SignExt8to16 x) (SignExt8to16 y)))
+(Div8u  x y) -> (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))

 (Hmul64  x y) -> (HMULQ  x y)
 (Hmul64u x y) -> (HMULQU x y)
@@ -49,14 +51,14 @@

 (Avg64u x y) -> (AVGQU x y)

-(Mod64  x y) -> (MODQ  x y)
-(Mod64u x y) -> (MODQU x y)
-(Mod32  x y) -> (MODL  x y)
-(Mod32u x y) -> (MODLU x y)
-(Mod16  x y) -> (MODW  x y)
-(Mod16u x y) -> (MODWU x y)
-(Mod8   x y) -> (MODW  (SignExt8to16 x) (SignExt8to16 y))
-(Mod8u  x y) -> (MODWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+(Mod64  x y) -> (Select1 (DIVQ  x y))
+(Mod64u x y) -> (Select1 (DIVQU x y))
+(Mod32  x y) -> (Select1 (DIVL  x y))
+(Mod32u x y) -> (Select1 (DIVLU x y))
+(Mod16  x y) -> (Select1 (DIVW  x y))
+(Mod16u x y) -> (Select1 (DIVWU x y))
+(Mod8   x y) -> (Select1 (DIVW  (SignExt8to16 x) (SignExt8to16 y)))
+(Mod8u  x y) -> (Select1 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))

 (And64 x y) -> (ANDQ x y)
 (And32 x y) -> (ANDL x y)
@@ -91,8 +93,9 @@
 (Not x) -> (XORLconst [1] x)

 // Lowering pointer arithmetic
-(OffPtr [off] ptr) && is32Bit(off) -> (ADDQconst [off] ptr)
-(OffPtr [off] ptr) -> (ADDQ (MOVQconst [off]) ptr)
+(OffPtr [off] ptr) && config.PtrSize == 8 && is32Bit(off) -> (ADDQconst [off] ptr)
+(OffPtr [off] ptr) && config.PtrSize == 8 -> (ADDQ (MOVQconst [off]) ptr)
+(OffPtr [off] ptr) && config.PtrSize == 4 -> (ADDLconst [off] ptr)

 // Lowering other arithmetic
 // TODO: CMPQconst 0 below is redundant because BSF sets Z but how to remove?
@@ -270,7 +273,8 @@
 (Eq16  x y) -> (SETEQ (CMPW x y))
 (Eq8   x y) -> (SETEQ (CMPB x y))
 (EqB   x y) -> (SETEQ (CMPB x y))
-(EqPtr x y) -> (SETEQ (CMPQ x y))
+(EqPtr x y) && config.PtrSize == 8 -> (SETEQ (CMPQ x y))
+(EqPtr x y) && config.PtrSize == 4 -> (SETEQ (CMPL x y))
 (Eq64F x y) -> (SETEQF (UCOMISD x y))
 (Eq32F x y) -> (SETEQF (UCOMISS x y))

@@ -279,13 +283,16 @@
 (Neq16  x y) -> (SETNE (CMPW x y))
 (Neq8   x y) -> (SETNE (CMPB x y))
 (NeqB   x y) -> (SETNE (CMPB x y))
-(NeqPtr x y) -> (SETNE (CMPQ x y))
+(NeqPtr x y) && config.PtrSize == 8 -> (SETNE (CMPQ x y))
+(NeqPtr x y) && config.PtrSize == 4 -> (SETNE (CMPL x y))
 (Neq64F x y) -> (SETNEF (UCOMISD x y))
 (Neq32F x y) -> (SETNEF (UCOMISS x y))

+(Int64Hi x) -> (SHRQconst [32] x) // needed for amd64p32
+
 // Lowering loads
-(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVQload ptr mem)
-(Load <t> ptr mem) && is32BitInt(t) -> (MOVLload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t) && config.PtrSize == 8) -> (MOVQload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t) && config.PtrSize == 4) -> (MOVLload ptr mem)
 (Load <t> ptr mem) && is16BitInt(t) -> (MOVWload ptr mem)
 (Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) -> (MOVBload ptr mem)
 (Load <t> ptr mem) && is32BitFloat(t) -> (MOVSSload ptr mem)
@@ -302,39 +309,47 @@
 (Store [1] ptr val mem) -> (MOVBstore ptr val mem)

 // Lowering moves
-(Move [0] _ _ mem) -> mem
-(Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
-(Move [2] dst src mem) -> (MOVWstore dst (MOVWload src mem) mem)
-(Move [4] dst src mem) -> (MOVLstore dst (MOVLload src mem) mem)
-(Move [8] dst src mem) -> (MOVQstore dst (MOVQload src mem) mem)
-(Move [16] dst src mem) -> (MOVOstore dst (MOVOload src mem) mem)
-(Move [3] dst src mem) ->
+(Move [s] _ _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstore dst (MOVWload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstore dst (MOVLload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 -> (MOVQstore dst (MOVQload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 16 -> (MOVOstore dst (MOVOload src mem) mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 ->
 	(MOVBstore [2] dst (MOVBload [2] src mem)
 		(MOVWstore dst (MOVWload src mem) mem))
-(Move [5] dst src mem) ->
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 5 ->
 	(MOVBstore [4] dst (MOVBload [4] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
-(Move [6] dst src mem) ->
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 6 ->
 	(MOVWstore [4] dst (MOVWload [4] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
-(Move [7] dst src mem) ->
+(Move [s] dst src mem) && SizeAndAlign(s).Size() == 7 ->
 	(MOVLstore [3] dst (MOVLload [3] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
-(Move [size] dst src mem) && size > 8 && size < 16 ->
-	(MOVQstore [size-8] dst (MOVQload [size-8] src mem)
+(Move [s] dst src mem) && SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() < 16 ->
+	(MOVQstore [SizeAndAlign(s).Size()-8] dst (MOVQload [SizeAndAlign(s).Size()-8] src mem)
 		(MOVQstore dst (MOVQload src mem) mem))

 // Adjust moves to be a multiple of 16 bytes.
-(Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 <= 8 ->
-	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
+(Move [s] dst src mem)
+	&& SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size()%16 != 0 && SizeAndAlign(s).Size()%16 <= 8 ->
+	(Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%16]
+		(OffPtr <dst.Type> dst [SizeAndAlign(s).Size()%16])
+		(OffPtr <src.Type> src [SizeAndAlign(s).Size()%16])
 		(MOVQstore dst (MOVQload src mem) mem))
-(Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 > 8 ->
-	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
+(Move [s] dst src mem)
+	&& SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size()%16 != 0 && SizeAndAlign(s).Size()%16 > 8 ->
+	(Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%16]
+		(OffPtr <dst.Type> dst [SizeAndAlign(s).Size()%16])
+		(OffPtr <src.Type> src [SizeAndAlign(s).Size()%16])
 		(MOVOstore dst (MOVOload src mem) mem))

 // Medium copying uses a duff device.
-(Move [size] dst src mem) && size >= 32 && size <= 16*64 && size%16 == 0 && !config.noDuffDevice ->
-	(DUFFCOPY [14*(64-size/16)] dst src mem)
+(Move [s] dst src mem)
+	&& SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0
+	&& !config.noDuffDevice ->
+	(DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem)
 // 14 and 64 are magic constants.  14 is the number of bytes to encode:
 //	MOVUPS	(SI), X0
 //	ADDQ	$16, SI
@@ -343,57 +358,62 @@
 // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.

 // Large copying uses REP MOVSQ.
-(Move [size] dst src mem) && (size > 16*64 || config.noDuffDevice) && size%8 == 0 ->
-	(REPMOVSQ dst src (MOVQconst [size/8]) mem)
+(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 ->
+	(REPMOVSQ dst src (MOVQconst [SizeAndAlign(s).Size()/8]) mem)

 // Lowering Zero instructions
-(Zero [0] _ mem) -> mem
-(Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
-(Zero [2] destptr mem) -> (MOVWstoreconst [0] destptr mem)
-(Zero [4] destptr mem) -> (MOVLstoreconst [0] destptr mem)
-(Zero [8] destptr mem) -> (MOVQstoreconst [0] destptr mem)
+(Zero [s] _ mem) && SizeAndAlign(s).Size() == 0 -> mem
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstoreconst [0] destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstoreconst [0] destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstoreconst [0] destptr mem)
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 8 -> (MOVQstoreconst [0] destptr mem)

-(Zero [3] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 3 ->
 	(MOVBstoreconst [makeValAndOff(0,2)] destptr
 		(MOVWstoreconst [0] destptr mem))
-(Zero [5] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 5 ->
 	(MOVBstoreconst [makeValAndOff(0,4)] destptr
 		(MOVLstoreconst [0] destptr mem))
-(Zero [6] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 6 ->
 	(MOVWstoreconst [makeValAndOff(0,4)] destptr
 		(MOVLstoreconst [0] destptr mem))
-(Zero [7] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 7 ->
 	(MOVLstoreconst [makeValAndOff(0,3)] destptr
 		(MOVLstoreconst [0] destptr mem))

 // Strip off any fractional word zeroing.
-(Zero [size] destptr mem) && size%8 != 0 && size > 8 ->
-	(Zero [size-size%8] (ADDQconst destptr [size%8])
+(Zero [s] destptr mem) && SizeAndAlign(s).Size()%8 != 0 && SizeAndAlign(s).Size() > 8 ->
+	(Zero [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%8] (OffPtr <destptr.Type> destptr [SizeAndAlign(s).Size()%8])
 		(MOVQstoreconst [0] destptr mem))

 // Zero small numbers of words directly.
-(Zero [16] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 16 ->
 	(MOVQstoreconst [makeValAndOff(0,8)] destptr
 		(MOVQstoreconst [0] destptr mem))
-(Zero [24] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 24 ->
 	(MOVQstoreconst [makeValAndOff(0,16)] destptr
 		(MOVQstoreconst [makeValAndOff(0,8)] destptr
 			(MOVQstoreconst [0] destptr mem)))
-(Zero [32] destptr mem) ->
+(Zero [s] destptr mem) && SizeAndAlign(s).Size() == 32 ->
 	(MOVQstoreconst [makeValAndOff(0,24)] destptr
 		(MOVQstoreconst [makeValAndOff(0,16)] destptr
 			(MOVQstoreconst [makeValAndOff(0,8)] destptr
 				(MOVQstoreconst [0] destptr mem))))

 // Medium zeroing uses a duff device.
-(Zero [size] destptr mem) && size <= 1024 && size%8 == 0 && size%16 != 0 && !config.noDuffDevice ->
-	(Zero [size-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem))
-(Zero [size] destptr mem) && size <= 1024 && size%16 == 0 && !config.noDuffDevice ->
-	(DUFFZERO [duffStart(size)] (ADDQconst [duffAdj(size)] destptr) (MOVOconst [0]) mem)
+(Zero [s] destptr mem)
+	&& SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%8 == 0 && SizeAndAlign(s).Size()%16 != 0
+	&& !config.noDuffDevice ->
+	(Zero [SizeAndAlign(s).Size()-8] (OffPtr <destptr.Type> [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem))
+(Zero [s] destptr mem)
+	&& SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice ->
+	(DUFFZERO [SizeAndAlign(s).Size()] destptr (MOVOconst [0]) mem)

 // Large zeroing uses REP STOSQ.
-(Zero [size] destptr mem) && (size > 1024 || (config.noDuffDevice && size > 32)) && size%8 == 0 ->
-	(REPSTOSQ destptr (MOVQconst [size/8]) (MOVQconst [0]) mem)
+(Zero [s] destptr mem)
+	&& (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32))
+	&& SizeAndAlign(s).Size()%8 == 0 ->
+	(REPSTOSQ destptr (MOVQconst [SizeAndAlign(s).Size()/8]) (MOVQconst [0]) mem)

 // Lowering constants
 (Const8   [val]) -> (MOVLconst [val])
@@ -402,7 +422,8 @@
 (Const64  [val]) -> (MOVQconst [val])
 (Const32F [val]) -> (MOVSSconst [val])
 (Const64F [val]) -> (MOVSDconst [val])
-(ConstNil) -> (MOVQconst [0])
+(ConstNil) && config.PtrSize == 8 -> (MOVQconst [0])
+(ConstNil) && config.PtrSize == 4 -> (MOVLconst [0])
 (ConstBool [b]) -> (MOVLconst [b])

 // Lowering calls
@@ -413,15 +434,17 @@
 (InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)

 // Miscellaneous
-(Convert <t> x mem) -> (MOVQconvert <t> x mem)
-(IsNonNil p) -> (SETNE (TESTQ p p))
+(Convert <t> x mem) && config.PtrSize == 8 -> (MOVQconvert <t> x mem)
+(Convert <t> x mem) && config.PtrSize == 4 -> (MOVLconvert <t> x mem)
+(IsNonNil p) && config.PtrSize == 8 -> (SETNE (TESTQ p p))
+(IsNonNil p) && config.PtrSize == 4 -> (SETNE (TESTL p p))
 (IsInBounds idx len) -> (SETB (CMPQ idx len))
 (IsSliceInBounds idx len) -> (SETBE (CMPQ idx len))
 (NilCheck ptr mem) -> (LoweredNilCheck ptr mem)
 (GetG mem) -> (LoweredGetG mem)
 (GetClosurePtr) -> (LoweredGetClosurePtr)
-(Addr {sym} base) -> (LEAQ {sym} base)
-(ITab (Load ptr mem)) -> (MOVQload ptr mem)
+(Addr {sym} base) && config.PtrSize == 8 -> (LEAQ {sym} base)
+(Addr {sym} base) && config.PtrSize == 4 -> (LEAL {sym} base)

 // block rewrites
 (If (SETL  cmp) yes no) -> (LT  cmp yes no)
@@ -495,6 +518,12 @@
 (ANDLconst [c] (ANDLconst [d] x)) -> (ANDLconst [c & d] x)
 (ANDQconst [c] (ANDQconst [d] x)) -> (ANDQconst [c & d] x)

+(XORLconst [c] (XORLconst [d] x)) -> (XORLconst [c ^ d] x)
+(XORQconst [c] (XORQconst [d] x)) -> (XORQconst [c ^ d] x)
+
+(MULLconst [c] (MULLconst [d] x)) -> (MULLconst [int64(int32(c * d))] x)
+(MULQconst [c] (MULQconst [d] x)) && is32Bit(c*d) -> (MULQconst [c * d] x)
+
 (ORQ x (MOVQconst [c])) && is32Bit(c) -> (ORQconst [c] x)
 (ORQ (MOVQconst [c]) x) && is32Bit(c) -> (ORQconst [c] x)
 (ORL x (MOVLconst [c])) -> (ORLconst [c] x)
@@ -544,6 +573,16 @@
 (SHRL x (ANDLconst [31] y)) -> (SHRL x y)
 (SHRQ x (ANDQconst [63] y)) -> (SHRQ x y)

+(ROLQconst [c] (ROLQconst [d] x)) -> (ROLQconst [(c+d)&63] x)
+(ROLLconst [c] (ROLLconst [d] x)) -> (ROLLconst [(c+d)&31] x)
+(ROLWconst [c] (ROLWconst [d] x)) -> (ROLWconst [(c+d)&15] x)
+(ROLBconst [c] (ROLBconst [d] x)) -> (ROLBconst [(c+d)& 7] x)
+
+(ROLQconst [0] x) -> x
+(ROLLconst [0] x) -> x
+(ROLWconst [0] x) -> x
+(ROLBconst [0] x) -> x
+
 // Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
 // because the x86 instructions are defined to use all 5 bits of the shift even
 // for the small shifts. I don't think we'll ever generate a weird shift (e.g.
@@ -1564,3 +1603,53 @@
  && x.Uses == 1
  && clobber(x)
  -> (MOVQstoreidx1 [i-4] {s} p (SHLQconst <idx.Type> [2] idx) w0 mem)
+
+// amd64p32 rules
+// same as the rules above, but with 32 instead of 64 bit pointer arithmetic.
+// LEAQ,ADDQ -> LEAL,ADDL
+(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
+(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
+
+(MOVQload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVQload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVLload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVLload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVWload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBload  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
+	(MOVBload  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVQstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVQstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVLstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVLstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVWstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVWstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVBstore  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVBstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+(MOVQstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVQstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVLstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVWstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVWstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVBstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
+	(MOVBstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+
+(MOVQload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVQload  [off1+off2] {sym} ptr mem)
+(MOVLload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVLload  [off1+off2] {sym} ptr mem)
+(MOVWload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVWload  [off1+off2] {sym} ptr mem)
+(MOVBload  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVBload  [off1+off2] {sym} ptr mem)
+(MOVQstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVQstore  [off1+off2] {sym} ptr val mem)
+(MOVLstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVLstore  [off1+off2] {sym} ptr val mem)
+(MOVWstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVWstore  [off1+off2] {sym} ptr val mem)
+(MOVBstore  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVBstore  [off1+off2] {sym} ptr val mem)
+(MOVQstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVQstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
+(MOVLstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVLstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
+(MOVWstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVWstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
+(MOVBstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
+	(MOVBstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -64,7 +64,6 @@ var regNamesAMD64 = []string{

 	// pseudo-registers
 	"SB",
-	"FLAGS",
 }

 func init() {
@@ -98,43 +97,36 @@ func init() {
 		fp         = buildReg("X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15")
 		gpsp       = gp | buildReg("SP")
 		gpspsb     = gpsp | buildReg("SB")
-		flags      = buildReg("FLAGS")
-		callerSave = gp | fp | flags
+		callerSave = gp | fp
 	)
 	// Common slices of register masks
 	var (
-		gponly    = []regMask{gp}
-		fponly    = []regMask{fp}
-		flagsonly = []regMask{flags}
+		gponly = []regMask{gp}
+		fponly = []regMask{fp}
 	)

 	// Common regInfo
 	var (
-		gp01      = regInfo{inputs: []regMask{}, outputs: gponly}
-		gp11      = regInfo{inputs: []regMask{gp}, outputs: gponly, clobbers: flags}
-		gp11sp    = regInfo{inputs: []regMask{gpsp}, outputs: gponly, clobbers: flags}
-		gp11nf    = regInfo{inputs: []regMask{gpsp}, outputs: gponly} // nf: no flags clobbered
+		gp01      = regInfo{inputs: nil, outputs: gponly}
+		gp11      = regInfo{inputs: []regMask{gp}, outputs: gponly}
+		gp11sp    = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
 		gp11sb    = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
-		gp21      = regInfo{inputs: []regMask{gp, gp}, outputs: gponly, clobbers: flags}
-		gp21sp    = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly, clobbers: flags}
+		gp21      = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+		gp21sp    = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
 		gp21sb    = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
-		gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}, clobbers: flags}
-		gp11div   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax},
-			clobbers: dx | flags}
-		gp11hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx},
-			clobbers: ax | flags}
-		gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx},
-			clobbers: ax | flags}
+		gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+		gp11div   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
+		gp21hmul  = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}

-		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: flagsonly}
-		gp1flags = regInfo{inputs: []regMask{gpsp}, outputs: flagsonly}
-		flagsgp  = regInfo{inputs: flagsonly, outputs: gponly}
+		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+		gp1flags = regInfo{inputs: []regMask{gpsp}}
+		flagsgp  = regInfo{inputs: nil, outputs: gponly}

 		// for CMOVconst -- uses AX to hold constant temporary.
-		gp1flagsgp = regInfo{inputs: []regMask{gp &^ ax, flags}, clobbers: ax | flags, outputs: []regMask{gp &^ ax}}
+		gp1flagsgp = regInfo{inputs: []regMask{gp &^ ax}, clobbers: ax, outputs: []regMask{gp &^ ax}}

-		readflags = regInfo{inputs: flagsonly, outputs: gponly}
-		flagsgpax = regInfo{inputs: flagsonly, clobbers: ax | flags, outputs: []regMask{gp &^ ax}}
+		readflags = regInfo{inputs: nil, outputs: gponly}
+		flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}

 		gpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
 		gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
@@ -144,14 +136,14 @@ func init() {
 		gpstoreidx      = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
 		gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}

-		fp01    = regInfo{inputs: []regMask{}, outputs: fponly}
+		fp01    = regInfo{inputs: nil, outputs: fponly}
 		fp21    = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
 		fp21x15 = regInfo{inputs: []regMask{fp &^ x15, fp &^ x15},
 			clobbers: x15, outputs: []regMask{fp &^ x15}}
 		fpgp     = regInfo{inputs: fponly, outputs: gponly}
 		gpfp     = regInfo{inputs: gponly, outputs: fponly}
 		fp11     = regInfo{inputs: fponly, outputs: fponly}
-		fp2flags = regInfo{inputs: []regMask{fp, fp}, outputs: flagsonly}
+		fp2flags = regInfo{inputs: []regMask{fp, fp}}

 		fpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
 		fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
@@ -188,60 +180,53 @@ func init() {
 		{name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by 8i store

 		// binary ops
-		{name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true},                // arg0 + arg1
-		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true},                // arg0 + arg1
-		{name: "ADDQconst", argLength: 1, reg: gp11sp, asm: "ADDQ", aux: "Int64", typ: "UInt64"}, // arg0 + auxint
-		{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32"},                // arg0 + auxint
+		{name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true, clobberFlags: true},                // arg0 + arg1
+		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true},                // arg0 + arg1
+		{name: "ADDQconst", argLength: 1, reg: gp11sp, asm: "ADDQ", aux: "Int64", typ: "UInt64", clobberFlags: true}, // arg0 + auxint
+		{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", clobberFlags: true},                // arg0 + auxint

-		{name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true},                    // arg0 - arg1
-		{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true},                    // arg0 - arg1
-		{name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int64", resultInArg0: true}, // arg0 - auxint
-		{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0 - auxint
+		{name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true, clobberFlags: true},                    // arg0 - arg1
+		{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true},                    // arg0 - arg1
+		{name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+		{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint

-		{name: "MULQ", argLength: 2, reg: gp21, asm: "IMULQ", commutative: true, resultInArg0: true}, // arg0 * arg1
-		{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true}, // arg0 * arg1
-		{name: "MULQconst", argLength: 1, reg: gp11, asm: "IMULQ", aux: "Int64", resultInArg0: true}, // arg0 * auxint
-		{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMULL", aux: "Int32", resultInArg0: true}, // arg0 * auxint
+		{name: "MULQ", argLength: 2, reg: gp21, asm: "IMULQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+		{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+		{name: "MULQconst", argLength: 1, reg: gp11, asm: "IMULQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+		{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMULL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 * auxint

-		{name: "HMULQ", argLength: 2, reg: gp11hmul, asm: "IMULQ"}, // (arg0 * arg1) >> width
-		{name: "HMULL", argLength: 2, reg: gp11hmul, asm: "IMULL"}, // (arg0 * arg1) >> width
-		{name: "HMULW", argLength: 2, reg: gp11hmul, asm: "IMULW"}, // (arg0 * arg1) >> width
-		{name: "HMULB", argLength: 2, reg: gp11hmul, asm: "IMULB"}, // (arg0 * arg1) >> width
-		{name: "HMULQU", argLength: 2, reg: gp11hmul, asm: "MULQ"}, // (arg0 * arg1) >> width
-		{name: "HMULLU", argLength: 2, reg: gp11hmul, asm: "MULL"}, // (arg0 * arg1) >> width
-		{name: "HMULWU", argLength: 2, reg: gp11hmul, asm: "MULW"}, // (arg0 * arg1) >> width
-		{name: "HMULBU", argLength: 2, reg: gp11hmul, asm: "MULB"}, // (arg0 * arg1) >> width
+		{name: "HMULQ", argLength: 2, reg: gp21hmul, asm: "IMULQ", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULL", argLength: 2, reg: gp21hmul, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULW", argLength: 2, reg: gp21hmul, asm: "IMULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULB", argLength: 2, reg: gp21hmul, asm: "IMULB", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULQU", argLength: 2, reg: gp21hmul, asm: "MULQ", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULLU", argLength: 2, reg: gp21hmul, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULWU", argLength: 2, reg: gp21hmul, asm: "MULW", clobberFlags: true}, // (arg0 * arg1) >> width
+		{name: "HMULBU", argLength: 2, reg: gp21hmul, asm: "MULB", clobberFlags: true}, // (arg0 * arg1) >> width

-		{name: "AVGQU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true}, // (arg0 + arg1) / 2 as unsigned, all 64 result bits
+		{name: "AVGQU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 + arg1) / 2 as unsigned, all 64 result bits

-		{name: "DIVQ", argLength: 2, reg: gp11div, asm: "IDIVQ"}, // arg0 / arg1
-		{name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL"}, // arg0 / arg1
-		{name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW"}, // arg0 / arg1
-		{name: "DIVQU", argLength: 2, reg: gp11div, asm: "DIVQ"}, // arg0 / arg1
-		{name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL"}, // arg0 / arg1
-		{name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW"}, // arg0 / arg1
+		{name: "DIVQ", argLength: 2, reg: gp11div, typ: "(Int64,Int64)", asm: "IDIVQ", clobberFlags: true},   // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVL", argLength: 2, reg: gp11div, typ: "(Int32,Int32)", asm: "IDIVL", clobberFlags: true},   // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVW", argLength: 2, reg: gp11div, typ: "(Int16,Int16)", asm: "IDIVW", clobberFlags: true},   // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVQU", argLength: 2, reg: gp11div, typ: "(UInt64,UInt64)", asm: "DIVQ", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVLU", argLength: 2, reg: gp11div, typ: "(UInt32,UInt32)", asm: "DIVL", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+		{name: "DIVWU", argLength: 2, reg: gp11div, typ: "(UInt16,UInt16)", asm: "DIVW", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]

-		{name: "MODQ", argLength: 2, reg: gp11mod, asm: "IDIVQ"}, // arg0 % arg1
-		{name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL"}, // arg0 % arg1
-		{name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW"}, // arg0 % arg1
-		{name: "MODQU", argLength: 2, reg: gp11mod, asm: "DIVQ"}, // arg0 % arg1
-		{name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL"}, // arg0 % arg1
-		{name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW"}, // arg0 % arg1
+		{name: "ANDQ", argLength: 2, reg: gp21, asm: "ANDQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+		{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+		{name: "ANDQconst", argLength: 1, reg: gp11, asm: "ANDQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+		{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint

-		{name: "ANDQ", argLength: 2, reg: gp21, asm: "ANDQ", commutative: true, resultInArg0: true}, // arg0 & arg1
-		{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true}, // arg0 & arg1
-		{name: "ANDQconst", argLength: 1, reg: gp11, asm: "ANDQ", aux: "Int64", resultInArg0: true}, // arg0 & auxint
-		{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true}, // arg0 & auxint
+		{name: "ORQ", argLength: 2, reg: gp21, asm: "ORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+		{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+		{name: "ORQconst", argLength: 1, reg: gp11, asm: "ORQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+		{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint

-		{name: "ORQ", argLength: 2, reg: gp21, asm: "ORQ", commutative: true, resultInArg0: true}, // arg0 | arg1
-		{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true}, // arg0 | arg1
-		{name: "ORQconst", argLength: 1, reg: gp11, asm: "ORQ", aux: "Int64", resultInArg0: true}, // arg0 | auxint
-		{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true}, // arg0 | auxint
-
-		{name: "XORQ", argLength: 2, reg: gp21, asm: "XORQ", commutative: true, resultInArg0: true}, // arg0 ^ arg1
-		{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true}, // arg0 ^ arg1
-		{name: "XORQconst", argLength: 1, reg: gp11, asm: "XORQ", aux: "Int64", resultInArg0: true}, // arg0 ^ auxint
-		{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true}, // arg0 ^ auxint
+		{name: "XORQ", argLength: 2, reg: gp21, asm: "XORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+		{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+		{name: "XORQconst", argLength: 1, reg: gp11, asm: "XORQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+		{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint

 		{name: "CMPQ", argLength: 2, reg: gp2flags, asm: "CMPQ", typ: "Flags"},                    // arg0 compare to arg1
 		{name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"},                    // arg0 compare to arg1
@@ -264,60 +249,60 @@ func init() {
 		{name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
 		{name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"},  // (arg0 & auxint) compare to 0

-		{name: "SHLQ", argLength: 2, reg: gp21shift, asm: "SHLQ", resultInArg0: true},               // arg0 << arg1, shift amount is mod 64
-		{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true},               // arg0 << arg1, shift amount is mod 32
-		{name: "SHLQconst", argLength: 1, reg: gp11, asm: "SHLQ", aux: "Int64", resultInArg0: true}, // arg0 << auxint, shift amount 0-63
-		{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true}, // arg0 << auxint, shift amount 0-31
+		{name: "SHLQ", argLength: 2, reg: gp21shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true},               // arg0 << arg1, shift amount is mod 64
+		{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true},               // arg0 << arg1, shift amount is mod 32
+		{name: "SHLQconst", argLength: 1, reg: gp11, asm: "SHLQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-63
+		{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
 		// Note: x86 is weird, the 16 and 8 byte shifts still use all 5 bits of shift amount!

-		{name: "SHRQ", argLength: 2, reg: gp21shift, asm: "SHRQ", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 64
-		{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
-		{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
-		{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
-		{name: "SHRQconst", argLength: 1, reg: gp11, asm: "SHRQ", aux: "Int64", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-63
-		{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-31
-		{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-31
-		{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true},  // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRQ", argLength: 2, reg: gp21shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 64
+		{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRQconst", argLength: 1, reg: gp11, asm: "SHRQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-63
+		{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // unsigned arg0 >> auxint, shift amount 0-31

-		{name: "SARQ", argLength: 2, reg: gp21shift, asm: "SARQ", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 64
-		{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
-		{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
-		{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
-		{name: "SARQconst", argLength: 1, reg: gp11, asm: "SARQ", aux: "Int64", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-63
-		{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-31
-		{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-31
-		{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true},  // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARQ", argLength: 2, reg: gp21shift, asm: "SARQ", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 64
+		{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARQconst", argLength: 1, reg: gp11, asm: "SARQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-63
+		{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // signed arg0 >> auxint, shift amount 0-31

-		{name: "ROLQconst", argLength: 1, reg: gp11, asm: "ROLQ", aux: "Int64", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-63
-		{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-31
-		{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-15
-		{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true},  // arg0 rotate left auxint, rotate amount 0-7
+		{name: "ROLQconst", argLength: 1, reg: gp11, asm: "ROLQ", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-63
+		{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
+		{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
+		{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // arg0 rotate left auxint, rotate amount 0-7

 		// unary ops
-		{name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true}, // -arg0
-		{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true}, // -arg0
+		{name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true, clobberFlags: true}, // -arg0
+		{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0

-		{name: "NOTQ", argLength: 1, reg: gp11, asm: "NOTQ", resultInArg0: true}, // ^arg0
-		{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
+		{name: "NOTQ", argLength: 1, reg: gp11, asm: "NOTQ", resultInArg0: true, clobberFlags: true}, // ^arg0
+		{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true, clobberFlags: true}, // ^arg0

-		{name: "BSFQ", argLength: 1, reg: gp11, asm: "BSFQ"}, // arg0 # of low-order zeroes ; undef if zero
-		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL"}, // arg0 # of low-order zeroes ; undef if zero
-		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW"}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFQ", argLength: 1, reg: gp11, asm: "BSFQ", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero

-		{name: "BSRQ", argLength: 1, reg: gp11, asm: "BSRQ"}, // arg0 # of high-order zeroes ; undef if zero
-		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL"}, // arg0 # of high-order zeroes ; undef if zero
-		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW"}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRQ", argLength: 1, reg: gp11, asm: "BSRQ", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero

 		// Note ASM for ops moves whole register
-		{name: "CMOVQEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQEQ", typ: "UInt64", aux: "Int64", resultInArg0: true}, // replace arg0 w/ constant if Z set
-		{name: "CMOVLEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt32", aux: "Int32", resultInArg0: true}, // replace arg0 w/ constant if Z set
-		{name: "CMOVWEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt16", aux: "Int16", resultInArg0: true}, // replace arg0 w/ constant if Z set
-		{name: "CMOVQNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQNE", typ: "UInt64", aux: "Int64", resultInArg0: true}, // replace arg0 w/ constant if Z not set
-		{name: "CMOVLNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt32", aux: "Int32", resultInArg0: true}, // replace arg0 w/ constant if Z not set
-		{name: "CMOVWNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt16", aux: "Int16", resultInArg0: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVQEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQEQ", typ: "UInt64", aux: "Int64", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVLEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt32", aux: "Int32", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVWEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt16", aux: "Int16", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVQNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQNE", typ: "UInt64", aux: "Int64", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVLNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt32", aux: "Int32", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVWNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt16", aux: "Int16", resultInArg0: true, clobberFlags: true}, // replace arg0 w/ constant if Z not set

-		{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true}, // arg0 swap bytes
-		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, // arg0 swap bytes
+		{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes

 		{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)

@@ -338,20 +323,20 @@ func init() {
 		// Need different opcodes for floating point conditions because
 		// any comparison involving a NaN is always FALSE and thus
 		// the patterns for inverting conditions cannot be used.
-		{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ"}, // extract == condition from arg0
-		{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE"}, // extract != condition from arg0
-		{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"},   // extract "ordered" (No Nan present) condition from arg0
-		{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"},   // extract "unordered" (Nan present) condition from arg0
+		{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
+		{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
+		{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"},                       // extract "ordered" (No Nan present) condition from arg0
+		{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"},                       // extract "unordered" (Nan present) condition from arg0

 		{name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"},  // extract floating > condition from arg0
 		{name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0

-		{name: "MOVBQSX", argLength: 1, reg: gp11nf, asm: "MOVBQSX"}, // sign extend arg0 from int8 to int64
-		{name: "MOVBQZX", argLength: 1, reg: gp11nf, asm: "MOVBQZX"}, // zero extend arg0 from int8 to int64
-		{name: "MOVWQSX", argLength: 1, reg: gp11nf, asm: "MOVWQSX"}, // sign extend arg0 from int16 to int64
-		{name: "MOVWQZX", argLength: 1, reg: gp11nf, asm: "MOVWQZX"}, // zero extend arg0 from int16 to int64
-		{name: "MOVLQSX", argLength: 1, reg: gp11nf, asm: "MOVLQSX"}, // sign extend arg0 from int32 to int64
-		{name: "MOVLQZX", argLength: 1, reg: gp11nf, asm: "MOVLQZX"}, // zero extend arg0 from int32 to int64
+		{name: "MOVBQSX", argLength: 1, reg: gp11, asm: "MOVBQSX"}, // sign extend arg0 from int8 to int64
+		{name: "MOVBQZX", argLength: 1, reg: gp11, asm: "MOVBQZX"}, // zero extend arg0 from int8 to int64
+		{name: "MOVWQSX", argLength: 1, reg: gp11, asm: "MOVWQSX"}, // sign extend arg0 from int16 to int64
+		{name: "MOVWQZX", argLength: 1, reg: gp11, asm: "MOVWQZX"}, // zero extend arg0 from int16 to int64
+		{name: "MOVLQSX", argLength: 1, reg: gp11, asm: "MOVLQSX"}, // sign extend arg0 from int32 to int64
+		{name: "MOVLQZX", argLength: 1, reg: gp11, asm: "MOVLQZX"}, // zero extend arg0 from int32 to int64

 		{name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
 		{name: "MOVQconst", reg: gp01, asm: "MOVQ", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
@@ -369,13 +354,15 @@ func init() {

 		{name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.

-		{name: "LEAQ", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
-		{name: "LEAQ1", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + arg1 + auxint + aux
-		{name: "LEAQ2", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 2*arg1 + auxint + aux
-		{name: "LEAQ4", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 4*arg1 + auxint + aux
-		{name: "LEAQ8", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 8*arg1 + auxint + aux
+		{name: "LEAQ", argLength: 1, reg: gp11sb, asm: "LEAQ", aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
+		{name: "LEAQ1", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + arg1 + auxint + aux
+		{name: "LEAQ2", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + 2*arg1 + auxint + aux
+		{name: "LEAQ4", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + 4*arg1 + auxint + aux
+		{name: "LEAQ8", argLength: 2, reg: gp21sb, aux: "SymOff"},                                      // arg0 + 8*arg1 + auxint + aux
 		// Note: LEAQ{1,2,4,8} must not have OpSB as either argument.

+		{name: "LEAL", argLength: 1, reg: gp11sb, asm: "LEAL", aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
+
 		// auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
 		{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8"},  // load byte from arg0+auxint+aux. arg1=mem.  Zero extend.
 		{name: "MOVBQSXload", argLength: 2, reg: gpload, asm: "MOVBQSX", aux: "SymOff"},             // ditto, sign extend to int64
@@ -425,10 +412,10 @@ func init() {
 		{name: "MOVQstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVQ", aux: "SymValAndOff", typ: "Mem"}, // store 8 bytes of ... arg1 ...
 		{name: "MOVQstoreconstidx8", argLength: 3, reg: gpstoreconstidx, asm: "MOVQ", aux: "SymValAndOff", typ: "Mem"}, // store 8 bytes of ... 8*arg1 ...

-		// arg0 = (duff-adjusted) pointer to start of memory to zero
+		// arg0 = pointer to start of memory to zero
 		// arg1 = value to store (will always be zero)
 		// arg2 = mem
-		// auxint = offset into duffzero code to start executing
+		// auxint = # of bytes to zero
 		// returns mem
 		{
 			name:      "DUFFZERO",
@@ -436,8 +423,9 @@ func init() {
 			argLength: 3,
 			reg: regInfo{
 				inputs:   []regMask{buildReg("DI"), buildReg("X0")},
-				clobbers: buildReg("DI FLAGS"),
+				clobbers: buildReg("DI"),
 			},
+			clobberFlags: true,
 		},
 		{name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true},

@@ -455,11 +443,11 @@ func init() {
 			},
 		},

-		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff"},                                // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
-		{name: "CALLclosure", argLength: 3, reg: regInfo{[]regMask{gpsp, buildReg("DX"), 0}, callerSave, nil}, aux: "Int64"}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
-		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64"},                                  // call deferproc.  arg0=mem, auxint=argsize, returns mem
-		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64"},                                     // call newproc.  arg0=mem, auxint=argsize, returns mem
-		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64"},           // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff", clobberFlags: true},                                             // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "Int64", clobberFlags: true}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                               // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64", clobberFlags: true},                                                  // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64", clobberFlags: true},                        // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem

 		// arg0 = destination pointer
 		// arg1 = source pointer
@@ -472,8 +460,9 @@ func init() {
 			argLength: 3,
 			reg: regInfo{
 				inputs:   []regMask{buildReg("DI"), buildReg("SI")},
-				clobbers: buildReg("DI SI X0 FLAGS"), // uses X0 as a temporary
+				clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
 			},
+			clobberFlags: true,
 		},

 		// arg0 = destination pointer
@@ -504,14 +493,15 @@ func init() {
 		// use of DX (the closure pointer)
 		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}},
 		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
-		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}, clobbers: flags}},
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true},

 		// MOVQconvert converts between pointers and integers.
 		// We have a special op for this so as to not confuse GC
 		// (particularly stack maps).  It takes a memory arg so it
 		// gets correctly ordered with respect to GC safepoints.
 		// arg0=ptr/int arg1=mem, output=int/ptr
-		{name: "MOVQconvert", argLength: 2, reg: gp11nf, asm: "MOVQ"},
+		{name: "MOVQconvert", argLength: 2, reg: gp11, asm: "MOVQ"},
+		{name: "MOVLconvert", argLength: 2, reg: gp11, asm: "MOVL"}, // amd64p32 equivalent

 		// Constant flag values. For any comparison, there are 5 possible
 		// outcomes: the three from the signed total order (<,==,>) and the
@@ -545,11 +535,14 @@ func init() {
 	}

 	archs = append(archs, arch{
-		name:     "AMD64",
-		pkg:      "cmd/internal/obj/x86",
-		genfile:  "../../amd64/ssa.go",
-		ops:      AMD64ops,
-		blocks:   AMD64blocks,
-		regnames: regNamesAMD64,
+		name:            "AMD64",
+		pkg:             "cmd/internal/obj/x86",
+		genfile:         "../../amd64/ssa.go",
+		ops:             AMD64ops,
+		blocks:          AMD64blocks,
+		regnames:        regNamesAMD64,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		framepointerreg: int8(num["BP"]),
 	})
 }
--- a/Show More
+++ b/Show More