x/imagegen: fix image editing support (#13866)

- Fix panic in ollama show for image gen models (safe type assertion)
- Add vision capability for Flux2KleinPipeline models at create time
- Flatten transparent PNG images onto a white background for better results
Jeffrey Morgan
2026-01-23 15:37:17 -08:00
committed by GitHub
parent 1044b0419a
commit 66831dcf70
4 changed files with 53 additions and 5 deletions


@@ -1019,8 +1019,10 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
}
if resp.ModelInfo != nil {
arch := resp.ModelInfo["general.architecture"].(string)
arch, _ := resp.ModelInfo["general.architecture"].(string)
if arch != "" {
rows = append(rows, []string{"", "architecture", arch})
}
var paramStr string
if resp.Details.ParameterSize != "" {
@@ -1030,7 +1032,9 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
paramStr = format.HumanNumber(uint64(f))
}
}
if paramStr != "" {
rows = append(rows, []string{"", "parameters", paramStr})
}
if v, ok := resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)]; ok {
if f, ok := v.(float64); ok {

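For context, a minimal standalone sketch of why the unchecked assertion panicked and how the comma-ok form avoids it; the map contents below are hypothetical, not taken from a real ShowResponse:

package main

import "fmt"

func main() {
    // Simulates resp.ModelInfo for an image generation model that lacks a
    // "general.architecture" key (hypothetical contents).
    modelInfo := map[string]any{"general.parameter_count": 12000000000.0}

    // arch := modelInfo["general.architecture"].(string) // panics: the value is a nil interface

    // Comma-ok assertion: arch is "" when the key is missing or not a string,
    // so the row is simply skipped instead of crashing `ollama show`.
    arch, _ := modelInfo["general.architecture"].(string)
    if arch != "" {
        fmt.Println("architecture:", arch)
    }
}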

@@ -11,6 +11,8 @@ import (
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"github.com/ollama/ollama/manifest"
"github.com/ollama/ollama/progress"
@@ -209,10 +211,23 @@ func newManifestWriter(opts CreateOptions, capabilities []string) create.Manifes
return fmt.Errorf("invalid model name: %s", modelName)
}
// TODO: find a better way to detect image input support
// For now, hardcode Flux2KleinPipeline as supporting vision (image input)
caps := capabilities
modelIndex := filepath.Join(opts.ModelDir, "model_index.json")
if data, err := os.ReadFile(modelIndex); err == nil {
var cfg struct {
ClassName string `json:"_class_name"`
}
if json.Unmarshal(data, &cfg) == nil && cfg.ClassName == "Flux2KleinPipeline" {
caps = append(caps, "vision")
}
}
// Create config blob with version requirement
configData := model.ConfigV2{
ModelFormat: "safetensors",
Capabilities: capabilities,
Capabilities: caps,
Requires: MinOllamaVersion,
}
configJSON, err := json.Marshal(configData)

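A self-contained sketch of the same _class_name check; the JSON payload and the base capability list are made up for illustration, and the real code reads the payload from model_index.json in the model directory:

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // Illustrative model_index.json contents; only _class_name matters here.
    data := []byte(`{"_class_name": "Flux2KleinPipeline"}`)

    var cfg struct {
        ClassName string `json:"_class_name"`
    }
    caps := []string{"image_generation"} // hypothetical base capabilities

    // Any read or parse error simply leaves the capability list unchanged.
    if json.Unmarshal(data, &cfg) == nil && cfg.ClassName == "Flux2KleinPipeline" {
        caps = append(caps, "vision")
    }
    fmt.Println(caps) // [image_generation vision]
}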

@@ -532,8 +532,10 @@ func extractFileData(input string) (string, []api.ImageData, error) {
var imgs []api.ImageData
for _, fp := range filePaths {
// Normalize escaped spaces
// Normalize shell escapes
nfp := strings.ReplaceAll(fp, "\\ ", " ")
nfp = strings.ReplaceAll(nfp, "\\(", "(")
nfp = strings.ReplaceAll(nfp, "\\)", ")")
nfp = strings.ReplaceAll(nfp, "%20", " ")
data, err := getImageData(nfp)

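The same normalization as a runnable sketch, using strings.NewReplacer instead of chained ReplaceAll calls (equivalent here since the patterns don't overlap); the paths are hypothetical drag-and-drop inputs:

package main

import (
    "fmt"
    "strings"
)

// normalize undoes the escaping a terminal typically adds when a file is
// dragged into the prompt: backslash-escaped spaces and parentheses, plus
// percent-encoded spaces.
func normalize(fp string) string {
    r := strings.NewReplacer(`\ `, " ", `\(`, "(", `\)`, ")", "%20", " ")
    return r.Replace(fp)
}

func main() {
    fmt.Println(normalize(`/tmp/my\ photo\ \(edited\).png`)) // /tmp/my photo (edited).png
    fmt.Println(normalize("/tmp/my%20photo.png"))            // /tmp/my photo.png
}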

@@ -7,6 +7,8 @@ import (
"encoding/base64"
"fmt"
"image"
"image/color"
"image/draw"
_ "image/jpeg"
"image/png"
"os"
@@ -111,6 +113,7 @@ func clampF(v, min, max float32) float32 {
}
// DecodeImage decodes image bytes with EXIF orientation applied.
// Transparent images are composited onto a white background.
func DecodeImage(data []byte) (image.Image, error) {
orientation := readJPEGOrientation(data)
@@ -119,9 +122,33 @@ func DecodeImage(data []byte) (image.Image, error) {
return nil, err
}
img = flattenAlpha(img)
return applyOrientation(img, orientation), nil
}
// flattenAlpha composites an image onto a white background,
// removing any transparency. This is needed because image
// generation models don't handle alpha channels well.
func flattenAlpha(img image.Image) image.Image {
if _, ok := img.(*image.RGBA); !ok {
if _, ok := img.(*image.NRGBA); !ok {
// No alpha channel, return as-is
return img
}
}
bounds := img.Bounds()
dst := image.NewRGBA(bounds)
// Fill with white background
draw.Draw(dst, bounds, &image.Uniform{color.White}, image.Point{}, draw.Src)
// Composite the image on top
draw.Draw(dst, bounds, img, bounds.Min, draw.Over)
return dst
}
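A minimal, self-contained sketch of the same composite-onto-white approach, using an in-memory transparent image in place of a decoded PNG:

package main

import (
    "fmt"
    "image"
    "image/color"
    "image/draw"
)

func main() {
    // A fully transparent 2x2 source image (stand-in for a transparent PNG).
    src := image.NewNRGBA(image.Rect(0, 0, 2, 2))

    // White base via draw.Src, then the source composited with draw.Over.
    dst := image.NewRGBA(src.Bounds())
    draw.Draw(dst, dst.Bounds(), &image.Uniform{color.White}, image.Point{}, draw.Src)
    draw.Draw(dst, dst.Bounds(), src, src.Bounds().Min, draw.Over)

    // Transparent pixels now read back as opaque white.
    fmt.Println(dst.At(0, 0)) // {255 255 255 255}
}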
// readJPEGOrientation extracts EXIF orientation from JPEG bytes.
// Returns 1 (normal) for non-JPEG or if orientation not found.
func readJPEGOrientation(data []byte) int {