x/imagegen: fix image editing support (#13866)

- Fix panic in ollama show for image gen models (safe type assertion)
- Add vision capability for Flux2KleinPipeline models at create time
- Flatten transparent PNG images onto a white background for better results
Jeffrey Morgan
2026-01-23 15:37:17 -08:00
committed by GitHub
parent 1044b0419a
commit 66831dcf70
4 changed files with 53 additions and 5 deletions


@@ -1019,8 +1019,10 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
}
if resp.ModelInfo != nil {
arch := resp.ModelInfo["general.architecture"].(string)
arch, _ := resp.ModelInfo["general.architecture"].(string)
if arch != "" {
rows = append(rows, []string{"", "architecture", arch})
}
var paramStr string
if resp.Details.ParameterSize != "" {
@@ -1030,7 +1032,9 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
paramStr = format.HumanNumber(uint64(f))
}
}
if paramStr != "" {
rows = append(rows, []string{"", "parameters", paramStr})
}
if v, ok := resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)]; ok {
if f, ok := v.(float64); ok {

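For context, a minimal standalone sketch of why the unchecked assertion panicked and how the comma-ok form avoids it; the map contents below are hypothetical, not taken from a real ShowResponse:

package main

import "fmt"

func main() {
    // Simulates resp.ModelInfo for an image generation model that lacks a
    // "general.architecture" key (hypothetical contents).
    modelInfo := map[string]any{"general.parameter_count": 12000000000.0}

    // arch := modelInfo["general.architecture"].(string) // panics: the value is a nil interface

    // Comma-ok assertion: arch is "" when the key is missing or not a string,
    // so the row is simply skipped instead of crashing `ollama show`.
    arch, _ := modelInfo["general.architecture"].(string)
    if arch != "" {
        fmt.Println("architecture:", arch)
    }
}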

@@ -11,6 +11,8 @@ import (
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"github.com/ollama/ollama/manifest"
"github.com/ollama/ollama/progress"
@@ -209,10 +211,23 @@ func newManifestWriter(opts CreateOptions, capabilities []string) create.Manifes
return fmt.Errorf("invalid model name: %s", modelName)
}
// TODO: find a better way to detect image input support
// For now, hardcode Flux2KleinPipeline as supporting vision (image input)
caps := capabilities
modelIndex := filepath.Join(opts.ModelDir, "model_index.json")
if data, err := os.ReadFile(modelIndex); err == nil {
var cfg struct {
ClassName string `json:"_class_name"`
}
if json.Unmarshal(data, &cfg) == nil && cfg.ClassName == "Flux2KleinPipeline" {
caps = append(caps, "vision")
}
}
// Create config blob with version requirement
configData := model.ConfigV2{
ModelFormat: "safetensors",
Capabilities: capabilities,
Capabilities: caps,
Requires: MinOllamaVersion,
}
configJSON, err := json.Marshal(configData)

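A self-contained sketch of the same _class_name check; the JSON payload and the base capability list are made up for illustration, and the real code reads the payload from model_index.json in the model directory:

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // Illustrative model_index.json contents; only _class_name matters here.
    data := []byte(`{"_class_name": "Flux2KleinPipeline"}`)

    var cfg struct {
        ClassName string `json:"_class_name"`
    }
    caps := []string{"image_generation"} // hypothetical base capabilities

    // Any read or parse error simply leaves the capability list unchanged.
    if json.Unmarshal(data, &cfg) == nil && cfg.ClassName == "Flux2KleinPipeline" {
        caps = append(caps, "vision")
    }
    fmt.Println(caps) // [image_generation vision]
}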

@@ -532,8 +532,10 @@ func extractFileData(input string) (string, []api.ImageData, error) {
var imgs []api.ImageData
for _, fp := range filePaths {
// Normalize escaped spaces
// Normalize shell escapes
nfp := strings.ReplaceAll(fp, "\\ ", " ")
nfp = strings.ReplaceAll(nfp, "\\(", "(")
nfp = strings.ReplaceAll(nfp, "\\)", ")")
nfp = strings.ReplaceAll(nfp, "%20", " ")
data, err := getImageData(nfp)

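The same normalization as a runnable sketch, using strings.NewReplacer instead of chained ReplaceAll calls (equivalent here since the patterns don't overlap); the paths are hypothetical drag-and-drop inputs:

package main

import (
    "fmt"
    "strings"
)

// normalize undoes the escaping a terminal typically adds when a file is
// dragged into the prompt: backslash-escaped spaces and parentheses, plus
// percent-encoded spaces.
func normalize(fp string) string {
    r := strings.NewReplacer(`\ `, " ", `\(`, "(", `\)`, ")", "%20", " ")
    return r.Replace(fp)
}

func main() {
    fmt.Println(normalize(`/tmp/my\ photo\ \(edited\).png`)) // /tmp/my photo (edited).png
    fmt.Println(normalize("/tmp/my%20photo.png"))            // /tmp/my photo.png
}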

@@ -7,6 +7,8 @@ import (
"encoding/base64"
"fmt"
"image"
"image/color"
"image/draw"
_ "image/jpeg"
"image/png"
"os"
@@ -111,6 +113,7 @@ func clampF(v, min, max float32) float32 {
}
// DecodeImage decodes image bytes with EXIF orientation applied.
// Transparent images are composited onto a white background.
func DecodeImage(data []byte) (image.Image, error) {
orientation := readJPEGOrientation(data)
@@ -119,9 +122,33 @@ func DecodeImage(data []byte) (image.Image, error) {
return nil, err
}
img = flattenAlpha(img)
return applyOrientation(img, orientation), nil
}
// flattenAlpha composites an image onto a white background,
// removing any transparency. This is needed because image
// generation models don't handle alpha channels well.
func flattenAlpha(img image.Image) image.Image {
if _, ok := img.(*image.RGBA); !ok {
if _, ok := img.(*image.NRGBA); !ok {
// No alpha channel, return as-is
return img
}
}
bounds := img.Bounds()
dst := image.NewRGBA(bounds)
// Fill with white background
draw.Draw(dst, bounds, &image.Uniform{color.White}, image.Point{}, draw.Src)
// Composite the image on top
draw.Draw(dst, bounds, img, bounds.Min, draw.Over)
return dst
}
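A minimal, self-contained sketch of the same composite-onto-white approach, using an in-memory transparent image in place of a decoded PNG:

package main

import (
    "fmt"
    "image"
    "image/color"
    "image/draw"
)

func main() {
    // A fully transparent 2x2 source image (stand-in for a transparent PNG).
    src := image.NewNRGBA(image.Rect(0, 0, 2, 2))

    // White base via draw.Src, then the source composited with draw.Over.
    dst := image.NewRGBA(src.Bounds())
    draw.Draw(dst, dst.Bounds(), &image.Uniform{color.White}, image.Point{}, draw.Src)
    draw.Draw(dst, dst.Bounds(), src, src.Bounds().Min, draw.Over)

    // Transparent pixels now read back as opaque white.
    fmt.Println(dst.At(0, 0)) // {255 255 255 255}
}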
// readJPEGOrientation extracts EXIF orientation from JPEG bytes.
// Returns 1 (normal) for non-JPEG or if orientation not found.
func readJPEGOrientation(data []byte) int {