From 40b7da8422562ec169a6c59250c251cab5482180 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Fri, 22 Mar 2024 19:26:56 -0700
Subject: [PATCH] Speed up fmaf() on x86

---
 libc/tinymath/fmaf.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/libc/tinymath/fmaf.c b/libc/tinymath/fmaf.c
index 544301ca5..e48801748 100644
--- a/libc/tinymath/fmaf.c
+++ b/libc/tinymath/fmaf.c
@@ -100,6 +100,16 @@ float fmaf(float x, float y, float z)
 
 #else
 
+#ifdef __x86_64__ /* x86-64 only: try a hardware fused multiply-add before the generic code below */
+	if (X86_HAVE(FMA)) { /* three-operand FMA; NOTE(review): X86_HAVE is presumably a CPU feature check -- confirm */
+		asm("vfmadd132ss\t%1,%2,%0" : "+x"(x) : "x"(y), "x"(z)); /* x = x * y + z; x is both input and output ("+x") */
+		return x;
+	} else if (X86_HAVE(FMA4)) { /* four-operand FMA4 form, with a separate destination operand */
+		asm("vfmaddss\t%3,%2,%1,%0" : "=x"(x) : "x"(x), "x"(y), "x"(z)); /* x = x * y + z (AT&T order: addend, multiplier, multiplicand, dest) */
+		return x;
+	}
+#endif /* __x86_64__ */
+
 	/* A double has more than twice as much precision than a float,
 	   so direct double-precision arithmetic suffices, except where
 	   double rounding occurs. */