From e830a02d30f04b3297790a1e825b1e8fa8d0f0e8 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Tue, 21 Apr 2026 20:21:43 +0200
Subject: [PATCH 01/23] start gmres draft

---
 src/linalg_iterative/CMakeLists.txt           |   1 +
 .../stdlib_linalg_iterative_solvers.fypp      |  54 +++-
 ...stdlib_linalg_iterative_solvers_gmres.fypp | 278 ++++++++++++++++++
 test/linalg/test_linalg_solve_iterative.fypp  |  43 +++
 4 files changed, 375 insertions(+), 1 deletion(-)
 create mode 100644 src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp

diff --git a/src/linalg_iterative/CMakeLists.txt b/src/linalg_iterative/CMakeLists.txt
index c91188eba..989b1c951 100644
--- a/src/linalg_iterative/CMakeLists.txt
+++ b/src/linalg_iterative/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(linalg_iterative_fppFiles
     stdlib_linalg_iterative_solvers_bicgstab.fypp
     stdlib_linalg_iterative_solvers_cg.fypp
+    stdlib_linalg_iterative_solvers_gmres.fypp
     stdlib_linalg_iterative_solvers.fypp
     stdlib_linalg_iterative_solvers_pcg.fypp
     )
diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
index 6c7725f3f..f3d2a43fe 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
@@ -16,8 +16,9 @@ module stdlib_linalg_iterative_solvers
         enumerator :: stdlib_size_wksp_cg = 3
         enumerator :: stdlib_size_wksp_pcg = 4
         enumerator :: stdlib_size_wksp_bicgstab = 8
+        enumerator :: stdlib_size_wksp_gmres = 3
     end enum
-    public :: stdlib_size_wksp_cg, stdlib_size_wksp_pcg, stdlib_size_wksp_bicgstab
+    public :: stdlib_size_wksp_cg, stdlib_size_wksp_pcg, stdlib_size_wksp_bicgstab, stdlib_size_wksp_gmres
 
     enum, bind(c)
         enumerator :: pc_none = 0
@@ -161,6 +162,27 @@ module stdlib_linalg_iterative_solvers
     end interface
     public :: stdlib_solve_bicgstab_kernel
 
+    !! version: experimental
+    !!
+    !! stdlib_solve_gmres_kernel interface for the restarted generalized minimal residual method.
+    !! [Specifications](../page/specs/stdlib_linalg_iterative_solvers.html#stdlib_solve_gmres_kernel)
+    interface stdlib_solve_gmres_kernel
+        #:for k, t, s in R_KINDS_TYPES
+        module subroutine stdlib_solve_gmres_kernel_${s}$(A,M,b,x,rtol,atol,maxiter,kdim,workspace)
+            class(stdlib_linop_${s}$_type), intent(in) :: A !! linear operator
+            class(stdlib_linop_${s}$_type), intent(in) :: M !! preconditioner linear operator
+            ${t}$, intent(in) :: b(:) !! right-hand side vector
+            ${t}$, intent(inout) :: x(:) !! solution vector and initial guess
+            ${t}$, intent(in) :: rtol !! relative tolerance for convergence
+            ${t}$, intent(in) :: atol !! absolute tolerance for convergence
+            integer, intent(in) :: maxiter !! maximum number of iterations
+            integer, intent(in) :: kdim !! Krylov subspace size before restart
+            type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace !! workspace for the solver
+        end subroutine
+        #:endfor
+    end interface
+    public :: stdlib_solve_gmres_kernel
+
     !! version: experimental
     !!
     !! [Specifications](../page/specs/stdlib_linalg_iterative_solvers.html#stdlib_solve_pcg)
@@ -219,6 +241,36 @@ module stdlib_linalg_iterative_solvers
     end interface
     public :: stdlib_solve_bicgstab
 
+    !! version: experimental
+    !!
+    !! [Specifications](../page/specs/stdlib_linalg_iterative_solvers.html#stdlib_solve_gmres)
+    interface stdlib_solve_gmres
+        #:for matrix in MATRIX_TYPES
+        #:for k, t, s in R_KINDS_TYPES
+        module subroutine stdlib_solve_gmres_${matrix}$_${s}$(A,b,x,di,rtol,atol,maxiter,restart,kdim,precond,M,workspace)
+            !! linear operator matrix
+            #:if matrix == "dense"
+            ${t}$, intent(in) :: A(:,:)
+            #:else
+            type(${matrix}$_${s}$_type), intent(in) :: A
+            #:endif
+            ${t}$, intent(in) :: b(:) !! right-hand side vector
+            ${t}$, intent(inout) :: x(:) !! solution vector and initial guess
+            ${t}$, intent(in), optional :: rtol !! relative tolerance for convergence
+            ${t}$, intent(in), optional :: atol !! absolute tolerance for convergence
+            logical(int8), intent(in), optional, target :: di(:) !! dirichlet conditions mask
+            integer, intent(in), optional :: maxiter !! maximum number of iterations
+            logical, intent(in), optional :: restart !! restart flag
+            integer, intent(in), optional :: kdim !! Krylov subspace size before restart
+            integer, intent(in), optional :: precond !! preconditioner method enumerator
+            class(stdlib_linop_${s}$_type), optional, intent(in), target :: M !! preconditioner linear operator
+            type(stdlib_solver_workspace_${s}$_type), optional, intent(inout), target :: workspace !! workspace for the solver
+        end subroutine
+        #:endfor
+        #:endfor
+    end interface
+    public :: stdlib_solve_gmres
+
 contains
 
     !------------------------------------------------------------------
diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
new file mode 100644
index 000000000..4df8958e3
--- /dev/null
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -0,0 +1,278 @@
+#:include "common.fypp"
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set MATRIX_TYPES = ["dense", "CSR"]
+#:set RANKS = range(1, 2+1)
+
+submodule(stdlib_linalg_iterative_solvers) stdlib_linalg_iterative_gmres
+    use stdlib_kinds
+    use stdlib_sparse
+    use stdlib_constants
+    use stdlib_optval, only: optval
+    implicit none
+
+contains
+
+    #:for k, t, s in R_KINDS_TYPES
+    module subroutine stdlib_solve_gmres_kernel_${s}$(A,M,b,x,rtol,atol,maxiter,kdim,workspace)
+        class(stdlib_linop_${s}$_type), intent(in) :: A
+        class(stdlib_linop_${s}$_type), intent(in) :: M
+        ${t}$, intent(in) :: b(:), rtol, atol
+        ${t}$, intent(inout) :: x(:)
+        integer, intent(in) :: maxiter, kdim
+        type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace
+        integer :: i, iter, inner_iter, j
+        ${t}$ :: beta, denom, hnext, norm_sq, norm_sq0, temp, tolsq
+        ${t}$, allocatable :: cs(:), g(:), h(:,:), sn(:), x_base(:), y(:)
+
+        allocate(h(kdim+1, kdim), cs(kdim), sn(kdim), g(kdim+1), y(kdim), x_base(size(x)))
+
+        associate( r => workspace%tmp(:,1), &
+                   w => workspace%tmp(:,2), &
+                   v => workspace%tmp(:,3:kdim+3), &
+                   z => workspace%tmp(:,kdim+4:2*kdim+3) )
+
+            norm_sq0 = A%inner_product(b, b)
+            tolsq = max(rtol*rtol*norm_sq0, atol*atol)
+
+            r = b
+            call A%matvec(x, r, alpha=-one_${s}$, beta=one_${s}$, op='N')
+            norm_sq = A%inner_product(r, r)
+            if (associated(workspace%callback)) call workspace%callback(x, norm_sq, 0)
+
+            if (norm_sq <= tolsq .or. maxiter <= 0) then
+                deallocate(h, cs, sn, g, y, x_base)
+                return
+            end if
+
+            iter = 0
+            do while (iter < maxiter .and. norm_sq >= tolsq)
+                beta = sqrt(max(norm_sq, zero_${s}$))
+                if (beta <= epsilon(one_${s}$)) exit
+
+                inner_iter = min(kdim, maxiter - iter)
+                x_base = x
+                h = zero_${s}$
+                cs = zero_${s}$
+                sn = zero_${s}$
+                g = zero_${s}$
+                y = zero_${s}$
+
+                v(:,1) = r / beta
+                g(1) = beta
+
+                do j = 1, inner_iter
+                    call M%matvec(v(:,j), z(:,j), alpha=one_${s}$, beta=zero_${s}$, op='N')
+                    call A%matvec(z(:,j), w, alpha=one_${s}$, beta=zero_${s}$, op='N')
+
+                    do i = 1, j
+                        h(i,j) = A%inner_product(v(:,i), w)
+                        w = w - h(i,j) * v(:,i)
+                    end do
+
+                    hnext = sqrt(max(A%inner_product(w, w), zero_${s}$))
+                    h(j+1,j) = hnext
+                    if (hnext > epsilon(one_${s}$)) then
+                        v(:,j+1) = w / hnext
+                    else
+                        v(:,j+1) = zero_${s}$
+                    end if
+
+                    do i = 1, j - 1
+                        temp = cs(i) * h(i,j) + sn(i) * h(i+1,j)
+                        h(i+1,j) = -sn(i) * h(i,j) + cs(i) * h(i+1,j)
+                        h(i,j) = temp
+                    end do
+
+                    denom = sqrt(h(j,j) * h(j,j) + h(j+1,j) * h(j+1,j))
+                    if (denom > epsilon(one_${s}$)) then
+                        cs(j) = h(j,j) / denom
+                        sn(j) = h(j+1,j) / denom
+                    else
+                        cs(j) = one_${s}$
+                        sn(j) = zero_${s}$
+                    end if
+
+                    temp = cs(j) * h(j,j) + sn(j) * h(j+1,j)
+                    h(j+1,j) = -sn(j) * h(j,j) + cs(j) * h(j+1,j)
+                    h(j,j) = temp
+
+                    temp = cs(j) * g(j) + sn(j) * g(j+1)
+                    g(j+1) = -sn(j) * g(j) + cs(j) * g(j+1)
+                    g(j) = temp
+
+                    call upper_triangular_solve(h, g, y, j)
+                    x = x_base
+                    do i = 1, j
+                        x = x + y(i) * z(:,i)
+                    end do
+
+                    norm_sq = g(j+1) * g(j+1)
+                    iter = iter + 1
+                    if (associated(workspace%callback)) call workspace%callback(x, norm_sq, iter)
+
+                    if (norm_sq < tolsq .or. hnext <= epsilon(one_${s}$) .or. iter >= maxiter) exit
+                end do
+
+                if (norm_sq < tolsq .or. iter >= maxiter) exit
+
+                r = b
+                call A%matvec(x, r, alpha=-one_${s}$, beta=one_${s}$, op='N')
+                norm_sq = A%inner_product(r, r)
+            end do
+        end associate
+
+        deallocate(h, cs, sn, g, y, x_base)
+
+    contains
+
+        subroutine upper_triangular_solve(h, g, y, n)
+            ${t}$, intent(in) :: h(:,:), g(:)
+            ${t}$, intent(inout) :: y(:)
+            integer, intent(in) :: n
+            integer :: row
+
+            y(1:n) = g(1:n)
+            do row = n, 1, -1
+                if (row < n) y(row) = y(row) - sum(h(row,row+1:n) * y(row+1:n))
+                if (abs(h(row,row)) > epsilon(one_${s}$)) then
+                    y(row) = y(row) / h(row,row)
+                else
+                    y(row) = zero_${s}$
+                end if
+            end do
+        end subroutine
+    end subroutine
+    #:endfor
+
+    #:for matrix in MATRIX_TYPES
+    #:for k, t, s in R_KINDS_TYPES
+    module subroutine stdlib_solve_gmres_${matrix}$_${s}$(A,b,x,di,rtol,atol,maxiter,restart,kdim,precond,M,workspace)
+        #:if matrix == "dense"
+        use stdlib_linalg, only: diag
+        ${t}$, intent(in) :: A(:,:)
+        #:else
+        type(${matrix}$_${s}$_type), intent(in) :: A
+        #:endif
+        ${t}$, intent(in) :: b(:)
+        ${t}$, intent(inout) :: x(:)
+        ${t}$, intent(in), optional :: rtol, atol
+        logical(int8), intent(in), optional, target :: di(:)
+        integer, intent(in), optional :: maxiter, kdim
+        logical, intent(in), optional :: restart
+        integer, intent(in), optional :: precond
+        class(stdlib_linop_${s}$_type), optional, intent(in), target :: M
+        type(stdlib_solver_workspace_${s}$_type), optional, intent(inout), target :: workspace
+        type(stdlib_linop_${s}$_type) :: op
+        type(stdlib_linop_${s}$_type), pointer :: M_ => null()
+        type(stdlib_solver_workspace_${s}$_type), pointer :: workspace_
+        integer :: kdim_, maxiter_, n, ncols, precond_
+        ${t}$ :: rtol_, atol_
+        logical :: restart_
+        logical(int8), pointer :: di_(:)
+        ${t}$, allocatable :: diagonal(:)
+
+        n = size(b)
+        maxiter_ = optval(x=maxiter, default=n)
+        kdim_ = max(1, min(optval(x=kdim, default=min(30, n)), n))
+        restart_ = optval(x=restart, default=.true.)
+        rtol_ = optval(x=rtol, default=1.e-5_${s}$)
+        atol_ = optval(x=atol, default=epsilon(one_${s}$))
+        precond_ = optval(x=precond, default=pc_none)
+        ncols = 2 * kdim_ + stdlib_size_wksp_gmres
+
+        if (present(M)) then
+            M_ => M
+        else
+            allocate(M_)
+            allocate(diagonal(n), source=zero_${s}$)
+
+            select case(precond_)
+            case(pc_jacobi)
+                #:if matrix == "dense"
+                diagonal = diag(A)
+                #:else
+                call diag(A, diagonal)
+                #:endif
+                M_%matvec => precond_jacobi
+            case default
+                M_%matvec => precond_none
+            end select
+            where(abs(diagonal) > epsilon(zero_${s}$)) diagonal = one_${s}$ / diagonal
+        end if
+
+        op%matvec => matvec
+
+        if (present(di)) then
+            di_ => di
+        else
+            allocate(di_(n), source=.false._int8)
+        end if
+
+        if (present(workspace)) then
+            workspace_ => workspace
+        else
+            allocate(workspace_)
+        end if
+        if (.not.allocated(workspace_%tmp)) then
+            allocate(workspace_%tmp(n, ncols), source=zero_${s}$)
+        else if (size(workspace_%tmp,1) /= n .or. size(workspace_%tmp,2) < ncols) then
+            deallocate(workspace_%tmp)
+            allocate(workspace_%tmp(n, ncols), source=zero_${s}$)
+        end if
+
+        if (restart_) x = zero_${s}$
+        x = merge(b, x, di_)
+        call stdlib_solve_gmres_kernel(op, M_, b, x, rtol_, atol_, maxiter_, kdim_, workspace_)
+
+        if (.not.present(di)) deallocate(di_)
+        di_ => null()
+
+        if (.not.present(workspace)) then
+            deallocate(workspace_%tmp)
+            deallocate(workspace_)
+        end if
+        M_ => null()
+        workspace_ => null()
+
+    contains
+
+        subroutine matvec(x,y,alpha,beta,op)
+            #:if matrix == "dense"
+            use stdlib_linalg_blas, only: gemv
+            #:endif
+            ${t}$, intent(in) :: x(:)
+            ${t}$, intent(inout) :: y(:)
+            ${t}$, intent(in) :: alpha
+            ${t}$, intent(in) :: beta
+            character(1), intent(in) :: op
+            #:if matrix == "dense"
+            call gemv(op, m=size(A,1), n=size(A,2), alpha=alpha, a=A, lda=size(A,1), x=x, incx=1, beta=beta, y=y, incy=1)
+            #:else
+            call spmv(A, x, y, alpha, beta, op)
+            #:endif
+            y = merge(zero_${s}$, y, di_)
+        end subroutine
+
+        subroutine precond_none(x,y,alpha,beta,op)
+            ${t}$, intent(in) :: x(:)
+            ${t}$, intent(inout) :: y(:)
+            ${t}$, intent(in) :: alpha
+            ${t}$, intent(in) :: beta
+            character(1), intent(in) :: op
+            y = merge(zero_${s}$, x, di_)
+        end subroutine
+
+        subroutine precond_jacobi(x,y,alpha,beta,op)
+            ${t}$, intent(in) :: x(:)
+            ${t}$, intent(inout) :: y(:)
+            ${t}$, intent(in) :: alpha
+            ${t}$, intent(in) :: beta
+            character(1), intent(in) :: op
+            y = merge(zero_${s}$, diagonal * x, di_)
+        end subroutine
+    end subroutine
+    #:endfor
+    #:endfor
+
+end submodule stdlib_linalg_iterative_gmres
\ No newline at end of file
diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 8da08fcb8..13d1e1913 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -23,6 +23,7 @@ module test_linalg_solve_iterative
 
         tests = [ new_unittest("stdlib_solve_cg",test_stdlib_solve_cg), &
                   new_unittest("stdlib_solve_pcg",test_stdlib_solve_pcg), &
+                  new_unittest("stdlib_solve_gmres",test_stdlib_solve_gmres), &
                   new_unittest("stdlib_solve_bicgstab",test_stdlib_solve_bicgstab), &
                   new_unittest("stdlib_solve_bicgstab_nonsymmetric",test_stdlib_solve_bicgstab_nonsymmetric) ]
 
@@ -80,6 +81,48 @@ module test_linalg_solve_iterative
         #:endfor
     end subroutine test_stdlib_solve_pcg
 
+    subroutine test_stdlib_solve_gmres(error)
+        type(error_type), allocatable, intent(out) :: error
+
+        #:for k, t, s in R_KINDS_TYPES
+        block
+        ${t}$, parameter :: tol = 1000*epsilon(0.0_${k}$)
+        ${t}$ :: A(4,4) = reshape([${t}$ :: 5,  1,  2,  0, &
+                                           0,  4, -1,  1, &
+                                           1,  0,  3,  2, &
+                                           2, -1,  0,  6], [4,4])
+        ${t}$ :: x(4), load(4), xref(4)
+
+        xref = [1.0_${k}$, 2.0_${k}$, -1.0_${k}$, 0.5_${k}$]
+        load = matmul(A, xref)
+        x = 0.0_${k}$
+
+        call stdlib_solve_gmres(A, load, x, rtol=1.e-10_${k}$, atol=1.e-12_${k}$, kdim=2, maxiter=12)
+
+        call check(error, norm2(x-xref)<tol*norm2(xref), 'error in GMRES solver (restarted dense test)')
+        if (allocated(error)) return
+        end block
+
+        block
+        ${t}$, parameter :: tol = 1000*epsilon(0.0_${k}$)
+        ${t}$ :: A(3,3) = reshape([${t}$ :: 10,  1,  2, &
+                                            1, 10,  3, &
+                                            2,  3, 10], [3,3])
+        ${t}$ :: x(3), load(3), xref(3)
+
+        xref = [(137._${k}$/218._${k}$), -(9._${k}$/109._${k}$), (87._${k}$/218._${k}$)]
+        load = [7.0_${k}$, 1.0_${k}$, 5.0_${k}$]
+        x = [0.2_${k}$, -0.1_${k}$, 0.3_${k}$]
+
+        call stdlib_solve_gmres(A, load, x, rtol=1.e-10_${k}$, precond=pc_jacobi)
+
+        call check(error, norm2(x-xref)<tol*norm2(xref), 'error in GMRES solver (Jacobi preconditioned test)')
+        if (allocated(error)) return
+        end block
+
+        #:endfor
+    end subroutine test_stdlib_solve_gmres
+
     subroutine test_stdlib_solve_bicgstab(error)
         type(error_type), allocatable, intent(out) :: error
 

From eb6bfc66f97d87dde3f51ba0e380d249147558db Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 23 Apr 2026 20:55:08 +0200
Subject: [PATCH 02/23] add comments

---
 .../stdlib_linalg_iterative_solvers_gmres.fypp         | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index 4df8958e3..97a52642f 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -32,9 +32,11 @@ contains
                    v => workspace%tmp(:,3:kdim+3), &
                    z => workspace%tmp(:,kdim+4:2*kdim+3) )
 
+            ! Initialize convergence targets from the right-hand side norm.
             norm_sq0 = A%inner_product(b, b)
             tolsq = max(rtol*rtol*norm_sq0, atol*atol)
 
+            ! Form the initial residual and report the starting iterate.
             r = b
             call A%matvec(x, r, alpha=-one_${s}$, beta=one_${s}$, op='N')
             norm_sq = A%inner_product(r, r)
@@ -47,6 +49,7 @@ contains
 
             iter = 0
             do while (iter < maxiter .and. norm_sq >= tolsq)
+                ! Start a new GMRES cycle from the current residual.
                 beta = sqrt(max(norm_sq, zero_${s}$))
                 if (beta <= epsilon(one_${s}$)) exit
 
@@ -58,10 +61,12 @@ contains
                 g = zero_${s}$
                 y = zero_${s}$
 
+                ! Initialize the Krylov basis and least-squares right-hand side.
                 v(:,1) = r / beta
                 g(1) = beta
 
                 do j = 1, inner_iter
+                    ! Run Arnoldi with the preconditioned basis vector.
                     call M%matvec(v(:,j), z(:,j), alpha=one_${s}$, beta=zero_${s}$, op='N')
                     call A%matvec(z(:,j), w, alpha=one_${s}$, beta=zero_${s}$, op='N')
 
@@ -78,12 +83,14 @@ contains
                         v(:,j+1) = zero_${s}$
                     end if
 
+                    ! Apply the previously accumulated Givens rotations.
                     do i = 1, j - 1
                         temp = cs(i) * h(i,j) + sn(i) * h(i+1,j)
                         h(i+1,j) = -sn(i) * h(i,j) + cs(i) * h(i+1,j)
                         h(i,j) = temp
                     end do
 
+                    ! Build and apply the next Givens rotation.
                     denom = sqrt(h(j,j) * h(j,j) + h(j+1,j) * h(j+1,j))
                     if (denom > epsilon(one_${s}$)) then
                         cs(j) = h(j,j) / denom
@@ -101,12 +108,14 @@ contains
                     g(j+1) = -sn(j) * g(j) + cs(j) * g(j+1)
                     g(j) = temp
 
+                    ! Solve the reduced system and update the current iterate.
                     call upper_triangular_solve(h, g, y, j)
                     x = x_base
                     do i = 1, j
                         x = x + y(i) * z(:,i)
                     end do
 
+                    ! Track the residual norm estimate for convergence.
                     norm_sq = g(j+1) * g(j+1)
                     iter = iter + 1
                     if (associated(workspace%callback)) call workspace%callback(x, norm_sq, iter)
@@ -116,6 +125,7 @@ contains
 
                 if (norm_sq < tolsq .or. iter >= maxiter) exit
 
+                ! Refresh the residual before the next restarted cycle.
                 r = b
                 call A%matvec(x, r, alpha=-one_${s}$, beta=one_${s}$, op='N')
                 norm_sq = A%inner_product(r, r)

From 877f4354c7f05e292ac733f82acbd118f0889d31 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 23 Apr 2026 21:10:30 +0200
Subject: [PATCH 03/23] remove kdim from test

---
 test/linalg/test_linalg_solve_iterative.fypp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 13d1e1913..08f0cf526 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -97,8 +97,8 @@ module test_linalg_solve_iterative
         load = matmul(A, xref)
         x = 0.0_${k}$
 
-        call stdlib_solve_gmres(A, load, x, rtol=1.e-10_${k}$, atol=1.e-12_${k}$, kdim=2, maxiter=12)
-
+        call stdlib_solve_gmres(A, load, x, rtol=tol, maxiter=12)
+        
         call check(error, norm2(x-xref)<tol*norm2(xref), 'error in GMRES solver (restarted dense test)')
         if (allocated(error)) return
         end block

From af06e0d257296cbfdfacd5553f2564d2389702f2 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 23 Apr 2026 21:23:38 +0200
Subject: [PATCH 04/23] use dot_product

---
 src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index 97a52642f..d5119e227 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -144,7 +144,7 @@ contains
 
             y(1:n) = g(1:n)
             do row = n, 1, -1
-                if (row < n) y(row) = y(row) - sum(h(row,row+1:n) * y(row+1:n))
+                if (row < n) y(row) = y(row) - dot_product(h(row,row+1:n) , y(row+1:n))
                 if (abs(h(row,row)) > epsilon(one_${s}$)) then
                     y(row) = y(row) / h(row,row)
                 else

From bf59c029f3b370aff412f5f211f16fe42d798307 Mon Sep 17 00:00:00 2001
From: AlexanderGSC <alexander.gsc@gmail.com>
Date: Fri, 8 May 2026 13:31:48 +0200
Subject: [PATCH 05/23] gmres: add Hilbert matrix test

---
 test/linalg/test_linalg_solve_iterative.fypp | 30 +++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 08f0cf526..615be1862 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -24,6 +24,7 @@ module test_linalg_solve_iterative
         tests = [ new_unittest("stdlib_solve_cg",test_stdlib_solve_cg), &
                   new_unittest("stdlib_solve_pcg",test_stdlib_solve_pcg), &
                   new_unittest("stdlib_solve_gmres",test_stdlib_solve_gmres), &
+                  new_unittest("stdlib_solve_gmres_hilbert",test_stdlib_solve_gmres_hilbert), &
                   new_unittest("stdlib_solve_bicgstab",test_stdlib_solve_bicgstab), &
                   new_unittest("stdlib_solve_bicgstab_nonsymmetric",test_stdlib_solve_bicgstab_nonsymmetric) ]
 
@@ -123,6 +124,33 @@ module test_linalg_solve_iterative
         #:endfor
     end subroutine test_stdlib_solve_gmres
 
+    subroutine test_stdlib_solve_gmres_hilbert(error)
+        type(error_type), allocatable, intent(out) :: error
+        integer, parameter :: n = 10
+        integer :: r, c
+        #:for k, t, s in R_KINDS_TYPES
+        block
+            ${t}$, parameter :: tol = 1000*epsilon(0.0_${k}$)
+            ${t}$ :: A(n, n), b(n), x(n), x_ref(n)
+            do r = 1, n
+                do c = 1, n
+                    A(r, c) = 1.0_${k}$ / real(r + c - 1, ${k}$)
+                end do
+            end do
+
+            x_ref = 1.0_${k}$
+            b = matmul(A, x_ref)
+            x = 0.0_${k}$
+
+            call stdlib_solve_gmres(A, b, x, rtol=tol, maxiter=n)
+            
+            call check(error, norm2(x-x_ref) < tol*norm2(x_ref), & 
+            'error in GMRES: Hilbert matrix')
+            if (allocated(error)) return
+        end block
+        #:endfor
+    end subroutine test_stdlib_solve_gmres_hilbert
+
     subroutine test_stdlib_solve_bicgstab(error)
         type(error_type), allocatable, intent(out) :: error
 
@@ -265,4 +293,4 @@ program test_solve_iterative
          write(error_unit, '(i0, 1x, a)') stat, "test(s) failed!"
          error stop
      end if
-end program test_solve_iterative
+end program test_solve_iterative
\ No newline at end of file

From da90ab2627ba9b5a3c725130f448d7ec2a7bc0eb Mon Sep 17 00:00:00 2001
From: AlexanderGSC <alexander.gsc@gmail.com>
Date: Sun, 10 May 2026 18:16:47 +0200
Subject: [PATCH 06/23] linalg: add MGS reorthogonalization to GMRES

---
 ...stdlib_linalg_iterative_solvers_gmres.fypp | 14 +++++++++-----
 test/linalg/test_linalg_solve_iterative.fypp  | 19 +++++++++++++++----
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index d5119e227..f9e83c699 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -21,8 +21,8 @@ contains
         ${t}$, intent(inout) :: x(:)
         integer, intent(in) :: maxiter, kdim
         type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace
-        integer :: i, iter, inner_iter, j
-        ${t}$ :: beta, denom, hnext, norm_sq, norm_sq0, temp, tolsq
+        integer :: i, iter, inner_iter, j, k
+        ${t}$ :: beta, denom, hnext, htmp, norm_sq, norm_sq0, temp, tolsq
         ${t}$, allocatable :: cs(:), g(:), h(:,:), sn(:), x_base(:), y(:)
 
         allocate(h(kdim+1, kdim), cs(kdim), sn(kdim), g(kdim+1), y(kdim), x_base(size(x)))
@@ -70,9 +70,13 @@ contains
                     call M%matvec(v(:,j), z(:,j), alpha=one_${s}$, beta=zero_${s}$, op='N')
                     call A%matvec(z(:,j), w, alpha=one_${s}$, beta=zero_${s}$, op='N')
 
-                    do i = 1, j
-                        h(i,j) = A%inner_product(v(:,i), w)
-                        w = w - h(i,j) * v(:,i)
+                ! Modified Gram Schmidt (MGSR) 
+                    do k = 1, 2 ! reorthogonalization
+                        do i = 1, j
+                            htmp   = A%inner_product(v(:,i), w)
+                            h(i,j) = h(i,j) + htmp
+                            w      = w - htmp*v(:,i)
+                        end do
                     end do
 
                     hnext = sqrt(max(A%inner_product(w, w), zero_${s}$))
diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 615be1862..b0ccccc54 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -125,12 +125,24 @@ module test_linalg_solve_iterative
     end subroutine test_stdlib_solve_gmres
 
     subroutine test_stdlib_solve_gmres_hilbert(error)
+    ! This test uses a Hilbert matrix (n=10) to validate the numerical stability 
+    ! of the GMRES solver. The Hilbert matrix is notoriously ill-conditioned 
+    ! (cond(A) ~ 10^13 for n=10), making it an excellent benchmark for 
+    ! orthogonality maintenance. By setting rtol=0 and maxiter=n, we force 
+    ! the construction of the full Krylov basis, which only converges to 
+    ! the reference solution if the MGS with reorthogonalization 
+    ! preserves the basis' orthogonality.
         type(error_type), allocatable, intent(out) :: error
         integer, parameter :: n = 10
         integer :: r, c
         #:for k, t, s in R_KINDS_TYPES
         block
-            ${t}$, parameter :: tol = 1000*epsilon(0.0_${k}$)
+            #:if k == 'sp'
+            ${t}$, parameter :: tol = 1.e-2_${k}$
+            #:else
+            ${t}$, parameter :: tol = 1.e-5_${k}$
+            #:endif
+            
             ${t}$ :: A(n, n), b(n), x(n), x_ref(n)
             do r = 1, n
                 do c = 1, n
@@ -142,10 +154,9 @@ module test_linalg_solve_iterative
             b = matmul(A, x_ref)
             x = 0.0_${k}$
 
-            call stdlib_solve_gmres(A, b, x, rtol=tol, maxiter=n)
-            
+            call stdlib_solve_gmres(A, b, x, rtol=0.0_${k}$, maxiter=n)
             call check(error, norm2(x-x_ref) < tol*norm2(x_ref), & 
-            'error in GMRES: Hilbert matrix')
+            'error in GMRES ${k}$: Hilbert matrix')
             if (allocated(error)) return
         end block
         #:endfor

From 38048ce947ef0076e045320754f7132992ee12f8 Mon Sep 17 00:00:00 2001
From: AlexanderGSC <alexander.gsc@gmail.com>
Date: Tue, 12 May 2026 18:59:33 +0200
Subject: [PATCH 07/23] fix: rename loop variable to avoid fypp collision

---
 .../stdlib_linalg_iterative_solvers_gmres.fypp                | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index f9e83c699..bfd26d12d 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -21,7 +21,7 @@ contains
         ${t}$, intent(inout) :: x(:)
         integer, intent(in) :: maxiter, kdim
         type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace
-        integer :: i, iter, inner_iter, j, k
+        integer :: i, iter, inner_iter, j, iorth
         ${t}$ :: beta, denom, hnext, htmp, norm_sq, norm_sq0, temp, tolsq
         ${t}$, allocatable :: cs(:), g(:), h(:,:), sn(:), x_base(:), y(:)
 
@@ -71,7 +71,7 @@ contains
                     call A%matvec(z(:,j), w, alpha=one_${s}$, beta=zero_${s}$, op='N')
 
                 ! Modified Gram Schmidt (MGSR) 
-                    do k = 1, 2 ! reorthogonalization
+                    do iorth = 1, 2 ! reorthogonalization
                         do i = 1, j
                             htmp   = A%inner_product(v(:,i), w)
                             h(i,j) = h(i,j) + htmp

From 36f81b58be12ca6eb99b1e26f7684b8d6301993a Mon Sep 17 00:00:00 2001
From: AlexanderGSC <alexander.gsc@gmail.com>
Date: Tue, 12 May 2026 21:17:48 +0200
Subject: [PATCH 08/23] fix: hilbert sp tolerance test

---
 test/linalg/test_linalg_solve_iterative.fypp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index b0ccccc54..eaca901d9 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -138,7 +138,7 @@ module test_linalg_solve_iterative
         #:for k, t, s in R_KINDS_TYPES
         block
             #:if k == 'sp'
-            ${t}$, parameter :: tol = 1.e-2_${k}$
+            ${t}$, parameter :: tol = 1.e-1_${k}$
             #:else
             ${t}$, parameter :: tol = 1.e-5_${k}$
             #:endif

From 87e9f3290dc3fda4737de5b2f61f1ff07485b2ab Mon Sep 17 00:00:00 2001
From: AlexanderGSC <alexander.gsc@gmail.com>
Date: Tue, 12 May 2026 21:49:04 +0200
Subject: [PATCH 09/23] fix: hilbert sp tolerance test

---
 test/linalg/test_linalg_solve_iterative.fypp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index eaca901d9..42fc6aaeb 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -138,7 +138,7 @@ module test_linalg_solve_iterative
         #:for k, t, s in R_KINDS_TYPES
         block
             #:if k == 'sp'
-            ${t}$, parameter :: tol = 1.e-1_${k}$
+            ${t}$, parameter :: tol = 1.e-0_${k}$
             #:else
             ${t}$, parameter :: tol = 1.e-5_${k}$
             #:endif

From b9a99e1d90e909a41d55327cba8ba0b86d1141b2 Mon Sep 17 00:00:00 2001
From: AlexanderGSC <alexander.gsc@gmail.com>
Date: Wed, 13 May 2026 10:30:37 +0200
Subject: [PATCH 10/23] fix: hilbert dp tolerance test

---
 test/linalg/test_linalg_solve_iterative.fypp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 42fc6aaeb..8eb889047 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -137,10 +137,11 @@ module test_linalg_solve_iterative
         integer :: r, c
         #:for k, t, s in R_KINDS_TYPES
         block
-            #:if k == 'sp'
+            !Hilbert sp is almost useless. 
+            #:if k == 'sp' 
             ${t}$, parameter :: tol = 1.e-0_${k}$
             #:else
-            ${t}$, parameter :: tol = 1.e-5_${k}$
+            ${t}$, parameter :: tol = 1.e-4_${k}$
             #:endif
             
             ${t}$ :: A(n, n), b(n), x(n), x_ref(n)

From 5fa5583ef84effa34b4cd81c8b956f59cd842e0a Mon Sep 17 00:00:00 2001
From: AlexanderGSC <alexander.gsc@gmail.com>
Date: Wed, 13 May 2026 11:04:25 +0200
Subject: [PATCH 11/23] fix: hilbert dp tolerance test for Intel 2024 1 cmake

---
 test/linalg/test_linalg_solve_iterative.fypp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 8eb889047..1c9e8c9f9 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -141,7 +141,7 @@ module test_linalg_solve_iterative
             #:if k == 'sp' 
             ${t}$, parameter :: tol = 1.e-0_${k}$
             #:else
-            ${t}$, parameter :: tol = 1.e-4_${k}$
+            ${t}$, parameter :: tol = 1.e-3_${k}$
             #:endif
             
             ${t}$ :: A(n, n), b(n), x(n), x_ref(n)

From 54769fef2d65ebc0a56b5f7eb6db8234614c320a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Alves?=
 <102541118+jalvesz@users.noreply.github.com>
Date: Mon, 18 May 2026 08:47:57 +0200
Subject: [PATCH 12/23] Update test/linalg/test_linalg_solve_iterative.fypp

Co-authored-by: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
---
 test/linalg/test_linalg_solve_iterative.fypp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 1c9e8c9f9..839e98dcf 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -92,9 +92,8 @@ module test_linalg_solve_iterative
                                            0,  4, -1,  1, &
                                            1,  0,  3,  2, &
                                            2, -1,  0,  6], [4,4])
-        ${t}$ :: x(4), load(4), xref(4)
-
-        xref = [1.0_${k}$, 2.0_${k}$, -1.0_${k}$, 0.5_${k}$]
+        ${t}$ :: x(4), load(4)
+        ${t}$, parameter :: xref(*) = [1.0_${k}$, 2.0_${k}$, -1.0_${k}$, 0.5_${k}$]
         load = matmul(A, xref)
         x = 0.0_${k}$
 

From cce736827e85cbf3454659261a081991bda0c54c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Alves?=
 <102541118+jalvesz@users.noreply.github.com>
Date: Mon, 18 May 2026 08:48:09 +0200
Subject: [PATCH 13/23] Update test/linalg/test_linalg_solve_iterative.fypp

Co-authored-by: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
---
 test/linalg/test_linalg_solve_iterative.fypp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 839e98dcf..f43f121e2 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -108,9 +108,9 @@ module test_linalg_solve_iterative
         ${t}$ :: A(3,3) = reshape([${t}$ :: 10,  1,  2, &
                                             1, 10,  3, &
                                             2,  3, 10], [3,3])
-        ${t}$ :: x(3), load(3), xref(3)
-
-        xref = [(137._${k}$/218._${k}$), -(9._${k}$/109._${k}$), (87._${k}$/218._${k}$)]
+        ${t}$ :: x(3)
+        ${t}$, parameter :: xref(:) = [(137._${k}$/218._${k}$), -(9._${k}$/109._${k}$), (87._${k}$/218._${k}$)]
+        ${t}$, parameter :: load(*) = [7.0_${k}$, 1.0_${k}$, 5.0_${k}$]
         load = [7.0_${k}$, 1.0_${k}$, 5.0_${k}$]
         x = [0.2_${k}$, -0.1_${k}$, 0.3_${k}$]
 

From 450e7c04db9ddeafd9471e09406d5ad2d24cdf5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Alves?=
 <102541118+jalvesz@users.noreply.github.com>
Date: Mon, 18 May 2026 08:48:30 +0200
Subject: [PATCH 14/23] Update test/linalg/test_linalg_solve_iterative.fypp

Co-authored-by: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
---
 test/linalg/test_linalg_solve_iterative.fypp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index f43f121e2..1fa83a5e1 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -105,7 +105,7 @@ module test_linalg_solve_iterative
 
         block
         ${t}$, parameter :: tol = 1000*epsilon(0.0_${k}$)
-        ${t}$ :: A(3,3) = reshape([${t}$ :: 10,  1,  2, &
+        ${t}$, parameter :: A(3,3) = reshape([${t}$ :: 10,  1,  2, &
                                             1, 10,  3, &
                                             2,  3, 10], [3,3])
         ${t}$ :: x(3)

From b35926e057fc6e2d2b9883a9c18e9c3a81e866f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Alves?=
 <102541118+jalvesz@users.noreply.github.com>
Date: Mon, 18 May 2026 08:48:41 +0200
Subject: [PATCH 15/23] Update test/linalg/test_linalg_solve_iterative.fypp

Co-authored-by: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
---
 test/linalg/test_linalg_solve_iterative.fypp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 1fa83a5e1..374551d71 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -88,7 +88,7 @@ module test_linalg_solve_iterative
         #:for k, t, s in R_KINDS_TYPES
         block
         ${t}$, parameter :: tol = 1000*epsilon(0.0_${k}$)
-        ${t}$ :: A(4,4) = reshape([${t}$ :: 5,  1,  2,  0, &
+        ${t}$, parameter :: A(4,4) = reshape([${t}$ :: 5,  1,  2,  0, &
                                            0,  4, -1,  1, &
                                            1,  0,  3,  2, &
                                            2, -1,  0,  6], [4,4])

From 902054c4285291407532a63e90142e885939045d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Alves?=
 <102541118+jalvesz@users.noreply.github.com>
Date: Mon, 18 May 2026 08:48:53 +0200
Subject: [PATCH 16/23] Update test/linalg/test_linalg_solve_iterative.fypp

Co-authored-by: Jeremie Vandenplas <jeremie.vandenplas@gmail.com>
---
 test/linalg/test_linalg_solve_iterative.fypp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 374551d71..6fb93ea49 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -143,7 +143,8 @@ module test_linalg_solve_iterative
             ${t}$, parameter :: tol = 1.e-3_${k}$
             #:endif
             
-            ${t}$ :: A(n, n), b(n), x(n), x_ref(n)
+            ${t}$ :: A(n, n), b(n), x(n)
+            ${t}$, parameter :: x_ref(n) = 1.0_${k}$
             do r = 1, n
                 do c = 1, n
                     A(r, c) = 1.0_${k}$ / real(r + c - 1, ${k}$)

From 825fd51e31d85f0ad03e822d04614e87a63fd4a8 Mon Sep 17 00:00:00 2001
From: Jose Alves <jose.alves@transvalor.com>
Date: Mon, 18 May 2026 09:53:15 +0200
Subject: [PATCH 17/23] fix declarations

---
 test/linalg/test_linalg_solve_iterative.fypp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 6fb93ea49..6b2b07016 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -109,9 +109,8 @@ module test_linalg_solve_iterative
                                             1, 10,  3, &
                                             2,  3, 10], [3,3])
         ${t}$ :: x(3)
-        ${t}$, parameter :: xref(:) = [(137._${k}$/218._${k}$), -(9._${k}$/109._${k}$), (87._${k}$/218._${k}$)]
-        ${t}$, parameter :: load(*) = [7.0_${k}$, 1.0_${k}$, 5.0_${k}$]
-        load = [7.0_${k}$, 1.0_${k}$, 5.0_${k}$]
+        ${t}$, parameter :: xref(3) = [(137._${k}$/218._${k}$), -(9._${k}$/109._${k}$), (87._${k}$/218._${k}$)]
+        ${t}$, parameter :: load(3) = [7.0_${k}$, 1.0_${k}$, 5.0_${k}$]
         x = [0.2_${k}$, -0.1_${k}$, 0.3_${k}$]
 
         call stdlib_solve_gmres(A, load, x, rtol=1.e-10_${k}$, precond=pc_jacobi)
@@ -124,13 +123,13 @@ module test_linalg_solve_iterative
     end subroutine test_stdlib_solve_gmres
 
     subroutine test_stdlib_solve_gmres_hilbert(error)
-    ! This test uses a Hilbert matrix (n=10) to validate the numerical stability 
-    ! of the GMRES solver. The Hilbert matrix is notoriously ill-conditioned 
-    ! (cond(A) ~ 10^13 for n=10), making it an excellent benchmark for 
-    ! orthogonality maintenance. By setting rtol=0 and maxiter=n, we force 
-    ! the construction of the full Krylov basis, which only converges to 
-    ! the reference solution if the MGS with reorthogonalization 
-    ! preserves the basis' orthogonality.
+        ! This test uses a Hilbert matrix (n=10) to validate the numerical stability 
+        ! of the GMRES solver. The Hilbert matrix is notoriously ill-conditioned 
+        ! (cond(A) ~ 10^13 for n=10), making it an excellent benchmark for 
+        ! orthogonality maintenance. By setting rtol=0 and maxiter=n, we force 
+        ! the construction of the full Krylov basis, which only converges to 
+        ! the reference solution if the MGS with reorthogonalization 
+        ! preserves the basis' orthogonality.
         type(error_type), allocatable, intent(out) :: error
         integer, parameter :: n = 10
         integer :: r, c
@@ -151,7 +150,6 @@ module test_linalg_solve_iterative
                 end do
             end do
 
-            x_ref = 1.0_${k}$
             b = matmul(A, x_ref)
             x = 0.0_${k}$
 

From 6ebfe6bc9e6cbd9af67424a9d1b7e053799611ec Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Mon, 15 Jun 2026 21:59:28 +0200
Subject: [PATCH 18/23] make gmres implementation flexible by allowing
 switching between memory efficiency or runtime speed

---
 doc/specs/stdlib_linalg_iterative_solvers.md  | 94 ++++++++++++++++++-
 src/linalg_iterative/CMakeLists.txt           |  6 +-
 .../stdlib_linalg_iterative_solvers.fypp      |  6 +-
 ...stdlib_linalg_iterative_solvers_gmres.fypp | 78 ++++++++-------
 test/linalg/test_linalg_solve_iterative.fypp  | 16 ++++
 5 files changed, 164 insertions(+), 36 deletions(-)

diff --git a/doc/specs/stdlib_linalg_iterative_solvers.md b/doc/specs/stdlib_linalg_iterative_solvers.md
index 6cebb61aa..9a856029d 100644
--- a/doc/specs/stdlib_linalg_iterative_solvers.md
+++ b/doc/specs/stdlib_linalg_iterative_solvers.md
@@ -348,4 +348,96 @@ The method uses 8 auxiliary vectors internally, requiring more memory than simpl
 #### Example 2
 ```fortran
 {!example/linalg/example_solve_bicgstab_wilkinson.f90!}
-```
\ No newline at end of file
+```
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `stdlib_solve_gmres_kernel` subroutine
+
+#### Description
+
+Implements the restarted Generalized Minimal Residual (GMRES) method for solving the linear system \( Ax = b \), where \( A \) is a general linear operator defined via the `stdlib_linop` type. This is the core implementation, allowing custom matrix types, custom preconditioners, and user-managed workspace.
+
+#### Syntax
+
+`call ` [[stdlib_linalg_iterative_solvers(module):stdlib_solve_gmres_kernel(interface)]] ` (A, M, b, x, rtol, atol, maxiter, kdim, workspace, compact)`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Subroutine
+
+#### Argument(s)
+
+`A`: `class(stdlib_linop_<kind>_type)` defining the linear operator. This argument is `intent(in)`.
+
+`M`: `class(stdlib_linop_<kind>_type)` defining the preconditioner linear operator. This argument is `intent(in)`.
+
+`b`: 1-D array of `real(<kind>)` defining the loading conditions of the linear system. This argument is `intent(in)`.
+
+`x`: 1-D array of `real(<kind>)` which serves as the input initial guess and the output solution. This argument is `intent(inout)`.
+
+`rtol` and `atol`: scalars of type `real(<kind>)` specifying the convergence test. For convergence, the following criterion is used \( || b - Ax ||^2 <= max(rtol^2 * || b ||^2 , atol^2 ) \). These arguments are `intent(in)`.
+
+`maxiter`: scalar of type `integer` defining the maximum allowed number of iterations. This argument is `intent(in)`.
+
+`kdim`: scalar of type `integer` defining the Krylov subspace size before restart. This argument is `intent(in)`.
+
+`workspace`: scalar derived type of `type(stdlib_solver_workspace_<kind>_type)` holding the work array for the solver. This argument is `intent(inout)`.
+
+`compact`: scalar of type `logical`. If true, GMRES stores only one preconditioned basis vector at a time and rebuilds the cycle-end correction through the preconditioner, reducing memory usage. If false, GMRES caches the full preconditioned basis for a faster cycle-end update at the cost of additional memory. Default is `.true.`. This argument is `intent(in)`.
+
+#### Note
+
+GMRES trades increasing memory and orthogonalization cost for robust convergence on general non-symmetric systems. The `compact` option allows choosing between a lower-memory layout and a slightly faster restart update.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `stdlib_solve_gmres` subroutine
+
+#### Description
+
+Provides a user-friendly interface to the restarted GMRES method for solving \( Ax = b \), supporting `dense` and `CSR_<kind>_type` matrices. GMRES is suitable for general non-symmetric systems and supports optional preconditioning, workspace reuse, and a storage-mode choice for balancing memory use against cycle-end update speed.
+
+#### Syntax
+
+`call ` [[stdlib_linalg_iterative_solvers(module):stdlib_solve_gmres(interface)]] ` (A, b, x [, di, rtol, atol, maxiter, restart, kdim, precond, M, workspace, compact])`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Subroutine
+
+#### Argument(s)
+
+`A`: `dense` or `CSR_<kind>_type` matrix defining the linear system. This argument is `intent(in)`.
+
+`b`: 1-D array of `real(<kind>)` defining the loading conditions of the linear system. This argument is `intent(in)`.
+
+`x`: 1-D array of `real(<kind>)` which serves as the input initial guess and the output solution. This argument is `intent(inout)`.
+
+`di` (optional): 1-D mask array of type `logical(int8)` defining the degrees of freedom subject to dirichlet boundary conditions. The actual boundary condition values should be stored in the `b` load array. This argument is `intent(in)`.
+
+`rtol` and `atol` (optional): scalars of type `real(<kind>)` specifying the convergence test. For convergence, the following criterion is used \( || b - Ax ||^2 <= max(rtol^2 * || b ||^2 , atol^2 ) \). Default values are `rtol=1.e-5` and `atol=epsilon(1._<kind>)`. These arguments are `intent(in)`.
+
+`maxiter` (optional): scalar of type `integer` defining the maximum allowed number of iterations. If no value is given, a default of `N` is set, where `N = size(b)`. This argument is `intent(in)`.
+
+`restart` (optional): scalar of type `logical` indicating whether to restart the iteration with zero initial guess. Default is `.true.`. This argument is `intent(in)`.
+
+`kdim` (optional): scalar of type `integer` defining the Krylov subspace size before restart. If no value is given, a default of `min(30, N)` is used. This argument is `intent(in)`.
+
+`precond` (optional): scalar of type `integer` enabling to switch among the default preconditioners available with the following enum (`pc_none`, `pc_jacobi`). If no value is given, no preconditioning will be applied. This argument is `intent(in)`.
+
+`M` (optional): scalar derived type of `class(stdlib_linop_<kind>_type)` defining a custom preconditioner linear operator. If given, `precond` will have no effect, and a pointer is set to this custom preconditioner. This argument is `intent(in)`.
+
+`workspace` (optional): scalar derived type of `type(stdlib_solver_workspace_<kind>_type)` holding the work temporal array for the solver. If the user passes its own `workspace`, then internally a pointer is set to it, otherwise, memory will be internally allocated and deallocated before exiting the procedure. This argument is `intent(inout)`.
+
+`compact` (optional): scalar of type `logical`. If true, GMRES stores only one preconditioned basis vector per iteration cycle and minimizes memory use. If false, GMRES caches the full preconditioned basis to speed up the cycle-end update. Default is `.true.`. This argument is `intent(in)`.
+
+#### Note
+
+GMRES is especially useful for general non-symmetric systems where CG and PCG do not apply. Its main cost is the Krylov basis storage and orthogonalization work within each restart cycle. With `compact=.true.`, the solver reduces workspace requirements significantly. With `compact=.false.`, it retains the previous cached-basis update strategy.
\ No newline at end of file
diff --git a/src/linalg_iterative/CMakeLists.txt b/src/linalg_iterative/CMakeLists.txt
index 989b1c951..adea11ca4 100644
--- a/src/linalg_iterative/CMakeLists.txt
+++ b/src/linalg_iterative/CMakeLists.txt
@@ -14,4 +14,8 @@ set(linalg_iterative_f90Files
 
 configure_stdlib_target(${PROJECT_NAME}_linalg_iterative linalg_iterative_f90Files linalg_iterative_fppFiles linalg_iterative_cppFiles)
 
-target_link_libraries(${PROJECT_NAME}_linalg_iterative PUBLIC ${PROJECT_NAME}_linalg ${PROJECT_NAME}_sparse)
+target_link_libraries(${PROJECT_NAME}_linalg_iterative 
+                        PUBLIC 
+                        ${PROJECT_NAME}_blas
+                        ${PROJECT_NAME}_linalg 
+                        ${PROJECT_NAME}_sparse)
diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
index f3d2a43fe..b91c19246 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
@@ -168,7 +168,7 @@ module stdlib_linalg_iterative_solvers
     !! [Specifications](../page/specs/stdlib_linalg_iterative_solvers.html#stdlib_solve_gmres_kernel)
     interface stdlib_solve_gmres_kernel
         #:for k, t, s in R_KINDS_TYPES
-        module subroutine stdlib_solve_gmres_kernel_${s}$(A,M,b,x,rtol,atol,maxiter,kdim,workspace)
+        module subroutine stdlib_solve_gmres_kernel_${s}$(A,M,b,x,rtol,atol,maxiter,kdim,workspace,compact)
             class(stdlib_linop_${s}$_type), intent(in) :: A !! linear operator
             class(stdlib_linop_${s}$_type), intent(in) :: M !! preconditioner linear operator
             ${t}$, intent(in) :: b(:) !! right-hand side vector
@@ -178,6 +178,7 @@ module stdlib_linalg_iterative_solvers
             integer, intent(in) :: maxiter !! maximum number of iterations
             integer, intent(in) :: kdim !! Krylov subspace size before restart
             type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace !! workspace for the solver
+            logical, intent(in) :: compact !! keep only one preconditioned basis vector and rebuild the cycle update at restart
         end subroutine
         #:endfor
     end interface
@@ -247,7 +248,7 @@ module stdlib_linalg_iterative_solvers
     interface stdlib_solve_gmres
         #:for matrix in MATRIX_TYPES
         #:for k, t, s in R_KINDS_TYPES
-        module subroutine stdlib_solve_gmres_${matrix}$_${s}$(A,b,x,di,rtol,atol,maxiter,restart,kdim,precond,M,workspace)
+        module subroutine stdlib_solve_gmres_${matrix}$_${s}$(A,b,x,di,rtol,atol,maxiter,restart,kdim,precond,M,workspace,compact)
             !! linear operator matrix
             #:if matrix == "dense"
             ${t}$, intent(in) :: A(:,:)
@@ -265,6 +266,7 @@ module stdlib_linalg_iterative_solvers
             integer, intent(in), optional :: precond !! preconditioner method enumerator
             class(stdlib_linop_${s}$_type), optional, intent(in), target :: M !! preconditioner linear operator
             type(stdlib_solver_workspace_${s}$_type), optional, intent(inout), target :: workspace !! workspace for the solver
+            logical, intent(in), optional :: compact !! if true, favor lower memory use over cycle-end update speed
         end subroutine
         #:endfor
         #:endfor
diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index bfd26d12d..013269eb2 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -9,29 +9,33 @@ submodule(stdlib_linalg_iterative_solvers) stdlib_linalg_iterative_gmres
     use stdlib_sparse
     use stdlib_constants
     use stdlib_optval, only: optval
+    use stdlib_linalg_blas, only: gemv
     implicit none
 
 contains
 
     #:for k, t, s in R_KINDS_TYPES
-    module subroutine stdlib_solve_gmres_kernel_${s}$(A,M,b,x,rtol,atol,maxiter,kdim,workspace)
+    module subroutine stdlib_solve_gmres_kernel_${s}$(A,M,b,x,rtol,atol,maxiter,kdim,workspace,compact)
         class(stdlib_linop_${s}$_type), intent(in) :: A
         class(stdlib_linop_${s}$_type), intent(in) :: M
         ${t}$, intent(in) :: b(:), rtol, atol
         ${t}$, intent(inout) :: x(:)
         integer, intent(in) :: maxiter, kdim
         type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace
-        integer :: i, iter, inner_iter, j, iorth
+        logical, intent(in) :: compact
+        integer :: i, iter, inner_iter, j, j_final, iorth, jz, zbase
         ${t}$ :: beta, denom, hnext, htmp, norm_sq, norm_sq0, temp, tolsq
-        ${t}$, allocatable :: cs(:), g(:), h(:,:), sn(:), x_base(:), y(:)
+        ${t}$, allocatable :: cs(:), g(:), h(:,:), sn(:), y(:)
 
-        allocate(h(kdim+1, kdim), cs(kdim), sn(kdim), g(kdim+1), y(kdim), x_base(size(x)))
+        allocate(h(kdim+1, kdim), cs(kdim), sn(kdim), g(kdim+1), y(kdim) )
+        zbase = kdim + 4
 
         associate( r => workspace%tmp(:,1), &
-                   w => workspace%tmp(:,2), &
-                   v => workspace%tmp(:,3:kdim+3), &
-                   z => workspace%tmp(:,kdim+4:2*kdim+3) )
-
+               w => workspace%tmp(:,2), &
+               v => workspace%tmp(:,3:kdim+3), &
+               z => workspace%tmp(:,zbase:zbase+merge(0,kdim-1,compact)) )
+            
+            iter = 0
             ! Initialize convergence targets from the right-hand side norm.
             norm_sq0 = A%inner_product(b, b)
             tolsq = max(rtol*rtol*norm_sq0, atol*atol)
@@ -40,21 +44,16 @@ contains
             r = b
             call A%matvec(x, r, alpha=-one_${s}$, beta=one_${s}$, op='N')
             norm_sq = A%inner_product(r, r)
-            if (associated(workspace%callback)) call workspace%callback(x, norm_sq, 0)
+            if (associated(workspace%callback)) call workspace%callback(x, norm_sq, iter)
 
-            if (norm_sq <= tolsq .or. maxiter <= 0) then
-                deallocate(h, cs, sn, g, y, x_base)
-                return
-            end if
+            if (norm_sq <= tolsq) return
 
-            iter = 0
             do while (iter < maxiter .and. norm_sq >= tolsq)
                 ! Start a new GMRES cycle from the current residual.
                 beta = sqrt(max(norm_sq, zero_${s}$))
                 if (beta <= epsilon(one_${s}$)) exit
 
                 inner_iter = min(kdim, maxiter - iter)
-                x_base = x
                 h = zero_${s}$
                 cs = zero_${s}$
                 sn = zero_${s}$
@@ -65,12 +64,14 @@ contains
                 v(:,1) = r / beta
                 g(1) = beta
 
+                j_final = 0
                 do j = 1, inner_iter
                     ! Run Arnoldi with the preconditioned basis vector.
-                    call M%matvec(v(:,j), z(:,j), alpha=one_${s}$, beta=zero_${s}$, op='N')
-                    call A%matvec(z(:,j), w, alpha=one_${s}$, beta=zero_${s}$, op='N')
+                    jz = merge(1, j, compact)
+                    call M%matvec(v(:,j), z(:,jz), alpha=one_${s}$, beta=zero_${s}$, op='N')
+                    call A%matvec(z(:,jz), w, alpha=one_${s}$, beta=zero_${s}$, op='N')
 
-                ! Modified Gram Schmidt (MGSR) 
+                    ! Modified Gram Schmidt (MGSR) 
                     do iorth = 1, 2 ! reorthogonalization
                         do i = 1, j
                             htmp   = A%inner_product(v(:,i), w)
@@ -112,21 +113,34 @@ contains
                     g(j+1) = -sn(j) * g(j) + cs(j) * g(j+1)
                     g(j) = temp
 
-                    ! Solve the reduced system and update the current iterate.
-                    call upper_triangular_solve(h, g, y, j)
-                    x = x_base
-                    do i = 1, j
-                        x = x + y(i) * z(:,i)
-                    end do
-
-                    ! Track the residual norm estimate for convergence.
+                    ! Cheap residual-norm estimate; no solution rebuild needed.
                     norm_sq = g(j+1) * g(j+1)
                     iter = iter + 1
+                    j_final = j
+                    ! x is kept constant for the logger through out the inner iteration
                     if (associated(workspace%callback)) call workspace%callback(x, norm_sq, iter)
 
                     if (norm_sq < tolsq .or. hnext <= epsilon(one_${s}$) .or. iter >= maxiter) exit
                 end do
 
+                ! Cycle-end update from the least-squares correction.
+                if (j_final > 0) then
+                    call upper_triangular_solve(h, g, y, j_final)
+                    if (compact) then
+                        call gemv('N', m=size(x), n=j_final, alpha=one_${s}$, &
+                            a=v, lda=size(v,1), &
+                            x=y, incx=1, &
+                            beta=zero_${s}$, y=w, incy=1)
+                        call M%matvec(w, z(:,1), alpha=one_${s}$, beta=zero_${s}$, op='N')
+                        x = x + z(:,1)
+                    else
+                        call gemv('N', m=size(x), n=j_final, alpha=one_${s}$, &
+                            a=z, lda=size(z,1), &
+                            x=y, incx=1, &
+                            beta=one_${s}$, y=x, incy=1)
+                    end if
+                end if
+
                 if (norm_sq < tolsq .or. iter >= maxiter) exit
 
                 ! Refresh the residual before the next restarted cycle.
@@ -136,8 +150,6 @@ contains
             end do
         end associate
 
-        deallocate(h, cs, sn, g, y, x_base)
-
     contains
 
         subroutine upper_triangular_solve(h, g, y, n)
@@ -161,7 +173,7 @@ contains
 
     #:for matrix in MATRIX_TYPES
     #:for k, t, s in R_KINDS_TYPES
-    module subroutine stdlib_solve_gmres_${matrix}$_${s}$(A,b,x,di,rtol,atol,maxiter,restart,kdim,precond,M,workspace)
+    module subroutine stdlib_solve_gmres_${matrix}$_${s}$(A,b,x,di,rtol,atol,maxiter,restart,kdim,precond,M,workspace,compact)
         #:if matrix == "dense"
         use stdlib_linalg, only: diag
         ${t}$, intent(in) :: A(:,:)
@@ -177,12 +189,13 @@ contains
         integer, intent(in), optional :: precond
         class(stdlib_linop_${s}$_type), optional, intent(in), target :: M
         type(stdlib_solver_workspace_${s}$_type), optional, intent(inout), target :: workspace
+        logical, intent(in), optional :: compact
         type(stdlib_linop_${s}$_type) :: op
         type(stdlib_linop_${s}$_type), pointer :: M_ => null()
         type(stdlib_solver_workspace_${s}$_type), pointer :: workspace_
         integer :: kdim_, maxiter_, n, ncols, precond_
         ${t}$ :: rtol_, atol_
-        logical :: restart_
+        logical :: compact_, restart_
         logical(int8), pointer :: di_(:)
         ${t}$, allocatable :: diagonal(:)
 
@@ -190,10 +203,11 @@ contains
         maxiter_ = optval(x=maxiter, default=n)
         kdim_ = max(1, min(optval(x=kdim, default=min(30, n)), n))
         restart_ = optval(x=restart, default=.true.)
+        compact_ = optval(x=compact, default=.true.)
         rtol_ = optval(x=rtol, default=1.e-5_${s}$)
         atol_ = optval(x=atol, default=epsilon(one_${s}$))
         precond_ = optval(x=precond, default=pc_none)
-        ncols = 2 * kdim_ + stdlib_size_wksp_gmres
+        ncols = kdim_ + stdlib_size_wksp_gmres + merge(1, kdim_, compact_)
 
         if (present(M)) then
             M_ => M
@@ -237,7 +251,7 @@ contains
 
         if (restart_) x = zero_${s}$
         x = merge(b, x, di_)
-        call stdlib_solve_gmres_kernel(op, M_, b, x, rtol_, atol_, maxiter_, kdim_, workspace_)
+        call stdlib_solve_gmres_kernel(op, M_, b, x, rtol_, atol_, maxiter_, kdim_, workspace_, compact=compact_)
 
         if (.not.present(di)) deallocate(di_)
         di_ => null()
diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index 6b2b07016..e3c47b32c 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -119,6 +119,22 @@ module test_linalg_solve_iterative
         if (allocated(error)) return
         end block
 
+        block
+        ${t}$, parameter :: tol = 1000*epsilon(0.0_${k}$)
+        ${t}$, parameter :: A(3,3) = reshape([${t}$ :: 10,  1,  2, &
+                                            1, 10,  3, &
+                                            2,  3, 10], [3,3])
+        ${t}$ :: x(3)
+        ${t}$, parameter :: xref(3) = [(137._${k}$/218._${k}$), -(9._${k}$/109._${k}$), (87._${k}$/218._${k}$)]
+        ${t}$, parameter :: load(3) = [7.0_${k}$, 1.0_${k}$, 5.0_${k}$]
+        x = [0.2_${k}$, -0.1_${k}$, 0.3_${k}$]
+
+        call stdlib_solve_gmres(A, load, x, rtol=1.e-10_${k}$, precond=pc_jacobi, compact=.false.)
+
+        call check(error, norm2(x-xref)<tol*norm2(xref), 'error in GMRES solver (Jacobi preconditioned speed mode test)')
+        if (allocated(error)) return
+        end block
+
         #:endfor
     end subroutine test_stdlib_solve_gmres
 

From 7dab12da4a87f022e68bbc0432ee56d98bd4edcb Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Mon, 15 Jun 2026 22:05:46 +0200
Subject: [PATCH 19/23] add example

---
 doc/specs/stdlib_linalg_iterative_solvers.md |  8 ++++++-
 example/linalg/CMakeLists.txt                |  1 +
 example/linalg/example_solve_gmres.f90       | 24 ++++++++++++++++++++
 3 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 example/linalg/example_solve_gmres.f90

diff --git a/doc/specs/stdlib_linalg_iterative_solvers.md b/doc/specs/stdlib_linalg_iterative_solvers.md
index 9a856029d..f2c9173b9 100644
--- a/doc/specs/stdlib_linalg_iterative_solvers.md
+++ b/doc/specs/stdlib_linalg_iterative_solvers.md
@@ -440,4 +440,10 @@ Subroutine
 
 #### Note
 
-GMRES is especially useful for general non-symmetric systems where CG and PCG do not apply. Its main cost is the Krylov basis storage and orthogonalization work within each restart cycle. With `compact=.true.`, the solver reduces workspace requirements significantly. With `compact=.false.`, it retains the previous cached-basis update strategy.
\ No newline at end of file
+GMRES is especially useful for general non-symmetric systems where CG and PCG do not apply. Its main cost is the Krylov basis storage and orthogonalization work within each restart cycle. With `compact=.true.`, the solver reduces workspace requirements significantly. With `compact=.false.`, it retains the previous cached-basis update strategy.
+
+#### Example
+
+```fortran
+{!example/linalg/example_solve_gmres.f90!}
+```
\ No newline at end of file
diff --git a/example/linalg/CMakeLists.txt b/example/linalg/CMakeLists.txt
index dd65b48e4..ae2e0f6f3 100644
--- a/example/linalg/CMakeLists.txt
+++ b/example/linalg/CMakeLists.txt
@@ -48,6 +48,7 @@ if (STDLIB_LINALG_ITERATIVE)
   ADD_EXAMPLE(solve_bicgstab)
   ADD_EXAMPLE(solve_bicgstab_wilkinson)
   ADD_EXAMPLE(solve_cg)
+  ADD_EXAMPLE(solve_gmres)
   ADD_EXAMPLE(solve_pcg)
   ADD_EXAMPLE(solve_custom)
 endif()
diff --git a/example/linalg/example_solve_gmres.f90 b/example/linalg/example_solve_gmres.f90
new file mode 100644
index 000000000..02ea8ad12
--- /dev/null
+++ b/example/linalg/example_solve_gmres.f90
@@ -0,0 +1,24 @@
+program example_solve_gmres
+    use stdlib_kinds, only: dp
+    use stdlib_linalg_iterative_solvers, only: stdlib_solve_gmres, pc_jacobi
+    implicit none
+
+    real(dp), parameter :: A(4,4) = reshape([5.0_dp,  1.0_dp,  2.0_dp, 0.0_dp, &
+                                             0.0_dp,  4.0_dp, -1.0_dp, 1.0_dp, &
+                                             1.0_dp,  0.0_dp,  3.0_dp, 2.0_dp, &
+                                             2.0_dp, -1.0_dp,  0.0_dp, 6.0_dp], [4,4])
+    real(dp), parameter :: x0(4) = [0.2_dp, -0.1_dp, 0.3_dp, 0.0_dp]
+    real(dp), parameter :: xref(4) = [1.0_dp, 2.0_dp, -1.0_dp, 0.5_dp]
+    real(dp) :: rhs(4), x(4)
+
+    rhs = matmul(A, xref)
+
+    ! GMRES lets the user tune the restart size and the storage mode.
+    x = x0
+    call stdlib_solve_gmres(A, rhs, x, restart=.false., kdim=2, maxiter=12, precond=pc_jacobi)
+    print *, 'compact:', x
+
+    x = x0
+    call stdlib_solve_gmres(A, rhs, x, restart=.false., kdim=2, maxiter=12, precond=pc_jacobi, compact=.false.)
+    print *, 'cached :', x
+end program example_solve_gmres
\ No newline at end of file

From 3ba5a4f7255081f8c776a6cbda9b69bb6ee91c41 Mon Sep 17 00:00:00 2001
From: Jose Alves <jose.alves@transvalor.com>
Date: Tue, 16 Jun 2026 08:08:56 +0200
Subject: [PATCH 20/23] convert static enum to integer helper function to
 determine the number of colums of the gmres workspace

---
 .../stdlib_linalg_iterative_solvers.fypp               | 10 +++++++++-
 .../stdlib_linalg_iterative_solvers_gmres.fypp         |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
index b91c19246..9bc1b6f8f 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers.fypp
@@ -16,7 +16,6 @@ module stdlib_linalg_iterative_solvers
         enumerator :: stdlib_size_wksp_cg = 3
         enumerator :: stdlib_size_wksp_pcg = 4
         enumerator :: stdlib_size_wksp_bicgstab = 8
-        enumerator :: stdlib_size_wksp_gmres = 3
     end enum
     public :: stdlib_size_wksp_cg, stdlib_size_wksp_pcg, stdlib_size_wksp_bicgstab, stdlib_size_wksp_gmres
 
@@ -274,6 +273,15 @@ module stdlib_linalg_iterative_solvers
     public :: stdlib_solve_gmres
 
 contains
+    
+    !------------------------------------------------------------------
+    ! helpers
+    !------------------------------------------------------------------
+    integer(int32) function stdlib_size_wksp_gmres(kdim,compact)
+        integer(int32), intent(in) :: kdim
+        logical, intent(in) :: compact
+        stdlib_size_wksp_gmres = 3 + kdim + merge(1, kdim, compact)
+    end function
 
     !------------------------------------------------------------------
     ! defaults
diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index 013269eb2..24ce77647 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -207,7 +207,7 @@ contains
         rtol_ = optval(x=rtol, default=1.e-5_${s}$)
         atol_ = optval(x=atol, default=epsilon(one_${s}$))
         precond_ = optval(x=precond, default=pc_none)
-        ncols = kdim_ + stdlib_size_wksp_gmres + merge(1, kdim_, compact_)
+        ncols = stdlib_size_wksp_gmres(kdim_,compact_)
 
         if (present(M)) then
             M_ => M

From 0069cd33269c7b791eec1eb4c1f41cceaab8b334 Mon Sep 17 00:00:00 2001
From: Jose Alves <jose.alves@transvalor.com>
Date: Tue, 16 Jun 2026 17:19:41 +0200
Subject: [PATCH 21/23] simplify iterations logic

---
 .../stdlib_linalg_iterative_solvers_gmres.fypp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index 24ce77647..31c4ce79e 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -23,7 +23,7 @@ contains
         integer, intent(in) :: maxiter, kdim
         type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace
         logical, intent(in) :: compact
-        integer :: i, iter, inner_iter, j, j_final, iorth, jz, zbase
+        integer :: i, iter, j, j_final, iorth, jz, zbase
         ${t}$ :: beta, denom, hnext, htmp, norm_sq, norm_sq0, temp, tolsq
         ${t}$, allocatable :: cs(:), g(:), h(:,:), sn(:), y(:)
 
@@ -49,11 +49,11 @@ contains
             if (norm_sq <= tolsq) return
 
             do while (iter < maxiter .and. norm_sq >= tolsq)
+                iter = iter + 1
                 ! Start a new GMRES cycle from the current residual.
                 beta = sqrt(max(norm_sq, zero_${s}$))
                 if (beta <= epsilon(one_${s}$)) exit
 
-                inner_iter = min(kdim, maxiter - iter)
                 h = zero_${s}$
                 cs = zero_${s}$
                 sn = zero_${s}$
@@ -65,7 +65,7 @@ contains
                 g(1) = beta
 
                 j_final = 0
-                do j = 1, inner_iter
+                do j = 1, kdim
                     ! Run Arnoldi with the preconditioned basis vector.
                     jz = merge(1, j, compact)
                     call M%matvec(v(:,j), z(:,jz), alpha=one_${s}$, beta=zero_${s}$, op='N')
@@ -115,12 +115,9 @@ contains
 
                     ! Cheap residual-norm estimate; no solution rebuild needed.
                     norm_sq = g(j+1) * g(j+1)
-                    iter = iter + 1
                     j_final = j
-                    ! x is kept constant for the logger through out the inner iteration
-                    if (associated(workspace%callback)) call workspace%callback(x, norm_sq, iter)
 
-                    if (norm_sq < tolsq .or. hnext <= epsilon(one_${s}$) .or. iter >= maxiter) exit
+                    if (norm_sq < tolsq .or. hnext <= epsilon(one_${s}$)) exit
                 end do
 
                 ! Cycle-end update from the least-squares correction.
@@ -141,12 +138,13 @@ contains
                     end if
                 end if
 
-                if (norm_sq < tolsq .or. iter >= maxiter) exit
-
-                ! Refresh the residual before the next restarted cycle.
+                ! Refresh the true residual so the convergence test per restart cycle and the logged
+                ! value use the true ||b - A*x||, not the Hessenberg estimate.
                 r = b
                 call A%matvec(x, r, alpha=-one_${s}$, beta=one_${s}$, op='N')
                 norm_sq = A%inner_product(r, r)
+
+                if (associated(workspace%callback)) call workspace%callback(x, norm_sq, iter)
             end do
         end associate
 

From 5099335b8118ee4352ace76221d083243efdcb60 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sat, 20 Jun 2026 13:18:04 +0200
Subject: [PATCH 22/23] replace manual given rotations and the triangular solve
 by lapack kernels

---
 ...stdlib_linalg_iterative_solvers_gmres.fypp | 77 ++++++++++---------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
index 31c4ce79e..0b52f6651 100644
--- a/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
+++ b/src/linalg_iterative/stdlib_linalg_iterative_solvers_gmres.fypp
@@ -10,6 +10,8 @@ submodule(stdlib_linalg_iterative_solvers) stdlib_linalg_iterative_gmres
     use stdlib_constants
     use stdlib_optval, only: optval
     use stdlib_linalg_blas, only: gemv
+    use stdlib_linalg_lapack, only: lartg, lasr, trtrs
+    use stdlib_linalg_constants, only: ilp
     implicit none
 
 contains
@@ -23,17 +25,16 @@ contains
         integer, intent(in) :: maxiter, kdim
         type(stdlib_solver_workspace_${s}$_type), intent(inout) :: workspace
         logical, intent(in) :: compact
-        integer :: i, iter, j, j_final, iorth, jz, zbase
-        ${t}$ :: beta, denom, hnext, htmp, norm_sq, norm_sq0, temp, tolsq
+        integer :: i, iter, j, j_final, iorth, jz, info
+        ${t}$ :: beta, hnext, htmp, norm_sq, norm_sq0, temp, tolsq
         ${t}$, allocatable :: cs(:), g(:), h(:,:), sn(:), y(:)
 
         allocate(h(kdim+1, kdim), cs(kdim), sn(kdim), g(kdim+1), y(kdim) )
-        zbase = kdim + 4
 
         associate( r => workspace%tmp(:,1), &
                w => workspace%tmp(:,2), &
                v => workspace%tmp(:,3:kdim+3), &
-               z => workspace%tmp(:,zbase:zbase+merge(0,kdim-1,compact)) )
+               z => workspace%tmp(:,kdim + 4:) )
             
             iter = 0
             ! Initialize convergence targets from the right-hand side norm.
@@ -88,26 +89,8 @@ contains
                         v(:,j+1) = zero_${s}$
                     end if
 
-                    ! Apply the previously accumulated Givens rotations.
-                    do i = 1, j - 1
-                        temp = cs(i) * h(i,j) + sn(i) * h(i+1,j)
-                        h(i+1,j) = -sn(i) * h(i,j) + cs(i) * h(i+1,j)
-                        h(i,j) = temp
-                    end do
-
-                    ! Build and apply the next Givens rotation.
-                    denom = sqrt(h(j,j) * h(j,j) + h(j+1,j) * h(j+1,j))
-                    if (denom > epsilon(one_${s}$)) then
-                        cs(j) = h(j,j) / denom
-                        sn(j) = h(j+1,j) / denom
-                    else
-                        cs(j) = one_${s}$
-                        sn(j) = zero_${s}$
-                    end if
-
-                    temp = cs(j) * h(j,j) + sn(j) * h(j+1,j)
-                    h(j+1,j) = -sn(j) * h(j,j) + cs(j) * h(j+1,j)
-                    h(j,j) = temp
+                    ! Apply previous rotations to the new column, then generate the next one.
+                    call apply_givens_rotation(h(1:j+1,j), cs, sn)
 
                     temp = cs(j) * g(j) + sn(j) * g(j+1)
                     g(j+1) = -sn(j) * g(j) + cs(j) * g(j+1)
@@ -122,7 +105,9 @@ contains
 
                 ! Cycle-end update from the least-squares correction.
                 if (j_final > 0) then
-                    call upper_triangular_solve(h, g, y, j_final)
+                    call upper_triangular_solve(h, g, y, j_final, info)
+                    if(info /= 0) exit
+
                     if (compact) then
                         call gemv('N', m=size(x), n=j_final, alpha=one_${s}$, &
                             a=v, lda=size(v,1), &
@@ -150,21 +135,41 @@ contains
 
     contains
 
-        subroutine upper_triangular_solve(h, g, y, n)
+        subroutine apply_givens_rotation(hcol, c, s)
+            ! implementation inspired by https://github.com/nekStab/LightKrylov
+            ${t}$, target, contiguous, intent(inout) :: hcol(:)
+            ${t}$, intent(inout) :: c(:), s(:)
+            integer :: k
+            ${t}$ :: r
+            ${t}$, pointer :: hmat(:, :)
+            ! Size of the column.
+            k = int(size(hcol) - 1, kind=ilp)
+            ! Apply previous Givens rotations to this new column.
+            hmat(1:k, 1:1) => hcol(:k)
+            call lasr('L', 'V', 'F', k, 1_ilp, c(:k-1), s(:k-1), hmat, k)
+            ! Compute the sine and cosine components for the next rotation.
+            call lartg(hcol(k), hcol(k+1), c(k), s(k), r)
+            ! Eliminate H(k+1, k).
+            hcol(k) = r
+            hcol(k+1) = zero_${s}$
+        end subroutine
+
+        subroutine upper_triangular_solve(h, g, y, n, info)
             ${t}$, intent(in) :: h(:,:), g(:)
-            ${t}$, intent(inout) :: y(:)
+            ${t}$, target, contiguous, intent(inout) :: y(:)
             integer, intent(in) :: n
-            integer :: row
+            integer, intent(out) :: info
+            integer(ilp) :: n_, lda_
+            ${t}$, pointer :: rhs(:, :)
 
             y(1:n) = g(1:n)
-            do row = n, 1, -1
-                if (row < n) y(row) = y(row) - dot_product(h(row,row+1:n) , y(row+1:n))
-                if (abs(h(row,row)) > epsilon(one_${s}$)) then
-                    y(row) = y(row) / h(row,row)
-                else
-                    y(row) = zero_${s}$
-                end if
-            end do
+
+            n_ = int(n, kind=ilp)
+            lda_ = int(size(h,1), kind=ilp)
+
+            rhs(1:n,1:1) => y(:n)
+
+            call trtrs('U','N','N', n_, 1_ilp, h, lda_, rhs, n_, info)
         end subroutine
     end subroutine
     #:endfor

From 84b920a9f80d0a0d606a5f677e1abf37956868b5 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sat, 20 Jun 2026 13:35:25 +0200
Subject: [PATCH 23/23] try removing the hilbert test for sp as it is
 meaningless for the selected size

---
 test/linalg/test_linalg_solve_iterative.fypp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/linalg/test_linalg_solve_iterative.fypp b/test/linalg/test_linalg_solve_iterative.fypp
index e3c47b32c..7e4187f86 100644
--- a/test/linalg/test_linalg_solve_iterative.fypp
+++ b/test/linalg/test_linalg_solve_iterative.fypp
@@ -150,13 +150,9 @@ module test_linalg_solve_iterative
         integer, parameter :: n = 10
         integer :: r, c
         #:for k, t, s in R_KINDS_TYPES
+        #:if k not in ['sp'] # Hilbert sp is almost useless. for n=10, cond(A) ~ 10^13, which is beyond the precision of single precision.
         block
-            !Hilbert sp is almost useless. 
-            #:if k == 'sp' 
-            ${t}$, parameter :: tol = 1.e-0_${k}$
-            #:else
             ${t}$, parameter :: tol = 1.e-3_${k}$
-            #:endif
             
             ${t}$ :: A(n, n), b(n), x(n)
             ${t}$, parameter :: x_ref(n) = 1.0_${k}$
@@ -174,6 +170,7 @@ module test_linalg_solve_iterative
             'error in GMRES ${k}$: Hilbert matrix')
             if (allocated(error)) return
         end block
+        #:endif
         #:endfor
     end subroutine test_stdlib_solve_gmres_hilbert