module athena__optimiser
  !! Module containing implementations of optimisation methods
  !!
  !! This module implements gradient-based optimisers for training neural networks
  !! by minimizing loss functions through iterative parameter updates.
  !!
  !! Implemented optimisers:
  !!
  !! SGD (Stochastic Gradient Descent):
  !!   θ_{t+1} = θ_t - η * ∇L(θ_t)
  !!   Simple, reliable baseline optimiser
  !!
  !! SGD with Momentum:
  !!   v_{t+1} = μ*v_t + ∇L(θ_t)
  !!   θ_{t+1} = θ_t - η * v_{t+1}
  !!   Accelerates convergence, dampens oscillations
  !!
  !! RMSProp:
  !!   s_{t+1} = β*s_t + (1-β)*[∇L(θ_t)]²
  !!   θ_{t+1} = θ_t - η * ∇L(θ_t) / sqrt(s_{t+1} + ε)
  !!   Adapts learning rate per parameter, good for non-stationary objectives
  !!
  !! Adagrad:
  !!   s_{t+1} = s_t + [∇L(θ_t)]²
  !!   θ_{t+1} = θ_t - η * ∇L(θ_t) / sqrt(s_{t+1} + ε)
  !!   Adapts learning rate based on historical gradients
  !!
  !! Adam (Adaptive Moment Estimation):
  !!   m_{t+1} = β₁*m_t + (1-β₁)*∇L(θ_t)          [first moment]
  !!   v_{t+1} = β₂*v_t + (1-β₂)*[∇L(θ_t)]²       [second moment]
  !!   m̂ = m_{t+1}/(1-β₁^t),  v̂ = v_{t+1}/(1-β₂^t)  [bias correction]
  !!   θ_{t+1} = θ_t - η * m̂ / (sqrt(v̂) + ε)
  !!   Combines momentum and adaptive learning rates, most popular choice
  !!
  !! L-BFGS (Limited-memory BFGS):
  !!   Quasi-Newton method approximating Hessian inverse
  !!   Good for small-medium sized problems, smooth objectives
  !!
  !! where η is learning rate, μ is momentum, β/β₁/β₂ are decay rates
  !!
  !! Attribution statement:
  !! The following module is based on code from the neural-fortran library
  !! https://github.com/modern-fortran/neural-fortran
  !! The implementation of optimiser_base_type, sgd_optimiser_type,
  !! rmsprop_optimiser_type, adagrad_optimiser_type, and adam_optimiser_type
  !! are based on the corresponding types from neural-fortran
  use coreutils, only: real32, stop_program
  use athena__clipper, only: clip_type
  use athena__regulariser, only: base_regulariser_type, l2_regulariser_type
  use athena__learning_rate_decay, only: base_lr_decay_type
  implicit none


  private

  public :: base_optimiser_type
  public :: sgd_optimiser_type
  public :: rmsprop_optimiser_type
  public :: adagrad_optimiser_type
  public :: adam_optimiser_type


!-------------------------------------------------------------------------------

  type :: base_optimiser_type
     !! Base optimiser type
     character(len=20) :: name
     !! Name of the optimiser
     integer :: iter = 0
     !! Iteration number
     integer :: epoch = 0
     !! Epoch number
     real(real32) :: learning_rate = 0.01_real32
     !! Learning rate hyperparameter
     logical :: regularisation = .false.
     !! Apply regularisation
     class(base_regulariser_type), allocatable :: regulariser
     !! Regularisation method
     class(base_lr_decay_type), allocatable :: lr_decay
     !! Learning rate decay method
     type(clip_type) :: clip_dict
     !! Clipping dictionary
   contains
     procedure, pass(this) :: init => init_base
     !! Initialise base optimiser
     procedure, pass(this) :: print_to_unit => print_to_unit_base
     !! Print base optimiser information
     procedure, pass(this) :: read => read_base
     !! Read base optimiser information
     procedure, pass(this) :: init_gradients => init_gradients_base
     !! Initialise gradients
     procedure, pass(this) :: minimise => minimise_base
     !! Apply gradients to parameters to minimise loss using base optimiser
  end type base_optimiser_type

  interface base_optimiser_type
     !! Interface for setting up the base optimiser
     module function optimiser_setup_base( &
          learning_rate, num_params, &
          regulariser, clip_dict, lr_decay) result(optimiser)
       !! Set up the base optimiser
       real(real32), optional, intent(in) :: learning_rate
       !! Learning rate
       integer, optional, intent(in) :: num_params
       !! Number of parameters
       class(base_regulariser_type), optional, intent(in) :: regulariser
       !! Regularisation method
       type(clip_type), optional, intent(in) :: clip_dict
       !! Clipping dictionary
       class(base_lr_decay_type), optional, intent(in) :: lr_decay
       !! Learning rate decay method
       type(base_optimiser_type) :: optimiser
       !! Instance of the base optimiser
     end function optimiser_setup_base
  end interface base_optimiser_type

!-------------------------------------------------------------------------------

  type, extends(base_optimiser_type) :: sgd_optimiser_type
     !! Stochastic gradient descent optimiser type
     logical :: nesterov = .false.
     !! Nesterov momentum
     real(real32) :: momentum = 0._real32
     !! Fraction of momentum-based learning
     real(real32), allocatable, dimension(:) :: velocity
     !! Velocity for momentum
   contains
     procedure, pass(this) :: init_gradients => init_gradients_sgd
     !! Initialise gradients for SGD
     procedure, pass(this) :: minimise => minimise_sgd
     !! Apply gradients to parameters to minimise loss using SGD optimiser
  end type sgd_optimiser_type

  interface sgd_optimiser_type
     !! Interface for setting up the SGD optimiser
     module function optimiser_setup_sgd( &
          learning_rate, momentum, &
          nesterov, num_params, &
          regulariser, clip_dict, lr_decay) result(optimiser)
       !! Set up the SGD optimiser
       real(real32), optional, intent(in) :: learning_rate, momentum
       !! Learning rate and momentum
       logical, optional, intent(in) :: nesterov
       !! Nesterov momentum
       integer, optional, intent(in) :: num_params
       !! Number of parameters
       class(base_regulariser_type), optional, intent(in) :: regulariser
       !! Regularisation method
       type(clip_type), optional, intent(in) :: clip_dict
       !! Clipping dictionary
       class(base_lr_decay_type), optional, intent(in) :: lr_decay
       !! Learning rate decay method
       type(sgd_optimiser_type) :: optimiser
       !! Instance of the SGD optimiser
     end function optimiser_setup_sgd
  end interface sgd_optimiser_type

!-------------------------------------------------------------------------------

  type, extends(base_optimiser_type) :: rmsprop_optimiser_type
     !! RMSprop optimiser type
     real(real32) :: beta = 0._real32
     !! Beta parameter
     real(real32) :: epsilon = 1.E-8_real32
     !! Epsilon parameter
     real(real32), allocatable, dimension(:) :: moving_avg
     !! Moving average
   contains
     procedure, pass(this) :: init_gradients => init_gradients_rmsprop
     !! Initialise gradients for RMSprop
     procedure, pass(this) :: minimise => minimise_rmsprop
     !! Apply gradients to parameters to minimise loss using RMSprop optimiser
  end type rmsprop_optimiser_type

  interface rmsprop_optimiser_type
     !! Interface for setting up the RMSprop optimiser
     module function optimiser_setup_rmsprop( &
          learning_rate, beta, &
          epsilon, num_params, &
          regulariser, clip_dict, lr_decay) result(optimiser)
       !! Set up the RMSprop optimiser
       real(real32), optional, intent(in) :: learning_rate, beta, epsilon
       !! Learning rate, beta, and epsilon
       integer, optional, intent(in) :: num_params
       !! Number of parameters
       class(base_regulariser_type), optional, intent(in) :: regulariser
       !! Regularisation method
       type(clip_type), optional, intent(in) :: clip_dict
       !! Clipping dictionary
       class(base_lr_decay_type), optional, intent(in) :: lr_decay
       !! Learning rate decay method
       type(rmsprop_optimiser_type) :: optimiser
       !! Instance of the RMSprop optimiser
     end function optimiser_setup_rmsprop
  end interface rmsprop_optimiser_type

!-------------------------------------------------------------------------------

  type, extends(base_optimiser_type) :: adagrad_optimiser_type
     !! Adagrad optimiser type
     real(real32) :: epsilon = 1.E-8_real32
     !! Epsilon parameter
     real(real32), allocatable, dimension(:) :: sum_squares
     !! Sum of squares of gradients
   contains
     procedure, pass(this) :: init_gradients => init_gradients_adagrad
     !! Initialise gradients for Adagrad
     procedure, pass(this) :: minimise => minimise_adagrad
     !! Apply gradients to parameters to minimise loss using Adagrad optimiser
  end type adagrad_optimiser_type

  interface adagrad_optimiser_type
     !! Interface for setting up the Adagrad optimiser
     module function optimiser_setup_adagrad( &
          learning_rate, &
          epsilon, num_params, &
          regulariser, clip_dict, lr_decay) result(optimiser)
       !! Set up the Adagrad optimiser
       real(real32), optional, intent(in) :: learning_rate, epsilon
       !! Learning rate and epsilon
       integer, optional, intent(in) :: num_params
       !! Number of parameters
       class(base_regulariser_type), optional, intent(in) :: regulariser
       !! Regularisation method
       type(clip_type), optional, intent(in) :: clip_dict
       !! Clipping dictionary
       class(base_lr_decay_type), optional, intent(in) :: lr_decay
       !! Learning rate decay method
       type(adagrad_optimiser_type) :: optimiser
       !! Instance of the Adagrad optimiser
     end function optimiser_setup_adagrad
  end interface adagrad_optimiser_type

!-------------------------------------------------------------------------------

  type, extends(base_optimiser_type) :: adam_optimiser_type
     !! Adam optimiser type
     real(real32) :: beta1 = 0.9_real32
     !! Beta1 parameter
     real(real32) :: beta2 = 0.999_real32
     !! Beta2 parameter
     real(real32) :: epsilon = 1.E-8_real32
     !! Epsilon parameter
     real(real32), allocatable, dimension(:) :: m
     !! First moment estimate
     real(real32), allocatable, dimension(:) :: v
     !! Second moment estimate
   contains
     procedure, pass(this) :: init_gradients => init_gradients_adam
     !! Initialise gradients for Adam
     procedure, pass(this) :: minimise => minimise_adam
     !! Apply gradients to parameters to minimise loss using Adam optimiser
  end type adam_optimiser_type

  interface adam_optimiser_type
     !! Interface for setting up the Adam optimiser
     module function optimiser_setup_adam( &
          learning_rate, &
          beta1, beta2, epsilon, &
          num_params, &
          regulariser, clip_dict, lr_decay) result(optimiser)
       !! Set up the Adam optimiser
       real(real32), optional, intent(in) :: learning_rate
       !! Learning rate
       real(real32), optional, intent(in) :: beta1, beta2, epsilon
       !! Beta1, beta2, and epsilon
       integer, optional, intent(in) :: num_params
       !! Number of parameters
       class(base_regulariser_type), optional, intent(in) :: regulariser
       !! Regularisation method
       type(clip_type), optional, intent(in) :: clip_dict
       !! Clipping dictionary
       class(base_lr_decay_type), optional, intent(in) :: lr_decay
       !! Learning rate decay method
       type(adam_optimiser_type) :: optimiser
       !! Instance of the Adam optimiser
     end function optimiser_setup_adam
  end interface adam_optimiser_type


contains

!###############################################################################
  module function optimiser_setup_base( &
       learning_rate, num_params, &
       regulariser, clip_dict, lr_decay &
  ) result(optimiser)
    !! Set up the base optimiser
    implicit none

    ! Arguments
    real(real32), optional, intent(in) :: learning_rate
    !! Learning rate
    integer, optional, intent(in) :: num_params
    !! Number of parameters
    class(base_regulariser_type), optional, intent(in) :: regulariser
    !! Regularisation method
    type(clip_type), optional, intent(in) :: clip_dict
    !! Clipping dictionary
    class(base_lr_decay_type), optional, intent(in) :: lr_decay
    !! Learning rate decay method

    type(base_optimiser_type) :: optimiser
    !! Instance of the base optimiser

    ! Local variables
    integer :: num_params_
    !! Number of parameters


    ! Initialise optimiser name
    optimiser%name = "base"

    ! Apply regularisation
    if(present(regulariser))then
       optimiser%regularisation = .true.
       if(allocated(optimiser%regulariser)) deallocate(optimiser%regulariser)
       allocate(optimiser%regulariser, source = regulariser)
    end if

    ! Apply clipping
    if(present(clip_dict)) optimiser%clip_dict = clip_dict

    ! Initialise general optimiser parameters
    if(present(learning_rate)) optimiser%learning_rate = learning_rate

    ! Initialise learning rate decay
    if(present(lr_decay))then
       if(allocated(optimiser%lr_decay)) deallocate(optimiser%lr_decay)
       allocate(optimiser%lr_decay, source = lr_decay)
    else
       allocate(optimiser%lr_decay, source = base_lr_decay_type())
    end if

    ! Initialise gradients
    if(present(num_params))then
       num_params_ = num_params
    else
       num_params_ = 1
    end if
    call optimiser%init_gradients(num_params_)
  end function optimiser_setup_base
!###############################################################################


!###############################################################################
  subroutine init_base(this, num_params, regulariser, clip_dict)
    !! Initialise base optimiser
    implicit none

    ! Arguments
    class(base_optimiser_type), intent(inout) :: this
    !! Instance of the base optimiser
    integer, intent(in) :: num_params
    !! Number of parameters
    class(base_regulariser_type), optional, intent(in) :: regulariser
    !! Regularisation method
    type(clip_type), optional, intent(in) :: clip_dict
    !! Clipping dictionary


    ! Apply regularisation
    if(present(regulariser))then
       this%regularisation = .true.
       if(allocated(this%regulariser)) deallocate(this%regulariser)
       allocate(this%regulariser, source = regulariser)
    end if

    ! Apply clipping
    if(present(clip_dict)) this%clip_dict = clip_dict

    ! Initialise gradients
    call this%init_gradients(num_params)
  end subroutine init_base
!###############################################################################


!###############################################################################
  pure subroutine init_gradients_base(this, num_params)
    !! Initialise gradients for base optimiser
    implicit none

    ! Arguments
    class(base_optimiser_type), intent(inout) :: this
    !! Instance of the base optimiser
    integer, intent(in) :: num_params
    !! Number of parameters

    !allocate(this%velocity(num_params), source=0._real32)
  end subroutine init_gradients_base
!###############################################################################


!###############################################################################
  pure subroutine minimise_base(this, param, gradient)
    !! Apply gradients to parameters to minimise loss using base optimiser
    implicit none

    ! Arguments
    class(base_optimiser_type), intent(inout) :: this
    !! Instance of the base optimiser
    real(real32), dimension(:), intent(inout) :: param
    !! Parameters
    real(real32), dimension(:), intent(inout) :: gradient
    !! Gradients

    ! Local variables
    real(real32) :: learning_rate
    !! Learning rate


    ! Decay learning rate and update iteration
    learning_rate = this%lr_decay%get_lr(this%learning_rate, this%iter)

    ! Update parameters
    param = param - learning_rate * gradient
  end subroutine minimise_base
!###############################################################################


!##############################################################################!
! * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * !
!##############################################################################!


!###############################################################################
  subroutine print_to_unit_base(this, unit)
    !! Print base optimiser information
    implicit none

    ! Arguments
    class(base_optimiser_type), intent(in) :: this
    !! Instance of the base optimiser
    integer, intent(in) :: unit
    !! File unit


    write(unit,'(6X,"NAME = ",A)') this%name
    write(unit,'(6X,"LEARNING_RATE = ",F10.5)') this%learning_rate
    write(unit,'(6X,"ITERATION = ",I10)') this%iter
    write(unit,'(6X,"EPOCH = ",I10)') this%epoch
    write(unit,'(6X,"REGULARISATION = ",L1)') this%regularisation

  end subroutine print_to_unit_base
!###############################################################################


!###############################################################################
  subroutine read_base(this, unit)
    !! Read base optimiser information
    use athena__tools_infile, only: assign_val, assign_vec
    use coreutils, only: to_lower, to_upper, icount
    implicit none

    ! Arguments
    class(base_optimiser_type), intent(inout) :: this
    !! Instance of the base optimiser
    integer, intent(in) :: unit
    !! File unit

    ! Local variables
    integer :: stat
    !! File status
    integer :: itmp1
    !! Temporary integer
    character(256) :: buffer, tag, err_msg
    !! Buffer for reading lines, tag for identifying lines, error message


    ! Loop over tags in layer card
    !---------------------------------------------------------------------------
    tag_loop: do

       ! Check for end of file
       !------------------------------------------------------------------------
       read(unit,'(A)',iostat=stat) buffer
       if(stat.ne.0)then
          write(err_msg,'("file encountered error (EoF?) before END ",A)') &
               to_upper(this%name)
          call stop_program(err_msg)
          return
       end if
       if(trim(adjustl(buffer)).eq."") cycle tag_loop

       ! Check for end of layer card
       !------------------------------------------------------------------------
       if(trim(adjustl(buffer)).eq."END OPTIMISER")then
          backspace(unit)
          exit tag_loop
       end if

       tag=trim(adjustl(buffer))
       if(scan(buffer,"=").ne.0) tag=trim(tag(:scan(tag,"=")-1))

       ! Read parameters from save file
       !------------------------------------------------------------------------
       select case(trim(tag))
       case("NAME")
          call assign_val(buffer, this%name, itmp1)
       case("LEARNING_RATE")
          call assign_val(buffer, this%learning_rate, itmp1)
       case("ITERATION")
          call assign_val(buffer, this%iter, itmp1)
       case("EPOCH")
          call assign_val(buffer, this%epoch, itmp1)
       case("REGULARISATION")
          call assign_val(buffer, this%regularisation, itmp1)
       case default
          ! Don't look for "e" due to scientific notation of numbers
          ! ... i.e. exponent (E+00)
          if(scan(to_lower(trim(adjustl(buffer))),&
               'abcdfghijklmnopqrstuvwxyz').eq.0)then
             cycle tag_loop
          elseif(tag(:3).eq.'END')then
             cycle tag_loop
          end if
          write(err_msg,'("Unrecognised line in input file: ",A)') &
               trim(adjustl(buffer))
          call stop_program(err_msg)
          return
       end select
    end do tag_loop


    ! Check for end of layer card
    !---------------------------------------------------------------------------
    read(unit,'(A)') buffer
    if(trim(adjustl(buffer)).ne."END OPTIMISER")then
       write(0,*) trim(adjustl(buffer))
       write(err_msg,'("END OPTIMISER not where expected")')
       call stop_program(err_msg)
       return
    end if

  end subroutine read_base
!################################################################################


!##############################################################################!
! * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * !
!##############################################################################!


!###############################################################################
  module function optimiser_setup_sgd( &
       learning_rate, momentum, &
       nesterov, num_params, &
       regulariser, clip_dict, lr_decay) result(optimiser)
    !! Set up the SGD optimiser
    implicit none

    ! Arguments
    real(real32), optional, intent(in) :: learning_rate, momentum
    !! Learning rate and momentum
    logical, optional, intent(in) :: nesterov
    !! Nesterov momentum
    integer, optional, intent(in) :: num_params
    !! Number of parameters
    class(base_regulariser_type), optional, intent(in) :: regulariser
    !! Regularisation method
    type(clip_type), optional, intent(in) :: clip_dict
    !! Clipping dictionary
    class(base_lr_decay_type), optional, intent(in) :: lr_decay
    !! Learning rate decay method

    type(sgd_optimiser_type) :: optimiser
    !! Instance of the SGD optimiser

    ! Local variables
    integer :: num_params_
    !! Number of parameters


    ! Initialise optimiser name
    optimiser%name = "sgd"

    ! Apply regularisation
    if(present(regulariser))then
       optimiser%regularisation = .true.
       if(allocated(optimiser%regulariser)) deallocate(optimiser%regulariser)
       allocate(optimiser%regulariser, source = regulariser)
    end if

    ! Apply clipping
    if(present(clip_dict)) optimiser%clip_dict = clip_dict

    ! Initialise general optimiser parameters
    if(present(learning_rate)) optimiser%learning_rate = learning_rate
    if(present(momentum)) optimiser%momentum = momentum

    ! Initialise learning rate decay
    if(present(lr_decay))then
       if(allocated(optimiser%lr_decay)) deallocate(optimiser%lr_decay)
       allocate(optimiser%lr_decay, source = lr_decay)
    else
       allocate(optimiser%lr_decay, source = base_lr_decay_type())
    end if

    ! Initialise nesterov boolean
    if(present(nesterov)) optimiser%nesterov = nesterov

    ! Initialise gradients
    if(present(num_params))then
       num_params_ = num_params
    else
       num_params_ = 1
    end if
    call optimiser%init_gradients(num_params_)
  end function optimiser_setup_sgd
!###############################################################################


!###############################################################################
  pure subroutine init_gradients_sgd(this, num_params)
    !! Initialise gradients for SGD optimiser
    implicit none

    ! Arguments
    class(sgd_optimiser_type), intent(inout) :: this
    !! Instance of the SGD optimiser
    integer, intent(in) :: num_params
    !! Number of parameters


    ! Initialise gradients
    if(allocated(this%velocity)) deallocate(this%velocity)
    allocate(this%velocity(num_params), source=0._real32)
  end subroutine init_gradients_sgd
!###############################################################################


!###############################################################################
  pure subroutine minimise_sgd(this, param, gradient)
    !! Apply gradients to parameters to minimise loss using SGD optimiser
    implicit none

    ! Arguments
    class(sgd_optimiser_type), intent(inout) :: this
    !! Instance of the SGD optimiser
    real(real32), dimension(:), intent(inout) :: param
    !! Parameters
    real(real32), dimension(:), intent(inout) :: gradient
    !! Gradients

    ! Local variables
    real(real32) :: learning_rate
    !! Learning rate


    ! Decay learning rate and update iteration
    learning_rate = this%lr_decay%get_lr(this%learning_rate, this%iter)

    ! Apply regularisation
    if(this%regularisation) &
         call this%regulariser%regularise( param, gradient, learning_rate )

    gradient = - learning_rate * gradient
    ! Update parameters
    if(this%momentum.gt.1.E-8_real32)then
       !! Adaptive learning method
       this%velocity = this%momentum * this%velocity + gradient
       if(this%nesterov)then
          param = param + this%momentum * this%velocity + gradient
       else
          param = param + this%velocity
       end if
    else
       ! Standard learning method
       this%velocity = gradient
       param = param + this%velocity
    end if
  end subroutine minimise_sgd
!###############################################################################


!##############################################################################!
! * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * !
!##############################################################################!


!###############################################################################
  module function optimiser_setup_rmsprop( &
       learning_rate, beta, epsilon, &
       num_params, regulariser, clip_dict, lr_decay &
  ) result(optimiser)
    !! Set up the RMSprop optimiser
    implicit none

    ! Arguments
    real(real32), optional, intent(in) :: learning_rate
    !! Learning rate
    real(real32), optional, intent(in) :: beta, epsilon
    !! Beta and epsilon
    integer, optional, intent(in) :: num_params
    !! Number of parameters
    class(base_regulariser_type), optional, intent(in) :: regulariser
    !! Regularisation method
    type(clip_type), optional, intent(in) :: clip_dict
    !! Clipping dictionary
    class(base_lr_decay_type), optional, intent(in) :: lr_decay
    !! Learning rate decay method

    type(rmsprop_optimiser_type) :: optimiser
    !! Instance of the RMSprop optimiser

    ! Local variables
    integer :: num_params_
    !! Number of parameters


    ! Initialise optimiser name
    optimiser%name = "rmsprop"

    ! Apply regularisation
    if(present(regulariser))then
       optimiser%regularisation = .true.
       if(allocated(optimiser%regulariser)) deallocate(optimiser%regulariser)
       allocate(optimiser%regulariser, source = regulariser)
    end if

    ! Apply clipping
    if(present(clip_dict)) optimiser%clip_dict = clip_dict

    ! Initialise general optimiser parameters
    if(present(learning_rate)) optimiser%learning_rate = learning_rate

    ! Initialise learning rate decay
    if(present(lr_decay))then
       if(allocated(optimiser%lr_decay)) deallocate(optimiser%lr_decay)
       allocate(optimiser%lr_decay, source = lr_decay)
    else
       allocate(optimiser%lr_decay, source = base_lr_decay_type())
    end if

    ! Initialise RMSprop parameters
    if(present(beta)) optimiser%beta = beta
    if(present(epsilon)) optimiser%epsilon = epsilon

    ! Initialise gradients
    if(present(num_params))then
       num_params_ = num_params
    else
       num_params_ = 1
    end if
    call optimiser%init_gradients(num_params_)
  end function optimiser_setup_rmsprop
!###############################################################################


!###############################################################################
  pure subroutine init_gradients_rmsprop(this, num_params)
    !! Initialise gradients for RMSprop optimiser
    implicit none

    ! Arguments
    class(rmsprop_optimiser_type), intent(inout) :: this
    !! Instance of the RMSprop optimiser
    integer, intent(in) :: num_params
    !! Number of parameters


    ! Initialise gradients
    if(allocated(this%moving_avg)) deallocate(this%moving_avg)
    allocate(this%moving_avg(num_params), source=0._real32)
  end subroutine init_gradients_rmsprop
!###############################################################################


!###############################################################################
  pure subroutine minimise_rmsprop(this, param, gradient)
    !! Apply gradients to parameters to minimise loss using RMSprop optimiser
    implicit none

    ! Arguments
    class(rmsprop_optimiser_type), intent(inout) :: this
    !! Instance of the RMSprop optimiser
    real(real32), dimension(:), intent(inout) :: param
    !! Parameters
    real(real32), dimension(:), intent(inout) :: gradient
    !! Gradients

    ! Local variables
    real(real32) :: learning_rate
    !! Learning rate


    ! Decay learning rate and update iteration
    learning_rate = this%lr_decay%get_lr(this%learning_rate, this%iter)

    ! Apply regularisation
    if(this%regularisation) &
         call this%regulariser%regularise( param, gradient, learning_rate )

    this%moving_avg = this%beta * this%moving_avg + &
         (1._real32 - this%beta) * gradient ** 2._real32

    ! Update parameters
    param = param - learning_rate * gradient / &
         (sqrt(this%moving_avg + this%epsilon))
  end subroutine minimise_rmsprop
!###############################################################################


!##############################################################################!
! * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * !
!##############################################################################!


!###############################################################################
  module function optimiser_setup_adagrad( &
       learning_rate, epsilon, &
       num_params, regulariser, clip_dict, lr_decay &
  ) result(optimiser)
    !! Set up the Adagrad optimiser
    implicit none

    ! Arguments
    real(real32), optional, intent(in) :: learning_rate
    !! Learning rate
    real(real32), optional, intent(in) :: epsilon
    !! Epsilon
    integer, optional, intent(in) :: num_params
    !! Number of parameters
    class(base_regulariser_type), optional, intent(in) :: regulariser
    !! Regularisation method
    type(clip_type), optional, intent(in) :: clip_dict
    !! Clipping dictionary
    class(base_lr_decay_type), optional, intent(in) :: lr_decay
    !! Learning rate decay method

    type(adagrad_optimiser_type) :: optimiser
    !! Instance of the Adagrad optimiser

    ! Local variables
    integer :: num_params_
    !! Number of parameters


    ! Initialise optimiser name
    optimiser%name = "adagrad"

    ! Apply regularisation
    if(present(regulariser))then
       optimiser%regularisation = .true.
       if(allocated(optimiser%regulariser)) deallocate(optimiser%regulariser)
       allocate(optimiser%regulariser, source = regulariser)
    end if

    ! Apply clipping
    if(present(clip_dict)) optimiser%clip_dict = clip_dict

    ! Initialise general optimiser parameters
    if(present(learning_rate)) optimiser%learning_rate = learning_rate

    ! Initialise learning rate decay
    if(present(lr_decay))then
       if(allocated(optimiser%lr_decay)) deallocate(optimiser%lr_decay)
       allocate(optimiser%lr_decay, source = lr_decay)
    else
       allocate(optimiser%lr_decay, source = base_lr_decay_type())
    end if

    ! Initialise Adagrad parameters
    if(present(epsilon)) optimiser%epsilon = epsilon

    ! Initialise gradients
    if(present(num_params))then
       num_params_ = num_params
    else
       num_params_ = 1
    end if
    call optimiser%init_gradients(num_params_)
  end function optimiser_setup_adagrad
!###############################################################################


!###############################################################################
  pure subroutine init_gradients_adagrad(this, num_params)
    !! Initialise gradients for Adagrad optimiser
    implicit none

    ! Arguments
    class(adagrad_optimiser_type), intent(inout) :: this
    !! Instance of the Adagrad optimiser
    integer, intent(in) :: num_params
    !! Number of parameters


    ! Initialise gradients
    if(allocated(this%sum_squares)) deallocate(this%sum_squares)
    allocate(this%sum_squares(num_params), source=0._real32)
  end subroutine init_gradients_adagrad
!###############################################################################


!###############################################################################
  pure subroutine minimise_adagrad(this, param, gradient)
    !! Apply gradients to parameters to minimise loss using Adagrad optimiser
    implicit none

    ! Arguments
    class(adagrad_optimiser_type), intent(inout) :: this
    !! Instance of the Adagrad optimiser
    real(real32), dimension(:), intent(inout) :: param
    !! Parameters
    real(real32), dimension(:), intent(inout) :: gradient
    !! Gradients

    real(real32) :: learning_rate
    !! Learning rate


    ! Decay learning rate and update iteration
    learning_rate = this%lr_decay%get_lr(this%learning_rate, this%iter)

    ! Apply regularisation
    if(this%regularisation) &
         call this%regulariser%regularise( param, gradient, learning_rate )

    this%sum_squares = this%sum_squares + gradient ** 2._real32

    ! Update parameters
    param = param - learning_rate * gradient / &
         (sqrt(this%sum_squares + this%epsilon))
  end subroutine minimise_adagrad
!###############################################################################


!##############################################################################!
! * * * * * * * * * * * * * * * * * * *  * * * * * * * * * * * * * * * * * * * !
!##############################################################################!


!###############################################################################
  module function optimiser_setup_adam( &
       learning_rate, beta1, beta2, epsilon, &
       num_params, regulariser, clip_dict, lr_decay &
  ) result(optimiser)
    !! Set up the Adam optimiser
    implicit none

    ! Arguments
    real(real32), optional, intent(in) :: learning_rate
    !! Learning rate
    real(real32), optional, intent(in) :: beta1, beta2, epsilon
    !! Beta1, beta2, and epsilon
    integer, optional, intent(in) :: num_params
    !! Number of parameters
    class(base_regulariser_type), optional, intent(in) :: regulariser
    !! Regularisation method
    type(clip_type), optional, intent(in) :: clip_dict
    !! Clipping dictionary
    class(base_lr_decay_type), optional, intent(in) :: lr_decay
    !! Learning rate decay method

    type(adam_optimiser_type) :: optimiser
    !! Instance of the Adam optimiser

    ! Local variables
    integer :: num_params_
    !! Number of parameters


    ! Initialise optimiser name
    optimiser%name = "adam"

    ! Apply regularisation
    if(present(regulariser))then
       optimiser%regularisation = .true.
       if(allocated(optimiser%regulariser)) deallocate(optimiser%regulariser)
       allocate(optimiser%regulariser, source = regulariser)
    end if

    ! Apply clipping
    if(present(clip_dict)) optimiser%clip_dict = clip_dict

    ! Initialise general optimiser parameters
    if(present(learning_rate)) optimiser%learning_rate = learning_rate

    ! Initialise learning rate decay
    if(present(lr_decay))then
       if(allocated(optimiser%lr_decay)) deallocate(optimiser%lr_decay)
       allocate(optimiser%lr_decay, source = lr_decay)
    else
       allocate(optimiser%lr_decay, source = base_lr_decay_type())
    end if

    ! Initialise Adam parameters
    if(present(beta1)) optimiser%beta1 = beta1
    if(present(beta2)) optimiser%beta2 = beta2
    if(present(epsilon)) optimiser%epsilon = epsilon

    ! Initialise gradients
    if(present(num_params))then
       num_params_ = num_params
    else
       num_params_ = 1
    end if
    call optimiser%init_gradients(num_params_)
  end function optimiser_setup_adam
!###############################################################################


!###############################################################################
  pure subroutine init_gradients_adam(this, num_params)
    !! Initialise gradients for Adam optimiser
    implicit none

    ! Arguments
    class(adam_optimiser_type), intent(inout) :: this
    !! Instance of the Adam optimiser
    integer, intent(in) :: num_params
    !! Number of parameters


    ! Initialise gradients
    if(allocated(this%m)) deallocate(this%m)
    if(allocated(this%v)) deallocate(this%v)
    allocate(this%m(num_params), source=0._real32)
    allocate(this%v(num_params), source=0._real32)
  end subroutine init_gradients_adam
!###############################################################################


!###############################################################################
  pure subroutine minimise_adam(this, param, gradient)
    !! Apply gradients to parameters to minimise loss using Adam optimiser
    implicit none

    ! Arguments
    class(adam_optimiser_type), intent(inout) :: this
    !! Instance of the Adam optimiser
    real(real32), dimension(:), intent(inout) :: param
    !! Parameters
    real(real32), dimension(:), intent(inout) :: gradient
    !! Gradients

    ! Local variables
    real(real32) :: learning_rate, bias_correction1, bias_correction2
    !! Learning rate


    ! Decay learning rate and update iteration
    learning_rate = this%lr_decay%get_lr(this%learning_rate, this%iter)

    ! Apply regularisation
    if(this%regularisation) &
         call this%regulariser%regularise( param, gradient, learning_rate )

    ! Adaptive learning method
    this%m = this%beta1 * this%m + &
         (1._real32 - this%beta1) * gradient
    this%v = this%beta2 * this%v + &
         (1._real32 - this%beta2) * gradient * gradient

    ! Bias corrections
    bias_correction1 = 1._real32 - this%beta1**this%iter
    bias_correction2 = 1._real32 - this%beta2**this%iter

    ! Update parameters
    associate( &
         m_hat => this%m / bias_correction1, &
         v_hat => this%v / bias_correction2 )
       if(this%regularisation .and. allocated(this%regulariser))then
          select type(regulariser => this%regulariser)
          type is (l2_regulariser_type)
             select case(regulariser%decoupled)
             case(.true.)
                ! decoupled weight decay (AdamW)
                param = param - learning_rate * regulariser%l2 * param
                param = param - learning_rate * ( m_hat / (sqrt(v_hat) + this%epsilon) )
             case(.false.)
                ! classical L2 regularisation (included in gradient)
                param = param - learning_rate * ( &
                     ( m_hat + regulariser%l2 * param ) / &
                     ( sqrt(v_hat) + this%epsilon ) &
                )
             end select
          class default
             ! unknown regulariser — fall back to standard Adam
             param = param - learning_rate * ( &
                  m_hat / (sqrt(v_hat) + this%epsilon) )
          end select
       else
          ! no regularisation — standard Adam
          param = param - learning_rate * ( &
               m_hat / (sqrt(v_hat) + this%epsilon) )
       end if
    end associate
  end subroutine minimise_adam
!###############################################################################

end module athena__optimiser