In-place gradient w.r.t. packed kernel params
Accumulate gradients over all edges. With `g` the upstream gradient of one edge and `relu_mask` equal to 1 where `U dx + b_u > 0` and 0 elsewhere, each edge contributes:

- d(kappa_e)/dU = (relu_mask * (V^T g)) outer dx
- d(kappa_e)/db_u = relu_mask * (V^T g)
- d(kappa_e)/dV = g outer relu(U dx + b_u)
- d(kappa_e)/db_v = g
| Type | Intent | Optional | Attributes | Name | Description |
|---|---|---|---|---|---|
| class(array_type) | intent(in) | | | this | Forward result node containing saved operands |
| real(kind=real32) | intent(in) | | dimension(:,:) | upstream_grad | Upstream gradient values |
| real(kind=real32) | intent(out) | | dimension(:,:) | output | Output gradient values for packed kernel parameters |
pure subroutine get_partial_gno_kernel_params_val( &
     this, upstream_grad, output)
  !! Gradient w.r.t. the packed kernel parameters, written into `output`
  !!
  !! The packed parameter vector is read from column 1 of
  !! `this%right_operand%val` and is laid out as consecutive blocks
  !!   [ U (H*d, column-major) | b_u (H) | V (F*H, column-major) | b_v (F) ]
  !! with d = indices(1), H = indices(2), F = indices(3)*indices(4).
  !! Column 1 of `output` uses the same layout.
  !!
  !! For every edge e (one column of `this%left_operand%val`), with
  !! g = upstream_grad(:,e), z = U dx + b_u and mask = (z > 0), the
  !! following contributions are accumulated:
  !!   grad b_v += g
  !!   grad V   += g outer relu(z)
  !!   grad b_u += mask * (V^T g)
  !!   grad U   += (mask * (V^T g)) outer dx
  !!
  !! All of `output` is zeroed first; only column 1 receives gradients.
  !! NOTE(review): `upstream_grad` is assumed to have F rows and at least
  !! as many columns as there are edges, and `output` at least
  !! H*d + H + F*H + F rows -- confirm against the caller.
  implicit none

  ! Arguments
  class(array_type), intent(in) :: this
  !! Forward result node containing saved operands
  real(real32), dimension(:,:), intent(in) :: upstream_grad
  !! Upstream gradient values
  real(real32), dimension(:,:), intent(out) :: output
  !! Output gradient values for packed kernel parameters

  ! Local variables
  integer :: n_in, n_hid, n_out, n_edge
  !! Unpacked dimensions (d, H, F) and number of edges
  integer :: off_u, off_bu, off_v, off_bv
  !! Flat offsets for packed kernel parameter blocks
  integer :: e, i, j, base
  !! Loop indices and start offset of the column being updated
  real(real32), allocatable :: w_u(:,:), b_u(:), w_v(:,:)
  !! Unpacked kernel parameter tensors: U [H,d], b_u [H], V [F,H]
  real(real32), allocatable :: dx(:), pre_act(:), hidden(:)
  !! Per-edge buffers for input and activations
  real(real32), allocatable :: grad_hidden(:)
  !! Hidden-layer gradient buffer [H]

  n_in   = this%indices(1)
  n_hid  = this%indices(2)
  n_out  = this%indices(3) * this%indices(4)
  n_edge = size(this%left_operand%val, 2)

  off_u  = 0
  off_bu = n_hid * n_in
  off_v  = off_bu + n_hid
  off_bv = off_v + n_out * n_hid

  ! Unpack the parameters from the flat vector (column-major blocks)
  allocate(w_u(n_hid, n_in), b_u(n_hid), w_v(n_out, n_hid))
  w_u = reshape(this%right_operand%val(off_u+1:off_bu, 1), [n_hid, n_in])
  b_u = this%right_operand%val(off_bu+1:off_v, 1)
  w_v = reshape(this%right_operand%val(off_v+1:off_bv, 1), [n_out, n_hid])

  allocate(dx(n_in), pre_act(n_hid), hidden(n_hid), grad_hidden(n_hid))

  output = 0.0_real32

  do e = 1, n_edge
    ! Recompute the forward hidden activations for this edge
    dx      = this%left_operand%val(:, e)
    pre_act = matmul(w_u, dx) + b_u
    hidden  = max(pre_act, 0.0_real32)

    ! --- d/d(b_v): upstream_grad(:,e) directly ---
    ! The section is bounded explicitly so it always has exactly F
    ! elements, independent of how many rows `output` has.
    output(off_bv+1:off_bv+n_out, 1) = &
         output(off_bv+1:off_bv+n_out, 1) + upstream_grad(:, e)

    ! --- d/dV: grad_V(:,j) += upstream(:,e) * hidden(j) ---
    ! Column j of V occupies a contiguous run of F entries in the
    ! packed vector, so each column is updated as one array section.
    do j = 1, n_hid
      base = off_v + (j - 1) * n_out
      output(base+1:base+n_out, 1) = &
           output(base+1:base+n_out, 1) + upstream_grad(:, e) * hidden(j)
    end do

    ! --- Backprop through relu: grad_hidden = (V^T g) * relu' ---
    ! matmul(vector, matrix) yields sum_f g(f) * V(f,h), i.e. V^T g,
    ! without forming the transpose on every edge.
    grad_hidden = matmul(upstream_grad(:, e), w_v)
    where (pre_act <= 0.0_real32) grad_hidden = 0.0_real32

    ! --- d/d(b_u): grad_hidden directly ---
    output(off_bu+1:off_v, 1) = output(off_bu+1:off_v, 1) + grad_hidden

    ! --- d/dU: grad_U(:,i) += grad_hidden * dx(i) ---
    do i = 1, n_in
      base = off_u + (i - 1) * n_hid
      output(base+1:base+n_hid, 1) = &
           output(base+1:base+n_hid, 1) + grad_hidden * dx(i)
    end do
  end do

  ! Local allocatables are released automatically on return.
end subroutine get_partial_gno_kernel_params_val