Merge pull request #609 from denizyuret/dy/data
Fix #606 gcnode issue
denizyuret committed Aug 28, 2020
2 parents 2754cd6 + 6c92456 commit b720020
Showing 51 changed files with 1,285 additions and 278 deletions.
6 changes: 3 additions & 3 deletions Artifacts.toml
@@ -1,11 +1,11 @@
 [[libknet8]]
 arch = "x86_64"
-git-tree-sha1 = "5e1e317677e88277f0ee67ab9e17587a8edc4f7a"
+git-tree-sha1 = "172aab1c490da4e9f26ed08c3b8e99cd86e8d7e7"
 os = "linux"

 [[libknet8.download]]
-sha256 = "2ef57e8bf25eb00597345bacc2ba2b4cb6182b3ace46dfbd6b6d2981e6998764"
-url = "https://github.com/denizyuret/Knet.jl/releases/download/v1.3.8/libknet8.x86_64-linux-gnu.tar.gz"
+sha256 = "84a33826b8dd45451852be25bdc84a070f9bc418b09ab093cce5682abb54377a"
+url = "https://github.com/denizyuret/Knet.jl/releases/download/v1.4.0/libknet8.x86_64-linux-gnu.tar.gz"

 [[libknet8]]
 arch = "x86_64"
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
 name = "Knet"
 uuid = "1902f260-5fb4-5aff-8c31-6271790ab950"
 authors = ["Deniz Yuret <[email protected]>"]
-version = "1.4.0"
+version = "1.4.1"

 [deps]
 AutoGrad = "6710c13c-97f1-543f-91c5-74e8f7d95b35"
34 changes: 8 additions & 26 deletions README.md
@@ -47,8 +47,8 @@ Here is a simple example where we define, train and test the
 using 15 lines of code and 10 seconds of GPU computation.

 ```julia
-# Install packages before first run: using Pkg; pkg"add Knet IterTools CodecZlib"
-using Knet, IterTools, CodecZlib
+# Install packages before first run: using Pkg; pkg"add Knet IterTools MLDatasets"
+using Knet, IterTools, MLDatasets

 # Define convolutional layer:
 struct Conv; w; b; end
@@ -66,13 +66,15 @@ struct Chain; layers; end
 (c::Chain)(x,y) = nll(c(x),y)

 # Load MNIST data:
-include(Knet.dir("data","mnist.jl"))
-dtrn, dtst = mnistdata()
+xtrn,ytrn = MNIST.traindata(Float32); ytrn[ytrn.==0] .= 10
+xtst,ytst = MNIST.testdata(Float32); ytst[ytst.==0] .= 10
+dtrn = minibatch(xtrn, ytrn, 100; xsize = (28,28,1,:))
+dtst = minibatch(xtst, ytst, 100; xsize = (28,28,1,:))

 # Define and train LeNet (~10 secs on a GPU or ~3 mins on a CPU to reach ~99% accuracy)
 LeNet = Chain((Conv(5,5,1,20), Conv(5,5,20,50), Dense(800,500,f=relu), Dense(500,10)))
 progress!(adam(LeNet, ncycle(dtrn,3)))
-accuracy(LeNet,dtst)
+accuracy(LeNet,data=dtst)
 ```

 ## Contributing
@@ -81,24 +83,4 @@ Knet is an open-source project and we are always open to new contributions: bug
 fixes, feature requests and contributions, new machine learning models and operators,
 inspiring examples, benchmarking results are all welcome. See [Tips for Developers](https://denizyuret.github.io/Knet.jl/latest/install/#Tips-for-developers) for instructions.

-Current contributors:
-
-* Can Gümeli
-* Carlo Lucibello
-* Ekin Akyürek
-* Ekrem Emre Yurdakul
-* Emre Ünal
-* Emre Yolcu
-* Enis Berk
-* Erenay Dayanık
-* İlker Kesen
-* Kai Xu
-* Meriç Melike Softa
-* Mike Innes
-* Onur Kuru
-* Ozan Arkan Can
-* Ömer Kırnap
-* Phuoc Nguyen
-* Rene Donner
-* Tim Besard
-* Zhang Shiwei
+Contributors: Can Gümeli, Carlo Lucibello, Ekin Akyürek, Ekrem Emre Yurdakul, Emre Ünal, Emre Yolcu, Enis Berk, Erenay Dayanık, İlker Kesen, Kai Xu, Meriç Melike Softa, Mike Innes, Onur Kuru, Ozan Arkan Can, Ömer Kırnap, Phuoc Nguyen, Rene Donner, Tim Besard, Zhang Shiwei.
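
A note on the new data-loading lines in the example above: Knet's `nll` and `accuracy` expect integer class labels in `1:C`, while `MLDatasets.MNIST` labels digits as `0:9`, so the zero digit is remapped to class 10; the `xsize=(28,28,1,:)` keyword reshapes each minibatch into the width x height x channels x batch layout the convolutional layers expect. A minimal sketch of the remapping step (assumes MLDatasets is installed):

```julia
# Sketch: remap MLDatasets' 0:9 digit labels to the 1:10 classes nll expects.
using MLDatasets: MNIST
xtrn, ytrn = MNIST.traindata(Float32)   # ytrn holds digits 0:9
ytrn[ytrn .== 0] .= 10                  # digit 0 becomes class 10
@assert extrema(ytrn) == (1, 10)
```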
25 changes: 19 additions & 6 deletions src/Knet.jl
@@ -1,28 +1,41 @@
 module Knet

-"Construct a path relative to Knet root, e.g. Knet.dir(\"examples\") => \"~/.julia/dev/Knet/examples\""
-dir(path...)=joinpath(dirname(@__DIR__),path...)
+"Default array and element type used by Knet, override by setting Knet.atype() or Knet.array_type[]"
+atype() = array_type[]
+atype(x) = convert(atype(),x)
+const array_type = Ref{Type}(Array{Float32})

 include("libknet8/LibKnet8.jl")
 include("knetarrays/KnetArrays.jl")
 include("cuarrays/CuArrays.jl")
 include("autograd_gpu/AutoGrad_gpu.jl")
 include("ops20/Ops20.jl")
 include("ops20_gpu/Ops20_gpu.jl")
 include("ops21/Ops21.jl")
 include("ops21_gpu/Ops21_gpu.jl")
 include("fileio_gpu/FileIO_gpu.jl")
 include("train20/Train20.jl")
+# include("layers21/Layers21.jl")

 # See if we have a gpu at initialization:
 import AutoGrad, CUDA
 function __init__()
     if CUDA.functional()
-        Knet.Train20.array_type[] = Knet.KnetArrays.KnetArray{Float32}
+        if isempty(Knet.LibKnet8.libknet8)
+            @warn "libknet8 library not found, some GPU functionality may not be available, try reinstalling Knet."
+        end
+        Knet.array_type[] = Knet.KnetArrays.KnetArray{Float32}
         AutoGrad.set_gc_function(Knet.KnetArrays.cuallocator[] ? Knet.AutoGrad_gpu.gcnode : Knet.AutoGrad_gpu.knetgcnode)
-        mem(d) = (CUDA.device!(d); m = CUDA.available_memory(); CUDA.device_reset!(); m)
-        CUDA.device!(argmax(Dict(d=>mem(d) for d in CUDA.devices())))
+        if CUDA.has_nvml() # Pick the device with highest memory
+            mem(d) = CUDA.NVML.memory_info(CUDA.NVML.Device(CUDA.uuid(d))).free
+            CUDA.device!(argmax(Dict(d=>mem(d) for d in CUDA.devices())))
+        end
     end
 end

+"Construct a path relative to Knet root, e.g. Knet.dir(\"examples\") => \"~/.julia/dev/Knet/examples\""
+dir(path...)=joinpath(dirname(@__DIR__),path...)
+
 # Match export list with v1.3.9 for backward compatibility
 using AutoGrad #: @diff, AutoGrad, Param, cat1d, grad, gradloss, params, value
 using Knet.LibKnet8 #: libknet8, @knet8, @knet8r, gpu
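The `src/Knet.jl` change above replaces the `Knet.Train20.array_type` global with a package-level `Knet.array_type` ref plus an `atype` accessor. A hedged sketch of how that API can be exercised; the `Array{Float64}` override is an illustrative choice, not part of the commit:

```julia
# Sketch of the default-array-type mechanism introduced above.
using Knet

x32 = Knet.atype(rand(4, 4))         # converts to array_type[], Array{Float32} on CPU
Knet.array_type[] = Array{Float64}   # hypothetical override of the default
x64 = Knet.atype(rand(4, 4))         # now converts to Array{Float64}
@assert eltype(x64) == Float64
```

On a functional GPU, `__init__` sets `array_type[]` to `KnetArray{Float32}` instead, so the same `atype` calls return device arrays.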
2 changes: 1 addition & 1 deletion src/autograd_gpu/AutoGrad_gpu.jl
@@ -7,12 +7,12 @@ import CUDA, Knet, AutoGrad
 using CUDA: CuArray, CuPtr, functional
 using Knet.KnetArrays: DevArray, KnetArray, Cptr, cuallocator
 using Knet.LibKnet8: @knet8
+using Knet.CuArrays: cuarrays
 using AutoGrad: AutoGrad, Sparse, recording, Result, Node, Tape, Value, Arg, value, set_gc_function
 using Base.Broadcast: Broadcasted

 include("addto.jl")
 include("convert.jl")
-include("cuarrays.jl")
 include("getindex.jl")
 include("sparse.jl")
78 changes: 50 additions & 28 deletions src/autograd_gpu/gcnode.jl
@@ -1,4 +1,7 @@
-using CUDA: CuArray
+using CUDA: CuArray, unsafe_free!
+# Most of the time is spent in cuarrays
+using Knet.CuArrays: cuarrays
+using Knet.KnetArrays: cuallocator
 using AutoGrad: Result, Node, Tape

 # During the back pass we want to make pointers available as soon as we can to save memory
@@ -15,65 +18,84 @@

 # The gcnode_queue maps CuArrays to the first index on tape they have a reference to. We use
 # Base.Order.Reverse because we want to free the pointers with highest indices first.
-# Using WeakRef to allow garbage collection.
-const gcnode_queue = PriorityQueue{WeakRef,Int}(Base.Order.Reverse)
+# Using ObjectId to allow fast hashing; CuArrays are hashed slowly, like Arrays.
+const gcnode_queue = PriorityQueue{UInt,Int}(Base.Order.Reverse)
+
+# To call unsafe_free! we need to find which CuArray belongs to an objectid.
+# Using WeakRef on CuArray values to allow garbage collection.
+const gcnode_dict = Dict{UInt,WeakRef}()
+
+# We use node indices on the tape as values in the priority queue.
+# Using ObjectId(::Node) for keys just in case.
+const gcnode_index = Dict{UInt,Int}()
+
+# Reset everything if the Tape changes:
+gcnode_tape = WeakRef(nothing)

 # During the backward step parents of a node (who have lower indices) may have their
 # outgrads modified, thus new CuArray references may appear. We want to keep the smallest
 # index for each CuArray.
-function gcnode_minidx!(q::PriorityQueue{WeakRef,Int,typeof(Base.Order.Reverse)},k::CuArray,v::Int)
-    if v < get(q,k,typemax(Int)); q[WeakRef(k)]=v; end  ## 0.190μs
+function gcnode_setindex!(c::CuArray,v::Int)
+    cid = objectid(c)
+    get!(gcnode_dict, cid) do; WeakRef(c); end
+    if v < get(gcnode_queue,cid,typemax(Int))
+        gcnode_queue[cid] = v
+    end
 end

-const gcnode_index = WeakKeyDict{Node,Int}()
-gcnode_tape = WeakRef(nothing)
-
-function gcnode_init(tape::Tape)  ## 2.35ms
-    global gcnode_tape, gcnode_index, gcnode_queue
-    gcnode_tape = WeakRef(tape)
+function gcnode_init(tape::Tape)
+    global gcnode_tape = WeakRef(tape)
     empty!(gcnode_index)
     empty!(gcnode_queue)
+    empty!(gcnode_dict)
     tape isa Tape || return
     @inbounds for (i,n) in enumerate(tape.list)
-        gcnode_index[n] = i
+        gcnode_index[objectid(n)] = i
         if n.Value isa Result
-            for k in cuarrays(n.Value.value); gcnode_queue[WeakRef(k)] = 0; end  # pointers with index 0 will never get gc'ed
-            for k in cuarrays(n.outgrad); get!(gcnode_queue,WeakRef(k),i); end   # this only sets gcnode_queue[k] if it does not have a value
+            for c in cuarrays(n.Value.value); gcnode_setindex!(c,0); end  # pointers with index 0 will never get gc'ed
+            for c in cuarrays(n.outgrad); gcnode_setindex!(c,i); end      # this only sets gcnode_queue[c] if it was not seen
         else # n.Value isa Param
-            for k in cuarrays(n.Value.value); gcnode_queue[WeakRef(k)] = 0; end
-            for k in cuarrays(n.outgrad); gcnode_queue[WeakRef(k)] = 0; end
+            for c in cuarrays(n.Value.value); gcnode_setindex!(c,0); end
+            for c in cuarrays(n.outgrad); gcnode_setindex!(c,0); end
         end
     end
 end

 function gcnode(n::Node, tape::Tape)  ## 16.3μs
-    global gcnode_tape, gcnode_index, gcnode_queue
-    tape !== gcnode_tape.value && gcnode_init(tape)  ## 2μs amortized
+    cuallocator[] || return knetgcnode(n,tape)
+    if tape !== gcnode_tape.value
+        gcnode_init(tape)
+    end
     tape isa Tape || return
-    ni = gcnode_index[n]
+    ni = gcnode_index[objectid(n)]
     if n.Value isa Result # && n.outgrad isa KnetArray
-        for ptr in cuarrays(n.outgrad); gcnode_minidx!(gcnode_queue, ptr, ni); end
+        for c in cuarrays(n.outgrad); gcnode_setindex!(c, ni); end
     end
     @inbounds for i in 1:length(n.parents);  ## 2.43μs
         isassigned(n.parents, i) || continue
         parent = n.parents[i]
         if parent.Value isa Result
-            pi = gcnode_index[parent]
-            for ptr in cuarrays(parent.outgrad); gcnode_minidx!(gcnode_queue, ptr, pi); end
+            pi = gcnode_index[objectid(parent)]
+            for c in cuarrays(parent.outgrad); gcnode_setindex!(c, pi); end
         else
-            for ptr in cuarrays(parent.outgrad); gcnode_queue[WeakRef(ptr)] = 0; end  # protect Params
+            for c in cuarrays(parent.outgrad); gcnode_setindex!(c,0); end  # protect Params
         end
     end
     while !isempty(gcnode_queue) && peek(gcnode_queue)[2] >= ni  ## 5.62μs
-        (k,v) = dequeue_pair!(gcnode_queue)  ## 0.787μs
-        k = k.value
-        if v != ni; @warn("k=$((k.ptr,k.len)) v=$v ni=$ni", maxlog=1); end  ## 0.160μs
-        #DBG verifypointer(tape, ni, k)
-        unsafe_free!(k)  ## 4.06μs
+        (cid,v) = dequeue_pair!(gcnode_queue)  ## 0.787μs
+        c = gcnode_dict[cid].value
+        if v == ni
+            unsafe_free!(c)  ## 4.06μs
+        else
+            @warn("gcnode error: c=$(summary(c)) v=$v ni=$ni", maxlog=1)  ## 0.160μs
+        end
     end
     if n.Value isa Result
        n.Value, n.outgrad = gcnode_null, nothing
     end
 end

 const gcnode_null = Result{Nothing}(nothing,nothing,nothing,nothing)
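
The fix above swaps the `WeakRef`-keyed priority queue (CuArrays hash slowly, like Arrays) for an `objectid`-keyed one, with a separate `WeakRef` side table so the queue never keeps an array alive. A self-contained sketch of that bookkeeping pattern; `remember!` and `release_from!` are illustrative stand-ins, not Knet functions:

```julia
# Sketch of the objectid-keyed queue + WeakRef side-table pattern used above.
using DataStructures: PriorityQueue, dequeue_pair!, peek

const queue = PriorityQueue{UInt,Int}(Base.Order.Reverse)  # highest index dequeued first
const refs  = Dict{UInt,WeakRef}()                         # objectid => weak reference

function remember!(a::Vector, idx::Int)
    id = objectid(a)                       # fast identity-based key
    get!(refs, id) do; WeakRef(a); end     # side table holds no strong reference
    if idx < get(queue, id, typemax(Int))  # keep the smallest index seen
        queue[id] = idx
    end
end

function release_from!(idx::Int)
    while !isempty(queue) && peek(queue)[2] >= idx
        id, _ = dequeue_pair!(queue)
        a = refs[id].value                 # nothing if already garbage-collected
        a === nothing || empty!(a)         # stand-in for CUDA.unsafe_free!
    end
end
```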



3 changes: 2 additions & 1 deletion src/autograd_gpu/gcnode_kptr.jl
@@ -1,4 +1,4 @@
-using Knet.KnetArrays: KnetPtr, KnetArray, freeKnetPtr
+using Knet.KnetArrays: KnetPtr, KnetArray, freeKnetPtr, cuallocator
 using AutoGrad: Result, Node, Tape

 # During the back pass we want to make pointers available as soon as we can to save memory
@@ -50,6 +50,7 @@ function knetgcinit(tape::Tape)  ## 2.35ms
 end

 function knetgcnode(n::Node, tape::Tape)  ## 16.3μs
+    # cuallocator[] && return gcnode(n,tape)  ## this works with both allocators
     global _tape, _index, _queue
     tape !== _tape.value && knetgcinit(tape)  ## 2μs amortized
     tape isa Tape || return
1 change: 1 addition & 0 deletions src/cuarrays/CuArrays.jl
@@ -8,5 +8,6 @@ using Knet.KnetArrays: checkbetween
 include("convert.jl")
 include("getindex.jl")
 include("reduction.jl")
+include("cubytes.jl"); export cuarrays, cubytes

 end
31 changes: 14 additions & 17 deletions src/autograd_gpu/cuarrays.jl → src/cuarrays/cubytes.jl
@@ -5,30 +5,30 @@ using CUDA: CuArray

 cuarrays(x, c=CuArray[], d=IdDict{Any,Bool}()) = (_cuarrays(x,c,d); c)

-_cuarrays(x::Tuple, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
-    for xi in x; _cuarrays(xi, c, d); end
+_cuarrays(x::CuArray, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
+    if !haskey(d,x); d[x] = true; push!(c,x); x.parent === nothing || _cuarrays(x.parent,c,d); end

 _cuarrays(x::Union{Module,String,Symbol,Core.MethodInstance,Method,GlobalRef,DataType,Union,UnionAll,Task,Regex},
           c::Vector{CuArray}, d::IdDict{Any,Bool}) = return

+_cuarrays(x::Tuple, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
+    for xi in x; _cuarrays(xi, c, d); end
+
 _cuarrays(x::Core.SimpleVector, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
     if !haskey(d,x); d[x] = true; for xi in x; _cuarrays(xi, c, d); end; end

-_cuarrays(x::Array, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
-    if !haskey(d,x); d[x] = true; _cuarrays_array_t(x, eltype(x), c, d); end
-
 _cuarrays(x::Union{Dict,IdDict}, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
     if !haskey(d,x); d[x] = true; for (k,v) in x; _cuarrays(k, c, d); _cuarrays(v, c, d); end; end

-function _cuarrays_array_t(@nospecialize(x), T, c::Vector{CuArray}, d::IdDict{Any,Bool})
-    if isbitstype(T)
-        return
-    end
-    for i = 1:(length(x)::Int)
-        if ccall(:jl_array_isassigned, Cint, (Any, Csize_t), x, i-1) != 0
-            xi = ccall(:jl_arrayref, Any, (Any, Csize_t), x, i-1)
-            if !isbits(xi)
-                _cuarrays(xi, c, d)
+function _cuarrays(x::Array{T}, c::Vector{CuArray}, d::IdDict{Any,Bool}) where T
+    if !isbitstype(T) && !haskey(d,x)
+        d[x] = true
+        for i = 1:(length(x)::Int)
+            if ccall(:jl_array_isassigned, Cint, (Any, Csize_t), x, i-1) != 0
+                xi = ccall(:jl_arrayref, Any, (Any, Csize_t), x, i-1)
+                if !isbits(xi)
+                    _cuarrays(xi, c, d)
+                end
             end
         end
     end
@@ -44,9 +44,6 @@ function _cuarrays(@nospecialize(x), c::Vector{CuArray}, d::IdDict{Any,Bool})
     if T.mutable
         d[x] = true
     end
-    if T === CuArray
-        push!(c, x)
-    end
     for i in 1:nf
         if isdefined(x,i)
             _cuarrays(getfield(x,i), c, d)
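For context, `cuarrays` (now exported from `Knet.CuArrays`) walks an arbitrary object graph (tuples, arrays, dicts, mutable structs) and returns each reachable `CuArray` once; the rewrite gives `CuArray` and `Array{T}` their own methods instead of special-casing them in the generic fallback. A hedged usage sketch, assuming a functional CUDA device:

```julia
# Usage sketch: collect every CuArray reachable from a nested value.
using CUDA
using Knet.CuArrays: cuarrays

w  = (CuArray(rand(Float32, 2, 2)), Dict(:b => CuArray(zeros(Float32, 2))))
cs = cuarrays(w)          # both arrays, each reported exactly once
@assert length(cs) == 2
```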
(The remaining 42 changed files are not shown.)

2 comments on commit b720020

@denizyuret (Owner, Author)

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/20443

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.4.1 -m "<description of version>" b720020869546ea7f46dae5444f50d5bae1bb76c
git push origin v1.4.1
