Merge pull request #609 from denizyuret/dy/data
Fix #606 gcnode issue
denizyuret committed Aug 28, 2020
2 parents 2754cd6 + 6c92456 commit b720020
Showing 51 changed files with 1,285 additions and 278 deletions.
6 changes: 3 additions & 3 deletions Artifacts.toml
@@ -1,11 +1,11 @@
 [[libknet8]]
 arch = "x86_64"
-git-tree-sha1 = "5e1e317677e88277f0ee67ab9e17587a8edc4f7a"
+git-tree-sha1 = "172aab1c490da4e9f26ed08c3b8e99cd86e8d7e7"
 os = "linux"

 [[libknet8.download]]
-sha256 = "2ef57e8bf25eb00597345bacc2ba2b4cb6182b3ace46dfbd6b6d2981e6998764"
-url = "https://github.com/denizyuret/Knet.jl/releases/download/v1.3.8/libknet8.x86_64-linux-gnu.tar.gz"
+sha256 = "84a33826b8dd45451852be25bdc84a070f9bc418b09ab093cce5682abb54377a"
+url = "https://github.com/denizyuret/Knet.jl/releases/download/v1.4.0/libknet8.x86_64-linux-gnu.tar.gz"

 [[libknet8]]
 arch = "x86_64"
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
 name = "Knet"
 uuid = "1902f260-5fb4-5aff-8c31-6271790ab950"
 authors = ["Deniz Yuret <[email protected]>"]
-version = "1.4.0"
+version = "1.4.1"

 [deps]
 AutoGrad = "6710c13c-97f1-543f-91c5-74e8f7d95b35"
34 changes: 8 additions & 26 deletions README.md
@@ -47,8 +47,8 @@ Here is a simple example where we define, train and test the
 using 15 lines of code and 10 seconds of GPU computation.

 ```julia
-# Install packages before first run: using Pkg; pkg"add Knet IterTools CodecZlib"
-using Knet, IterTools, CodecZlib
+# Install packages before first run: using Pkg; pkg"add Knet IterTools MLDatasets"
+using Knet, IterTools, MLDatasets

 # Define convolutional layer:
 struct Conv; w; b; end
@@ -66,13 +66,15 @@ struct Chain; layers; end
 (c::Chain)(x,y) = nll(c(x),y)

 # Load MNIST data:
-include(Knet.dir("data","mnist.jl"))
-dtrn, dtst = mnistdata()
+xtrn,ytrn = MNIST.traindata(Float32); ytrn[ytrn.==0] .= 10
+xtst,ytst = MNIST.testdata(Float32); ytst[ytst.==0] .= 10
+dtrn = minibatch(xtrn, ytrn, 100; xsize = (28,28,1,:))
+dtst = minibatch(xtst, ytst, 100; xsize = (28,28,1,:))

 # Define and train LeNet (~10 secs on a GPU or ~3 mins on a CPU to reach ~99% accuracy)
 LeNet = Chain((Conv(5,5,1,20), Conv(5,5,20,50), Dense(800,500,f=relu), Dense(500,10)))
 progress!(adam(LeNet, ncycle(dtrn,3)))
-accuracy(LeNet,dtst)
+accuracy(LeNet,data=dtst)
 ```

 ## Contributing
@@ -81,24 +83,4 @@ Knet is an open-source project and we are always open to new contributions: bug
 fixes, feature requests and contributions, new machine learning models and operators,
 inspiring examples, benchmarking results are all welcome. See [Tips for Developers](https://denizyuret.github.io/Knet.jl/latest/install/#Tips-for-developers) for instructions.

-Current contributors:
-
-* Can Gümeli
-* Carlo Lucibello
-* Ekin Akyürek
-* Ekrem Emre Yurdakul
-* Emre Ünal
-* Emre Yolcu
-* Enis Berk
-* Erenay Dayanık
-* İlker Kesen
-* Kai Xu
-* Meriç Melike Softa
-* Mike Innes
-* Onur Kuru
-* Ozan Arkan Can
-* Ömer Kırnap
-* Phuoc Nguyen
-* Rene Donner
-* Tim Besard
-* Zhang Shiwei
+Contributors: Can Gümeli, Carlo Lucibello, Ekin Akyürek, Ekrem Emre Yurdakul, Emre Ünal, Emre Yolcu, Enis Berk, Erenay Dayanık, İlker Kesen, Kai Xu, Meriç Melike Softa, Mike Innes, Onur Kuru, Ozan Arkan Can, Ömer Kırnap, Phuoc Nguyen, Rene Donner, Tim Besard, Zhang Shiwei.
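
A note on the new data-loading lines in the example above: Knet's `nll` and `accuracy` expect integer class labels in `1:C`, while `MLDatasets.MNIST` labels digits as `0:9`, so the zero digit is remapped to class 10; the `xsize=(28,28,1,:)` keyword reshapes each minibatch into the width x height x channels x batch layout the convolutional layers expect. A minimal sketch of the remapping step (assumes MLDatasets is installed):

```julia
# Sketch: remap MLDatasets' 0:9 digit labels to the 1:10 classes nll expects.
using MLDatasets: MNIST
xtrn, ytrn = MNIST.traindata(Float32)   # ytrn holds digits 0:9
ytrn[ytrn .== 0] .= 10                  # digit 0 becomes class 10
@assert extrema(ytrn) == (1, 10)
```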
25 changes: 19 additions & 6 deletions src/Knet.jl
@@ -1,28 +1,41 @@
 module Knet

-"Construct a path relative to Knet root, e.g. Knet.dir(\"examples\") => \"~/.julia/dev/Knet/examples\""
-dir(path...)=joinpath(dirname(@__DIR__),path...)
+"Default array and element type used by Knet, override by setting Knet.atype() or Knet.array_type[]"
+atype() = array_type[]
+atype(x) = convert(atype(),x)
+const array_type = Ref{Type}(Array{Float32})

 include("libknet8/LibKnet8.jl")
 include("knetarrays/KnetArrays.jl")
 include("cuarrays/CuArrays.jl")
 include("autograd_gpu/AutoGrad_gpu.jl")
 include("ops20/Ops20.jl")
 include("ops20_gpu/Ops20_gpu.jl")
 include("ops21/Ops21.jl")
 include("ops21_gpu/Ops21_gpu.jl")
 include("fileio_gpu/FileIO_gpu.jl")
 include("train20/Train20.jl")
+# include("layers21/Layers21.jl")

 # See if we have a gpu at initialization:
 import AutoGrad, CUDA
 function __init__()
     if CUDA.functional()
-        Knet.Train20.array_type[] = Knet.KnetArrays.KnetArray{Float32}
+        if isempty(Knet.LibKnet8.libknet8)
+            @warn "libknet8 library not found, some GPU functionality may not be available, try reinstalling Knet."
+        end
+        Knet.array_type[] = Knet.KnetArrays.KnetArray{Float32}
         AutoGrad.set_gc_function(Knet.KnetArrays.cuallocator[] ? Knet.AutoGrad_gpu.gcnode : Knet.AutoGrad_gpu.knetgcnode)
-        mem(d) = (CUDA.device!(d); m = CUDA.available_memory(); CUDA.device_reset!(); m)
-        CUDA.device!(argmax(Dict(d=>mem(d) for d in CUDA.devices())))
+        if CUDA.has_nvml() # Pick the device with highest memory
+            mem(d) = CUDA.NVML.memory_info(CUDA.NVML.Device(CUDA.uuid(d))).free
+            CUDA.device!(argmax(Dict(d=>mem(d) for d in CUDA.devices())))
+        end
     end
 end

+"Construct a path relative to Knet root, e.g. Knet.dir(\"examples\") => \"~/.julia/dev/Knet/examples\""
+dir(path...)=joinpath(dirname(@__DIR__),path...)
+
 # Match export list with v1.3.9 for backward compatibility
 using AutoGrad #: @diff, AutoGrad, Param, cat1d, grad, gradloss, params, value
 using Knet.LibKnet8 #: libknet8, @knet8, @knet8r, gpu
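The `src/Knet.jl` change above replaces the `Knet.Train20.array_type` global with a package-level `Knet.array_type` ref plus an `atype` accessor. A hedged sketch of how that API can be exercised; the `Array{Float64}` override is an illustrative choice, not part of the commit:

```julia
# Sketch of the default-array-type mechanism introduced above.
using Knet

x32 = Knet.atype(rand(4, 4))         # converts to array_type[], Array{Float32} on CPU
Knet.array_type[] = Array{Float64}   # hypothetical override of the default
x64 = Knet.atype(rand(4, 4))         # now converts to Array{Float64}
@assert eltype(x64) == Float64
```

On a functional GPU, `__init__` sets `array_type[]` to `KnetArray{Float32}` instead, so the same `atype` calls return device arrays.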
2 changes: 1 addition & 1 deletion src/autograd_gpu/AutoGrad_gpu.jl
@@ -7,12 +7,12 @@ import CUDA, Knet, AutoGrad
 using CUDA: CuArray, CuPtr, functional
 using Knet.KnetArrays: DevArray, KnetArray, Cptr, cuallocator
 using Knet.LibKnet8: @knet8
+using Knet.CuArrays: cuarrays
 using AutoGrad: AutoGrad, Sparse, recording, Result, Node, Tape, Value, Arg, value, set_gc_function
 using Base.Broadcast: Broadcasted

 include("addto.jl")
 include("convert.jl")
-include("cuarrays.jl")
 include("getindex.jl")
 include("sparse.jl")
78 changes: 50 additions & 28 deletions src/autograd_gpu/gcnode.jl
@@ -1,4 +1,7 @@
-using CUDA: CuArray
+using CUDA: CuArray, unsafe_free!
+# Most of the time is spent in cuarrays
+using Knet.CuArrays: cuarrays
+using Knet.KnetArrays: cuallocator
 using AutoGrad: Result, Node, Tape

 # During the back pass we want to make pointers available as soon as we can to save memory
@@ -15,65 +18,84 @@

 # The gcnode_queue maps CuArrays to the first index on tape they have a reference to. We use
 # Base.Order.Reverse because we want to free the pointers with highest indices first.
-# Using WeakRef to allow garbage collection.
-const gcnode_queue = PriorityQueue{WeakRef,Int}(Base.Order.Reverse)
+# Using ObjectId to allow fast hashing; CuArrays are hashed slowly, like Arrays.
+const gcnode_queue = PriorityQueue{UInt,Int}(Base.Order.Reverse)
+
+# To call unsafe_free! we need to find which CuArray belongs to an objectid.
+# Using WeakRef on CuArray values to allow garbage collection.
+const gcnode_dict = Dict{UInt,WeakRef}()
+
+# We use node indices on the tape as values in the priority queue.
+# Using ObjectId(::Node) for keys just in case.
+const gcnode_index = Dict{UInt,Int}()
+
+# Reset everything if the Tape changes:
+gcnode_tape = WeakRef(nothing)

 # During the backward step parents of a node (who have lower indices) may have their
 # outgrads modified, thus new CuArray references may appear. We want to keep the smallest
 # index for each CuArray.
-function gcnode_minidx!(q::PriorityQueue{WeakRef,Int,typeof(Base.Order.Reverse)},k::CuArray,v::Int)
-    if v < get(q,k,typemax(Int)); q[WeakRef(k)]=v; end  ## 0.190μs
+function gcnode_setindex!(c::CuArray,v::Int)
+    cid = objectid(c)
+    get!(gcnode_dict, cid) do; WeakRef(c); end
+    if v < get(gcnode_queue,cid,typemax(Int))
+        gcnode_queue[cid] = v
+    end
 end

-const gcnode_index = WeakKeyDict{Node,Int}()
-gcnode_tape = WeakRef(nothing)
-
-function gcnode_init(tape::Tape)  ## 2.35ms
-    global gcnode_tape, gcnode_index, gcnode_queue
-    gcnode_tape = WeakRef(tape)
+function gcnode_init(tape::Tape)
+    global gcnode_tape = WeakRef(tape)
     empty!(gcnode_index)
     empty!(gcnode_queue)
+    empty!(gcnode_dict)
     tape isa Tape || return
     @inbounds for (i,n) in enumerate(tape.list)
-        gcnode_index[n] = i
+        gcnode_index[objectid(n)] = i
         if n.Value isa Result
-            for k in cuarrays(n.Value.value); gcnode_queue[WeakRef(k)] = 0; end  # pointers with index 0 will never get gc'ed
-            for k in cuarrays(n.outgrad); get!(gcnode_queue,WeakRef(k),i); end   # this only sets gcnode_queue[k] if it does not have a value
+            for c in cuarrays(n.Value.value); gcnode_setindex!(c,0); end  # pointers with index 0 will never get gc'ed
+            for c in cuarrays(n.outgrad); gcnode_setindex!(c,i); end      # this only sets gcnode_queue[c] if it was not seen
         else # n.Value isa Param
-            for k in cuarrays(n.Value.value); gcnode_queue[WeakRef(k)] = 0; end
-            for k in cuarrays(n.outgrad); gcnode_queue[WeakRef(k)] = 0; end
+            for c in cuarrays(n.Value.value); gcnode_setindex!(c,0); end
+            for c in cuarrays(n.outgrad); gcnode_setindex!(c,0); end
         end
     end
 end

 function gcnode(n::Node, tape::Tape)  ## 16.3μs
-    global gcnode_tape, gcnode_index, gcnode_queue
-    tape !== gcnode_tape.value && gcnode_init(tape)  ## 2μs amortized
+    cuallocator[] || return knetgcnode(n,tape)
+    if tape !== gcnode_tape.value
+        gcnode_init(tape)
+    end
     tape isa Tape || return
-    ni = gcnode_index[n]
+    ni = gcnode_index[objectid(n)]
     if n.Value isa Result # && n.outgrad isa KnetArray
-        for ptr in cuarrays(n.outgrad); gcnode_minidx!(gcnode_queue, ptr, ni); end
+        for c in cuarrays(n.outgrad); gcnode_setindex!(c, ni); end
     end
     @inbounds for i in 1:length(n.parents);  ## 2.43μs
         isassigned(n.parents, i) || continue
         parent = n.parents[i]
         if parent.Value isa Result
-            pi = gcnode_index[parent]
-            for ptr in cuarrays(parent.outgrad); gcnode_minidx!(gcnode_queue, ptr, pi); end
+            pi = gcnode_index[objectid(parent)]
+            for c in cuarrays(parent.outgrad); gcnode_setindex!(c, pi); end
         else
-            for ptr in cuarrays(parent.outgrad); gcnode_queue[WeakRef(ptr)] = 0; end  # protect Params
+            for c in cuarrays(parent.outgrad); gcnode_setindex!(c,0); end  # protect Params
         end
     end
     while !isempty(gcnode_queue) && peek(gcnode_queue)[2] >= ni  ## 5.62μs
-        (k,v) = dequeue_pair!(gcnode_queue)  ## 0.787μs
-        k = k.value
-        if v != ni; @warn("k=$((k.ptr,k.len)) v=$v ni=$ni", maxlog=1); end  ## 0.160μs
-        #DBG verifypointer(tape, ni, k)
-        unsafe_free!(k)  ## 4.06μs
+        (cid,v) = dequeue_pair!(gcnode_queue)  ## 0.787μs
+        c = gcnode_dict[cid].value
+        if v == ni
+            unsafe_free!(c)  ## 4.06μs
+        else
+            @warn("gcnode error: c=$(summary(c)) v=$v ni=$ni", maxlog=1)  ## 0.160μs
+        end
     end
     if n.Value isa Result
        n.Value, n.outgrad = gcnode_null, nothing
     end
 end

 const gcnode_null = Result{Nothing}(nothing,nothing,nothing,nothing)
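
The fix above swaps the `WeakRef`-keyed priority queue (CuArrays hash slowly, like Arrays) for an `objectid`-keyed one, with a separate `WeakRef` side table so the queue never keeps an array alive. A self-contained sketch of that bookkeeping pattern; `remember!` and `release_from!` are illustrative stand-ins, not Knet functions:

```julia
# Sketch of the objectid-keyed queue + WeakRef side-table pattern used above.
using DataStructures: PriorityQueue, dequeue_pair!, peek

const queue = PriorityQueue{UInt,Int}(Base.Order.Reverse)  # highest index dequeued first
const refs  = Dict{UInt,WeakRef}()                         # objectid => weak reference

function remember!(a::Vector, idx::Int)
    id = objectid(a)                       # fast identity-based key
    get!(refs, id) do; WeakRef(a); end     # side table holds no strong reference
    if idx < get(queue, id, typemax(Int))  # keep the smallest index seen
        queue[id] = idx
    end
end

function release_from!(idx::Int)
    while !isempty(queue) && peek(queue)[2] >= idx
        id, _ = dequeue_pair!(queue)
        a = refs[id].value                 # nothing if already garbage-collected
        a === nothing || empty!(a)         # stand-in for CUDA.unsafe_free!
    end
end
```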



3 changes: 2 additions & 1 deletion src/autograd_gpu/gcnode_kptr.jl
@@ -1,4 +1,4 @@
-using Knet.KnetArrays: KnetPtr, KnetArray, freeKnetPtr
+using Knet.KnetArrays: KnetPtr, KnetArray, freeKnetPtr, cuallocator
 using AutoGrad: Result, Node, Tape

 # During the back pass we want to make pointers available as soon as we can to save memory
@@ -50,6 +50,7 @@ function knetgcinit(tape::Tape)  ## 2.35ms
 end

 function knetgcnode(n::Node, tape::Tape)  ## 16.3μs
+    # cuallocator[] && return gcnode(n,tape)  ## this works with both allocators
     global _tape, _index, _queue
     tape !== _tape.value && knetgcinit(tape)  ## 2μs amortized
     tape isa Tape || return
1 change: 1 addition & 0 deletions src/cuarrays/CuArrays.jl
@@ -8,5 +8,6 @@ using Knet.KnetArrays: checkbetween
 include("convert.jl")
 include("getindex.jl")
 include("reduction.jl")
+include("cubytes.jl"); export cuarrays, cubytes

 end
31 changes: 14 additions & 17 deletions src/autograd_gpu/cuarrays.jl → src/cuarrays/cubytes.jl
@@ -5,30 +5,30 @@ using CUDA: CuArray

 cuarrays(x, c=CuArray[], d=IdDict{Any,Bool}()) = (_cuarrays(x,c,d); c)

-_cuarrays(x::Tuple, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
-    for xi in x; _cuarrays(xi, c, d); end
+_cuarrays(x::CuArray, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
+    if !haskey(d,x); d[x] = true; push!(c,x); x.parent === nothing || _cuarrays(x.parent,c,d); end

 _cuarrays(x::Union{Module,String,Symbol,Core.MethodInstance,Method,GlobalRef,DataType,Union,UnionAll,Task,Regex},
           c::Vector{CuArray}, d::IdDict{Any,Bool}) = return

+_cuarrays(x::Tuple, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
+    for xi in x; _cuarrays(xi, c, d); end
+
 _cuarrays(x::Core.SimpleVector, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
     if !haskey(d,x); d[x] = true; for xi in x; _cuarrays(xi, c, d); end; end

-_cuarrays(x::Array, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
-    if !haskey(d,x); d[x] = true; _cuarrays_array_t(x, eltype(x), c, d); end
-
 _cuarrays(x::Union{Dict,IdDict}, c::Vector{CuArray}, d::IdDict{Any,Bool}) =
     if !haskey(d,x); d[x] = true; for (k,v) in x; _cuarrays(k, c, d); _cuarrays(v, c, d); end; end

-function _cuarrays_array_t(@nospecialize(x), T, c::Vector{CuArray}, d::IdDict{Any,Bool})
-    if isbitstype(T)
-        return
-    end
-    for i = 1:(length(x)::Int)
-        if ccall(:jl_array_isassigned, Cint, (Any, Csize_t), x, i-1) != 0
-            xi = ccall(:jl_arrayref, Any, (Any, Csize_t), x, i-1)
-            if !isbits(xi)
-                _cuarrays(xi, c, d)
+function _cuarrays(x::Array{T}, c::Vector{CuArray}, d::IdDict{Any,Bool}) where T
+    if !isbitstype(T) && !haskey(d,x)
+        d[x] = true
+        for i = 1:(length(x)::Int)
+            if ccall(:jl_array_isassigned, Cint, (Any, Csize_t), x, i-1) != 0
+                xi = ccall(:jl_arrayref, Any, (Any, Csize_t), x, i-1)
+                if !isbits(xi)
+                    _cuarrays(xi, c, d)
+                end
             end
         end
     end
@@ -44,9 +44,6 @@ function _cuarrays(@nospecialize(x), c::Vector{CuArray}, d::IdDict{Any,Bool})
     if T.mutable
         d[x] = true
     end
-    if T === CuArray
-        push!(c, x)
-    end
     for i in 1:nf
         if isdefined(x,i)
             _cuarrays(getfield(x,i), c, d)
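For context, `cuarrays` (now exported from `Knet.CuArrays`) walks an arbitrary object graph (tuples, arrays, dicts, mutable structs) and returns each reachable `CuArray` once; the rewrite gives `CuArray` and `Array{T}` their own methods instead of special-casing them in the generic fallback. A hedged usage sketch, assuming a functional CUDA device:

```julia
# Usage sketch: collect every CuArray reachable from a nested value.
using CUDA
using Knet.CuArrays: cuarrays

w  = (CuArray(rand(Float32, 2, 2)), Dict(:b => CuArray(zeros(Float32, 2))))
cs = cuarrays(w)          # both arrays, each reported exactly once
@assert length(cs) == 2
```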
(The remaining 42 changed files are not shown.)

2 comments on commit b720020

@denizyuret (Owner, Author)

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/20443

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.4.1 -m "<description of version>" b720020869546ea7f46dae5444f50d5bae1bb76c
git push origin v1.4.1
