From 349d99f3a305daffe2ea518a48392c480cdeb637 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 09:54:07 +0200 Subject: [PATCH 01/27] feat: init neighbor loader --- GraphNeuralNetworks/src/samplers.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 GraphNeuralNetworks/src/samplers.jl diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl new file mode 100644 index 000000000..e69de29bb From 07db4d0e7ce3a6ccd4860d8cf5f2380820ef3742 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 09:55:14 +0200 Subject: [PATCH 02/27] feat: init neighbor loader --- src/samplers.jl | 95 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 src/samplers.jl diff --git a/src/samplers.jl b/src/samplers.jl new file mode 100644 index 000000000..03aea69ef --- /dev/null +++ b/src/samplers.jl @@ -0,0 +1,95 @@ +# Import necessary packages +using GraphNeuralNetworks + +# Define a graph structure (using GraphNeuralNetworks.jl) +struct GNNGraph + graph::Graph # Graph structure from GraphNeuralNetworks.jl + features::Matrix # Feature matrix: rows represent nodes, columns are features +end + +# Define a NeighborLoader structure for sampling neighbors +struct NeighborLoader + graph::GNNGraph # The input GNNGraph (graph + features) + num_neighbors::Int # Number of neighbors to sample per node + batch_size::Int # Number of nodes in each mini-batch + num_layers::Int # Number of layers for neighborhood expansion +end + +# Function to sample neighbors for a given node +function sample_neighbors(loader::NeighborLoader, node::Int) + neighbors = neighbors(loader.graph.graph, node) # Get all neighbors of the node from the graph + num_samples = min(loader.num_neighbors, length(neighbors)) # Choose min between neighbors and required sample size + sampled_neighbors = rand(neighbors, num_samples) # Randomly sample the neighbors + return sampled_neighbors +end + +# Function to create a mini-batch of nodes and their neighbors +function create_mini_batch(loader::NeighborLoader) + # Randomly select batch_size nodes + batch_nodes = rand(1:nv(loader.graph.graph), loader.batch_size) + + # Initialize storage for neighbors and features + batch_neighbors = Dict{Int, Vector{Int}}() # Store sampled neighbors + batch_features = Dict{Int, Vector{Float64}}() # Store node features + + for node in batch_nodes + # Initialize current layer of nodes (starting with the node itself) + sampled_neighbors = [node] + + # For each GNN layer, sample the neighborhood + for layer in 1:loader.num_layers + new_neighbors = [] + for n in sampled_neighbors + neighbors = sample_neighbors(loader, n) # Sample neighbors of current node + append!(new_neighbors, neighbors) + end + sampled_neighbors = unique(new_neighbors) # Update sampled neighbors for next layer + end + + # Store neighbors and features of the node + batch_neighbors[node] = sampled_neighbors + batch_features[node] = loader.graph.features[:, node] # Assuming column-wise features for each node + end + + return batch_nodes, batch_neighbors, batch_features +end + +# Function for training the model with the NeighborLoader +function train_model(graph::GNNGraph, num_neighbors::Int, batch_size::Int, num_layers::Int, num_batches::Int) + # Initialize the NeighborLoader + loader = NeighborLoader(graph, num_neighbors, batch_size, num_layers) + + # Loop through the number of batches for training + for batch in 1:num_batches + batch_nodes, batch_neighbors, batch_features = 
create_mini_batch(loader) + println("Batch $batch: Nodes: $batch_nodes, Neighbors: $batch_neighbors") + + # Here, you would pass batch data to the GNN model for training + # For example: model(batch_nodes, batch_neighbors, batch_features) + end +end + +# Example of creating a GNN graph and training +function main() + # Sample Graph structure from GraphNeuralNetworks.jl + # Create a small graph with 5 nodes and example edges + graph = Graph(5) + add_edge!(graph, 1, 2) + add_edge!(graph, 1, 3) + add_edge!(graph, 2, 4) + add_edge!(graph, 3, 5) + + # Create random features for the nodes (5 nodes, 3 features per node) + features = rand(3, 5) + + # Create GNNGraph + gnn_graph = GNNGraph(graph, features) + + # Train model using NeighborLoader + train_model(gnn_graph, num_neighbors=2, batch_size=2, num_layers=2, num_batches=3) +end + +# Run the example +main() + +## iterator \ No newline at end of file From 25945c7f7b1d7d159573377a6d4e5e1c58609eb6 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 21:14:57 +0200 Subject: [PATCH 03/27] feat: refine neighborloader --- src/samplers.jl | 157 +++++++++++++++++++++++++++--------------------- 1 file changed, 89 insertions(+), 68 deletions(-) diff --git a/src/samplers.jl b/src/samplers.jl index 03aea69ef..40cdf05f7 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -1,95 +1,116 @@ -# Import necessary packages using GraphNeuralNetworks -# Define a graph structure (using GraphNeuralNetworks.jl) -struct GNNGraph - graph::Graph # Graph structure from GraphNeuralNetworks.jl - features::Matrix # Feature matrix: rows represent nodes, columns are features -end - # Define a NeighborLoader structure for sampling neighbors struct NeighborLoader - graph::GNNGraph # The input GNNGraph (graph + features) - num_neighbors::Int # Number of neighbors to sample per node - batch_size::Int # Number of nodes in each mini-batch - num_layers::Int # Number of layers for neighborhood expansion + graph::GNNGraph # The input GNNGraph (graph + features from GraphNeuralNetworks.jl) + num_neighbors::Vector{Int} # Number of neighbors to sample per node, for each layer + input_nodes::Vector{Int} # Set of input nodes (starting nodes for sampling) + num_layers::Int # Number of layers for neighborhood expansion + batch_size::Union{Int, Nothing} # Optional batch size, defaults to the length of input_nodes if not given + neighbors_cache::Dict{Int, Vector{Int}} # Cache neighbors to avoid recomputation +end + +# Constructor for NeighborLoader with optional batch size +function NeighborLoader(graph::GNNGraph, num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing} = nothing) + return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? 
length(input_nodes) : batch_size, Dict{Int, Vector{Int}}()) +end + +# Function to get cached neighbors or compute them +function get_neighbors(loader::NeighborLoader, node::Int) + if haskey(loader.neighbors_cache, node) + return loader.neighbors_cache[node] + else + neighbors = neighbors(loader.graph, node) # Get neighbors from graph + loader.neighbors_cache[node] = neighbors + return neighbors + end end -# Function to sample neighbors for a given node -function sample_neighbors(loader::NeighborLoader, node::Int) - neighbors = neighbors(loader.graph.graph, node) # Get all neighbors of the node from the graph - num_samples = min(loader.num_neighbors, length(neighbors)) # Choose min between neighbors and required sample size - sampled_neighbors = rand(neighbors, num_samples) # Randomly sample the neighbors - return sampled_neighbors +# Function to sample neighbors for a given node at a specific layer +function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) + neighbors = get_neighbors(loader, node) + num_samples = min(loader.num_neighbors[layer], length(neighbors)) # Limit to required samples for this layer + return rand(neighbors, num_samples) # Randomly sample neighbors end -# Function to create a mini-batch of nodes and their neighbors -function create_mini_batch(loader::NeighborLoader) - # Randomly select batch_size nodes - batch_nodes = rand(1:nv(loader.graph.graph), loader.batch_size) +# Iterator protocol for NeighborLoader with lazy batch loading +function Base.iterate(loader::NeighborLoader, state=1) + if state > length(loader.input_nodes) + return nothing # End of iteration + end - # Initialize storage for neighbors and features - batch_neighbors = Dict{Int, Vector{Int}}() # Store sampled neighbors - batch_features = Dict{Int, Vector{Float64}}() # Store node features + # Determine the size of the current batch + batch_size = min(loader.batch_size, length(loader.input_nodes) - state + 1) + batch_nodes = loader.input_nodes[state:state + batch_size - 1] + + # Set for tracking the subgraph nodes + subgraph_nodes = Set(batch_nodes) + + # Preallocate memory for batch_features + batch_features = loader.graph.x[:, batch_nodes] # Initially just the features of the batch_nodes for node in batch_nodes # Initialize current layer of nodes (starting with the node itself) - sampled_neighbors = [node] + sampled_neighbors = Set([node]) # For each GNN layer, sample the neighborhood for layer in 1:loader.num_layers - new_neighbors = [] + new_neighbors = Set{Int}() for n in sampled_neighbors - neighbors = sample_neighbors(loader, n) # Sample neighbors of current node - append!(new_neighbors, neighbors) + neighbors = sample_neighbors(loader, n, layer) # Sample neighbors of the node for this layer + new_neighbors = union(new_neighbors, neighbors) # Avoid duplicates in the neighbor set end - sampled_neighbors = unique(new_neighbors) # Update sampled neighbors for next layer + sampled_neighbors = new_neighbors + subgraph_nodes = union(subgraph_nodes, sampled_neighbors) # Expand the subgraph with the new neighbors end - - # Store neighbors and features of the node - batch_neighbors[node] = sampled_neighbors - batch_features[node] = loader.graph.features[:, node] # Assuming column-wise features for each node end - return batch_nodes, batch_neighbors, batch_features -end + # Collect subgraph nodes and their features + subgraph_node_list = collect(subgraph_nodes) + subgraph = induced_subgraph(loader.graph, subgraph_node_list) # More efficient subgraph creation + subgraph_features = 
loader.graph.x[:, subgraph_node_list] -# Function for training the model with the NeighborLoader -function train_model(graph::GNNGraph, num_neighbors::Int, batch_size::Int, num_layers::Int, num_batches::Int) - # Initialize the NeighborLoader - loader = NeighborLoader(graph, num_neighbors, batch_size, num_layers) + # Return a GNNGraph instance for this mini-batch + mini_batch_gnn = GNNGraph(subgraph, subgraph_features) - # Loop through the number of batches for training - for batch in 1:num_batches - batch_nodes, batch_neighbors, batch_features = create_mini_batch(loader) - println("Batch $batch: Nodes: $batch_nodes, Neighbors: $batch_neighbors") - - # Here, you would pass batch data to the GNN model for training - # For example: model(batch_nodes, batch_neighbors, batch_features) - end + # Continue iteration for the next batch + return mini_batch_gnn, state + batch_size end # Example of creating a GNN graph and training -function main() - # Sample Graph structure from GraphNeuralNetworks.jl - # Create a small graph with 5 nodes and example edges - graph = Graph(5) - add_edge!(graph, 1, 2) - add_edge!(graph, 1, 3) - add_edge!(graph, 2, 4) - add_edge!(graph, 3, 5) - - # Create random features for the nodes (5 nodes, 3 features per node) - features = rand(3, 5) - - # Create GNNGraph - gnn_graph = GNNGraph(graph, features) - - # Train model using NeighborLoader - train_model(gnn_graph, num_neighbors=2, batch_size=2, num_layers=2, num_batches=3) -end +# Sample Graph structure from GraphNeuralNetworks.jl +# Create a small graph with 5 nodes and example edges +graph = Graph(5) +add_edge!(graph, 1, 2) +add_edge!(graph, 1, 3) +add_edge!(graph, 2, 4) +add_edge!(graph, 3, 5) -# Run the example -main() +# Create random features for the nodes (5 nodes, 3 features per node) +features = rand(3, 5) -## iterator \ No newline at end of file +# Create GNNGraph (use GNN's GNNGraph) +gnn_graph = GNNGraph(graph, features) + +# Define input nodes (seed nodes) to start sampling +input_nodes = [1, 2, 3, 4, 5] + +# Run the example +# Initialize the NeighborLoader with optional batch_size +loader = NeighborLoader(gnn_graph, [2, 3], input_nodes, 2, 3) # Train without specifying batch_size (defaults to input_nodes size) +loader = NeighborLoader(gnn_graph, [2, 3], input_nodes, 2, 3, 2) # Train with batch_size = 2 + +# Loop through the number of batches for training, using the iterator +batch_counter = 0 +for mini_batch_gnn in loader + batch_counter += 1 + println("Batch $batch_counter: Nodes in mini-batch graph: $(nv(mini_batch_gnn))") + + # Here, you would pass mini_batch_gnn to the GNN model for training + # For example: model(mini_batch_gnn) + + # Stop if num_batches is reached + if batch_counter >= num_batches + break + end +end \ No newline at end of file From 506d4c77e944687185386b424bd72384bde26fbb Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 22:05:27 +0200 Subject: [PATCH 04/27] fix: refine neighborloader --- src/samplers.jl | 99 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 33 deletions(-) diff --git a/src/samplers.jl b/src/samplers.jl index 40cdf05f7..70b875264 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -7,12 +7,13 @@ struct NeighborLoader input_nodes::Vector{Int} # Set of input nodes (starting nodes for sampling) num_layers::Int # Number of layers for neighborhood expansion batch_size::Union{Int, Nothing} # Optional batch size, defaults to the length of input_nodes if not given + num_batches::Int # Number of batches to process 
neighbors_cache::Dict{Int, Vector{Int}} # Cache neighbors to avoid recomputation end # Constructor for NeighborLoader with optional batch size -function NeighborLoader(graph::GNNGraph, num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing} = nothing) - return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? length(input_nodes) : batch_size, Dict{Int, Vector{Int}}()) +function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing, num_batches::Int) + return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? length(input_nodes) : batch_size, num_batches, Dict{Int, Vector{Int}}()) end # Function to get cached neighbors or compute them @@ -20,7 +21,9 @@ function get_neighbors(loader::NeighborLoader, node::Int) if haskey(loader.neighbors_cache, node) return loader.neighbors_cache[node] else - neighbors = neighbors(loader.graph, node) # Get neighbors from graph + println(loader.graph) + println("node: ", node) + neighbors = Graph.neighbors(loader.graph, node) # Get neighbors from graph loader.neighbors_cache[node] = neighbors return neighbors end @@ -33,10 +36,32 @@ function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) return rand(neighbors, num_samples) # Randomly sample neighbors end +# Helper function to create a subgraph from selected nodes +function create_subgraph(graph::GNNGraph, nodes::Vector{Int}) + node_set = Set(nodes) # Use a set for quick look-up + + # Collect edges to add + source = Int[] + target = Int[] + for node in nodes + for neighbor in neighbors(graph, node) + if neighbor in node_set + push!(source, node) + push!(target, neighbor) + end + end + end + + # Extract features for the new nodes + new_features = graph.x[:, nodes] + + return GNNGraph(source, target, ndata = new_features) # Return the new GNNGraph with subgraph and features +end + # Iterator protocol for NeighborLoader with lazy batch loading function Base.iterate(loader::NeighborLoader, state=1) - if state > length(loader.input_nodes) - return nothing # End of iteration + if state > length(loader.input_nodes) || (state - 1) // loader.batch_size >= loader.num_batches + return nothing # End of iteration if batches are exhausted end # Determine the size of the current batch @@ -45,14 +70,11 @@ function Base.iterate(loader::NeighborLoader, state=1) # Set for tracking the subgraph nodes subgraph_nodes = Set(batch_nodes) - - # Preallocate memory for batch_features - batch_features = loader.graph.x[:, batch_nodes] # Initially just the features of the batch_nodes for node in batch_nodes # Initialize current layer of nodes (starting with the node itself) sampled_neighbors = Set([node]) - + # For each GNN layer, sample the neighborhood for layer in 1:loader.num_layers new_neighbors = Set{Int}() @@ -67,50 +89,61 @@ function Base.iterate(loader::NeighborLoader, state=1) # Collect subgraph nodes and their features subgraph_node_list = collect(subgraph_nodes) - subgraph = induced_subgraph(loader.graph, subgraph_node_list) # More efficient subgraph creation - subgraph_features = loader.graph.x[:, subgraph_node_list] - - # Return a GNNGraph instance for this mini-batch - mini_batch_gnn = GNNGraph(subgraph, subgraph_features) + mini_batch_gnn = create_subgraph(loader.graph, subgraph_node_list) # Create a subgraph of the nodes # Continue iteration for the next batch return mini_batch_gnn, state + batch_size 
end -# Example of creating a GNN graph and training -# Sample Graph structure from GraphNeuralNetworks.jl -# Create a small graph with 5 nodes and example edges -graph = Graph(5) -add_edge!(graph, 1, 2) -add_edge!(graph, 1, 3) -add_edge!(graph, 2, 4) -add_edge!(graph, 3, 5) - -# Create random features for the nodes (5 nodes, 3 features per node) +# Example +using Graphs +# Example graph +lg = erdos_renyi(5, 0.4) features = rand(3, 5) - -# Create GNNGraph (use GNN's GNNGraph) -gnn_graph = GNNGraph(graph, features) +gnn_graph = GNNGraph(lg, ndata = features) # Define input nodes (seed nodes) to start sampling input_nodes = [1, 2, 3, 4, 5] -# Run the example # Initialize the NeighborLoader with optional batch_size -loader = NeighborLoader(gnn_graph, [2, 3], input_nodes, 2, 3) # Train without specifying batch_size (defaults to input_nodes size) -loader = NeighborLoader(gnn_graph, [2, 3], input_nodes, 2, 3, 2) # Train with batch_size = 2 +loader = NeighborLoader(gnn_graph; num_neighbors = [2, 3], input_nodes=input_nodes, num_layers = 2, batch_size = 3, num_batches = 3) # Loop through the number of batches for training, using the iterator batch_counter = 0 for mini_batch_gnn in loader batch_counter += 1 println("Batch $batch_counter: Nodes in mini-batch graph: $(nv(mini_batch_gnn))") - + # Here, you would pass mini_batch_gnn to the GNN model for training # For example: model(mini_batch_gnn) # Stop if num_batches is reached - if batch_counter >= num_batches + if batch_counter >= loader.num_batches break end -end \ No newline at end of file +end + +using GraphNeuralNetworks, Graphs, SparseArrays + + +# Construct a GNNGraph from from a Graphs.jl's graph +lg = erdos_renyi(10, 30) +g = GNNGraph(lg) + +# Same as above using convenience method rand_graph +g = rand_graph(10, 60) + +# From an adjacency matrix +A = sprand(10, 10, 0.3) +g = GNNGraph(A) + +# From an adjacency list +adjlist = [[2,3], [1,3], [1,2,4], [3]] +g = GNNGraph(adjlist) + +# From COO representation +source = [1,1,2,2,3,3,3,4] +target = [2,3,1,3,1,2,4,3] +g = GNNGraph(source, target) + +Graph.neighbors(g, 1) \ No newline at end of file From 991bf612228054aacbd2ed1ba42ead86eef912e8 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 22:13:17 +0200 Subject: [PATCH 05/27] fix: refine neighborloader --- src/samplers.jl | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/src/samplers.jl b/src/samplers.jl index 70b875264..73ba334d3 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -23,7 +23,8 @@ function get_neighbors(loader::NeighborLoader, node::Int) else println(loader.graph) println("node: ", node) - neighbors = Graph.neighbors(loader.graph, node) # Get neighbors from graph + neighbors = Graphs.neighbors(loader.graph, node) # Get neighbors from graph + println("neighbors", neighbors) loader.neighbors_cache[node] = neighbors return neighbors end @@ -31,7 +32,10 @@ end # Function to sample neighbors for a given node at a specific layer function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) + println(loader) + println("node: ", node) neighbors = get_neighbors(loader, node) + println("neigh: ", neighbors) num_samples = min(loader.num_neighbors[layer], length(neighbors)) # Limit to required samples for this layer return rand(neighbors, num_samples) # Randomly sample neighbors end @@ -122,28 +126,3 @@ for mini_batch_gnn in loader break end end - -using GraphNeuralNetworks, Graphs, SparseArrays - - -# Construct a GNNGraph from from a Graphs.jl's graph -lg = 
erdos_renyi(10, 30) -g = GNNGraph(lg) - -# Same as above using convenience method rand_graph -g = rand_graph(10, 60) - -# From an adjacency matrix -A = sprand(10, 10, 0.3) -g = GNNGraph(A) - -# From an adjacency list -adjlist = [[2,3], [1,3], [1,2,4], [3]] -g = GNNGraph(adjlist) - -# From COO representation -source = [1,1,2,2,3,3,3,4] -target = [2,3,1,3,1,2,4,3] -g = GNNGraph(source, target) - -Graph.neighbors(g, 1) \ No newline at end of file From c25bc1e93b8a333890419bbbe6d88cb1647fd0fd Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 22:32:53 +0200 Subject: [PATCH 06/27] fix: refine neighborloader --- src/samplers.jl | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/samplers.jl b/src/samplers.jl index 73ba334d3..71cfa790c 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -1,4 +1,5 @@ using GraphNeuralNetworks +using Graphs # Define a NeighborLoader structure for sampling neighbors struct NeighborLoader @@ -21,10 +22,7 @@ function get_neighbors(loader::NeighborLoader, node::Int) if haskey(loader.neighbors_cache, node) return loader.neighbors_cache[node] else - println(loader.graph) - println("node: ", node) neighbors = Graphs.neighbors(loader.graph, node) # Get neighbors from graph - println("neighbors", neighbors) loader.neighbors_cache[node] = neighbors return neighbors end @@ -32,12 +30,13 @@ end # Function to sample neighbors for a given node at a specific layer function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) - println(loader) - println("node: ", node) neighbors = get_neighbors(loader, node) - println("neigh: ", neighbors) - num_samples = min(loader.num_neighbors[layer], length(neighbors)) # Limit to required samples for this layer - return rand(neighbors, num_samples) # Randomly sample neighbors + if isempty(neighbors) + return Int[] + else + num_samples = min(loader.num_neighbors[layer], length(neighbors)) # Limit to required samples for this layer + return rand(neighbors, num_samples) # Randomly sample neighbors + end end # Helper function to create a subgraph from selected nodes @@ -47,8 +46,9 @@ function create_subgraph(graph::GNNGraph, nodes::Vector{Int}) # Collect edges to add source = Int[] target = Int[] + println("nodes: ", nodes) for node in nodes - for neighbor in neighbors(graph, node) + for neighbor in Graphs.neighbors(graph, node, dir = :out) if neighbor in node_set push!(source, node) push!(target, neighbor) @@ -100,11 +100,10 @@ function Base.iterate(loader::NeighborLoader, state=1) end # Example -using Graphs -# Example graph -lg = erdos_renyi(5, 0.4) +source = [1,1,2,2,3,3,3,4,5] +target = [2,3,1,3,1,2,4,3,5] features = rand(3, 5) -gnn_graph = GNNGraph(lg, ndata = features) +gnn_graph = GNNGraph(source, target, ndata = features) # Define input nodes (seed nodes) to start sampling input_nodes = [1, 2, 3, 4, 5] @@ -126,3 +125,5 @@ for mini_batch_gnn in loader break end end + +### TODO: indexes recoding, otherwirse sometimes dimension mismatch with feature matrix \ No newline at end of file From fde10bb4423339b6857989cb808fa40fcbfc322a Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 22:59:36 +0200 Subject: [PATCH 07/27] fix: refine neighborloader --- src/samplers.jl | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/samplers.jl b/src/samplers.jl index 71cfa790c..de96b4f35 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -22,7 +22,7 @@ function get_neighbors(loader::NeighborLoader, node::Int) if 
haskey(loader.neighbors_cache, node) return loader.neighbors_cache[node] else - neighbors = Graphs.neighbors(loader.graph, node) # Get neighbors from graph + neighbors = Graphs.neighbors(loader.graph, node, dir = :out) # Get neighbors from graph loader.neighbors_cache[node] = neighbors return neighbors end @@ -41,17 +41,16 @@ end # Helper function to create a subgraph from selected nodes function create_subgraph(graph::GNNGraph, nodes::Vector{Int}) - node_set = Set(nodes) # Use a set for quick look-up + node_map = Dict(node => i for (i, node) in enumerate(nodes)) # Collect edges to add source = Int[] target = Int[] - println("nodes: ", nodes) for node in nodes for neighbor in Graphs.neighbors(graph, node, dir = :out) - if neighbor in node_set - push!(source, node) - push!(target, neighbor) + if neighbor in keys(node_map) + push!(source, node_map[node]) + push!(target, node_map[neighbor]) end end end @@ -125,5 +124,3 @@ for mini_batch_gnn in loader break end end - -### TODO: indexes recoding, otherwirse sometimes dimension mismatch with feature matrix \ No newline at end of file From 3656691eb48c2919c6776ec94bde13b8300643f5 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Wed, 11 Sep 2024 23:01:54 +0200 Subject: [PATCH 08/27] fix: refine neighborloader --- src/samplers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/samplers.jl b/src/samplers.jl index de96b4f35..2a7211dcc 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -114,7 +114,7 @@ loader = NeighborLoader(gnn_graph; num_neighbors = [2, 3], input_nodes=input_nod batch_counter = 0 for mini_batch_gnn in loader batch_counter += 1 - println("Batch $batch_counter: Nodes in mini-batch graph: $(nv(mini_batch_gnn))") + println("Batch $batch_counter: Nodes in mini-batch graph: $(nv(mini_batch_gnn)), $mini_batch_gnn") # Here, you would pass mini_batch_gnn to the GNN model for training # For example: model(mini_batch_gnn) From 9997fabe77fc61d712d3155a2162a60ccf28b4d2 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Thu, 12 Sep 2024 23:21:51 +0200 Subject: [PATCH 09/27] chore: add some comments --- src/samplers.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/samplers.jl b/src/samplers.jl index 2a7211dcc..a17df98f1 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -64,12 +64,12 @@ end # Iterator protocol for NeighborLoader with lazy batch loading function Base.iterate(loader::NeighborLoader, state=1) if state > length(loader.input_nodes) || (state - 1) // loader.batch_size >= loader.num_batches - return nothing # End of iteration if batches are exhausted + return nothing # End of iteration if batches are exhausted (state larger than amount of input nodes or current batch no >= batch number) end # Determine the size of the current batch - batch_size = min(loader.batch_size, length(loader.input_nodes) - state + 1) - batch_nodes = loader.input_nodes[state:state + batch_size - 1] + batch_size = min(loader.batch_size, length(loader.input_nodes) - state + 1) # Conditional in case there is not enough nodes to fill the last batch + batch_nodes = loader.input_nodes[state:state + batch_size - 1] # Each mini-batch uses different set of input nodes # Set for tracking the subgraph nodes subgraph_nodes = Set(batch_nodes) From acf209cd648d8b3c7a73e9285d1960e7a866bee2 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Tue, 17 Sep 2024 09:53:40 +0200 Subject: [PATCH 10/27] chore: add TODO comments --- src/samplers.jl | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff 
--git a/src/samplers.jl b/src/samplers.jl index a17df98f1..6f5def815 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -22,7 +22,7 @@ function get_neighbors(loader::NeighborLoader, node::Int) if haskey(loader.neighbors_cache, node) return loader.neighbors_cache[node] else - neighbors = Graphs.neighbors(loader.graph, node, dir = :out) # Get neighbors from graph + neighbors = Graphs.neighbors(loader.graph, node, dir = :in) # Get neighbors from graph loader.neighbors_cache[node] = neighbors return neighbors end @@ -39,7 +39,10 @@ function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) end end -# Helper function to create a subgraph from selected nodes +### TODO check subsample function +### TODO write subsample function for universal use +### TODO factor out to separate PR +# define a method function Graphs.induced_subgraph(g::GNNGraph, nodes) (rename) function create_subgraph(graph::GNNGraph, nodes::Vector{Int}) node_map = Dict(node => i for (i, node) in enumerate(nodes)) @@ -47,7 +50,7 @@ function create_subgraph(graph::GNNGraph, nodes::Vector{Int}) source = Int[] target = Int[] for node in nodes - for neighbor in Graphs.neighbors(graph, node, dir = :out) + for neighbor in Graphs.neighbors(graph, node, dir = :in) if neighbor in keys(node_map) push!(source, node_map[node]) push!(target, node_map[neighbor]) @@ -108,7 +111,8 @@ gnn_graph = GNNGraph(source, target, ndata = features) input_nodes = [1, 2, 3, 4, 5] # Initialize the NeighborLoader with optional batch_size -loader = NeighborLoader(gnn_graph; num_neighbors = [2, 3], input_nodes=input_nodes, num_layers = 2, batch_size = 3, num_batches = 3) +loader = NeighborLoader(gnn_graph; num_neighbors = [0], input_nodes=input_nodes, num_layers = 1, batch_size = 3, num_batches = 3) +# nn = [0], layers =1, n edges should equal 0, no nodes = mini batch # Loop through the number of batches for training, using the iterator batch_counter = 0 @@ -124,3 +128,7 @@ for mini_batch_gnn in loader break end end + +### TODO: only batch size, remove batch num +### TODO: compare pytorch geometric, compare edge cases: batch size = 1, num nodes = 1 +### TODO: think about what tests \ No newline at end of file From 0c4a6535bd775b9233c6c3bb3e89c7e6455319ed Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sat, 28 Sep 2024 12:57:30 +0200 Subject: [PATCH 11/27] feat: add tests, refine code --- .../src/GraphNeuralNetworks.jl | 3 + GraphNeuralNetworks/test/runtests.jl | 1 + GraphNeuralNetworks/test/samplers.jl | 128 ++++++++++++++++++ src/samplers.jl | 68 ++++------ 4 files changed, 157 insertions(+), 43 deletions(-) create mode 100644 GraphNeuralNetworks/test/samplers.jl diff --git a/GraphNeuralNetworks/src/GraphNeuralNetworks.jl b/GraphNeuralNetworks/src/GraphNeuralNetworks.jl index c9a227b8d..ec5d5603b 100644 --- a/GraphNeuralNetworks/src/GraphNeuralNetworks.jl +++ b/GraphNeuralNetworks/src/GraphNeuralNetworks.jl @@ -66,4 +66,7 @@ export GlobalPool, include("deprecations.jl") +include("samplers.jl") +export NeighborLoader + end diff --git a/GraphNeuralNetworks/test/runtests.jl b/GraphNeuralNetworks/test/runtests.jl index 05cb6fd5f..f796651bb 100644 --- a/GraphNeuralNetworks/test/runtests.jl +++ b/GraphNeuralNetworks/test/runtests.jl @@ -30,6 +30,7 @@ tests = [ "layers/temporalconv", "layers/pool", "examples/node_classification_cora", + "samplers" ] !CUDA.functional() && @warn("CUDA unavailable, not testing GPU support") diff --git a/GraphNeuralNetworks/test/samplers.jl b/GraphNeuralNetworks/test/samplers.jl new file mode 100644 index 
000000000..f78348610 --- /dev/null +++ b/GraphNeuralNetworks/test/samplers.jl @@ -0,0 +1,128 @@ +using Test +using GraphNeuralNetworks + +# Helper function to create a simple graph with node features using GNNGraph +function create_test_graph() + source = [1, 2, 3, 4] # Define source nodes of edges + target = [2, 3, 4, 5] # Define target nodes of edges + node_features = rand(Float32, 5, 5) # Create random node features (5 features for 5 nodes) + + return GNNGraph(source, target, ndata = node_features) # Create a GNNGraph with edges and features +end + +# Tests for NeighborLoader structure and its functionalities +@testset "NeighborLoader tests" begin + + # 1. Basic functionality: Check neighbor sampling and subgraph creation + @testset "Basic functionality" begin + g = create_test_graph() + + # Define NeighborLoader with 2 neighbors per layer, 2 layers, batch size 2 + loader = NeighborLoader(g; num_neighbors=[2, 2], input_nodes=[1, 2], num_layers=2, batch_size=2) + + mini_batch_gnn, next_state = iterate(loader) + + # Test if the mini-batch graph is not empty + @test !isempty(mini_batch_gnn.graph) + + num_sampled_nodes = mini_batch_gnn.num_nodes + println("Number of nodes in mini-batch: ", num_sampled_nodes) + + @test num_sampled_nodes == 2 + + # Test if there are edges in the subgraph + @test mini_batch_gnn.num_edges > 0 + end + + # 2. Edge case: Single node with no neighbors + @testset "Single node with no neighbors" begin + g = SimpleDiGraph(1) # A graph with a single node and no edges + node_features = rand(Float32, 5, 1) + graph = GNNGraph(g, ndata = node_features) + + loader = NeighborLoader(graph; num_neighbors=[2], input_nodes=[1], num_layers=1) + + mini_batch_gnn, next_state = iterate(loader) + + # Test if the mini-batch graph contains only one node + @test size(mini_batch_gnn.x, 2) == 1 + end + + # 3. Edge case: A node with no outgoing edges (isolated node) + @testset "Node with no outgoing edges" begin + g = SimpleDiGraph(2) # Graph with 2 nodes, no edges + node_features = rand(Float32, 5, 2) + graph = GNNGraph(g, ndata = node_features) + + loader = NeighborLoader(graph; num_neighbors=[1], input_nodes=[1, 2], num_layers=1) + + mini_batch_gnn, next_state = iterate(loader) + + # Test if the mini-batch graph contains the input nodes only (as no neighbors can be sampled) + @test size(mini_batch_gnn.x, 2) == 2 # Only two isolated nodes + end + + # 4. Edge case: A fully connected graph + @testset "Fully connected graph" begin + g = SimpleDiGraph(3) + add_edge!(g, 1, 2) + add_edge!(g, 2, 3) + add_edge!(g, 3, 1) + node_features = rand(Float32, 5, 3) + graph = GNNGraph(g, ndata = node_features) + + loader = NeighborLoader(graph; num_neighbors=[2, 2], input_nodes=[1], num_layers=2) + + mini_batch_gnn, next_state = iterate(loader) + + # Test if all nodes are included in the mini-batch since it's fully connected + @test size(mini_batch_gnn.x, 2) == 3 # All nodes should be included + end + + # 5. Edge case: More layers than the number of neighbors + @testset "More layers than available neighbors" begin + g = SimpleDiGraph(3) + add_edge!(g, 1, 2) + add_edge!(g, 2, 3) + node_features = rand(Float32, 5, 3) + graph = GNNGraph(g, ndata = node_features) + + # Test with 3 layers but only enough connections for 2 layers + loader = NeighborLoader(graph; num_neighbors=[1, 1, 1], input_nodes=[1], num_layers=3) + + mini_batch_gnn, next_state = iterate(loader) + + # Test if the mini-batch graph contains all available nodes + @test size(mini_batch_gnn.x, 2) == 1 + end + + # 6. 
Edge case: Large batch size greater than the number of input nodes + @testset "Large batch size" begin + g = create_test_graph() + + # Define NeighborLoader with a larger batch size than input nodes + loader = NeighborLoader(g; num_neighbors=[2], input_nodes=[1, 2], num_layers=1, batch_size=10) + + mini_batch_gnn, next_state = iterate(loader) + + # Test if the mini-batch graph is not empty + @test !isempty(mini_batch_gnn.graph) + + # Test if the correct number of nodes are sampled + @test size(mini_batch_gnn.x, 2) == length(unique([1, 2])) # Nodes [1, 2] are expected + end + + # 7. Edge case: No neighbors sampled (num_neighbors = [0]) and 1 layer + @testset "No neighbors sampled" begin + g = create_test_graph() + + # Define NeighborLoader with 0 neighbors per layer, 1 layer, batch size 2 + loader = NeighborLoader(g; num_neighbors=[0], input_nodes=[1, 2], num_layers=1, batch_size=2) + + mini_batch_gnn, next_state = iterate(loader) + + # Test if the mini-batch graph contains only the input nodes + @test size(mini_batch_gnn.x, 2) == 2 # No neighbors should be sampled, only nodes 1 and 2 should be in the graph + end + +end \ No newline at end of file diff --git a/src/samplers.jl b/src/samplers.jl index 6f5def815..4502bc865 100644 --- a/src/samplers.jl +++ b/src/samplers.jl @@ -8,13 +8,12 @@ struct NeighborLoader input_nodes::Vector{Int} # Set of input nodes (starting nodes for sampling) num_layers::Int # Number of layers for neighborhood expansion batch_size::Union{Int, Nothing} # Optional batch size, defaults to the length of input_nodes if not given - num_batches::Int # Number of batches to process neighbors_cache::Dict{Int, Vector{Int}} # Cache neighbors to avoid recomputation end # Constructor for NeighborLoader with optional batch size -function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing, num_batches::Int) - return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? length(input_nodes) : batch_size, num_batches, Dict{Int, Vector{Int}}()) +function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing) + return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? 
length(input_nodes) : batch_size, Dict{Int, Vector{Int}}()) end # Function to get cached neighbors or compute them @@ -39,18 +38,23 @@ function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) end end -### TODO check subsample function -### TODO write subsample function for universal use -### TODO factor out to separate PR -# define a method function Graphs.induced_subgraph(g::GNNGraph, nodes) (rename) -function create_subgraph(graph::GNNGraph, nodes::Vector{Int}) +function induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) + if isempty(nodes) + return GNNGraph() # Return empty graph if no nodes are provided + end + node_map = Dict(node => i for (i, node) in enumerate(nodes)) # Collect edges to add source = Int[] target = Int[] + backup_gnn = GNNGraph() for node in nodes - for neighbor in Graphs.neighbors(graph, node, dir = :in) + neighbors = Graphs.neighbors(graph, node, dir = :in) + if isempty(neighbors) + backup_gnn = add_nodes(backup_gnn, 1) + end + for neighbor in neighbors if neighbor in keys(node_map) push!(source, node_map[node]) push!(target, node_map[neighbor]) @@ -61,12 +65,17 @@ function create_subgraph(graph::GNNGraph, nodes::Vector{Int}) # Extract features for the new nodes new_features = graph.x[:, nodes] + if isempty(source) && isempty(target) + backup_gnn.ndata.x = new_features + return backup_gnn # Return empty graph if no nodes are provided + end + return GNNGraph(source, target, ndata = new_features) # Return the new GNNGraph with subgraph and features end # Iterator protocol for NeighborLoader with lazy batch loading function Base.iterate(loader::NeighborLoader, state=1) - if state > length(loader.input_nodes) || (state - 1) // loader.batch_size >= loader.num_batches + if state > length(loader.input_nodes) return nothing # End of iteration if batches are exhausted (state larger than amount of input nodes or current batch no >= batch number) end @@ -95,40 +104,13 @@ function Base.iterate(loader::NeighborLoader, state=1) # Collect subgraph nodes and their features subgraph_node_list = collect(subgraph_nodes) - mini_batch_gnn = create_subgraph(loader.graph, subgraph_node_list) # Create a subgraph of the nodes - - # Continue iteration for the next batch - return mini_batch_gnn, state + batch_size -end -# Example -source = [1,1,2,2,3,3,3,4,5] -target = [2,3,1,3,1,2,4,3,5] -features = rand(3, 5) -gnn_graph = GNNGraph(source, target, ndata = features) - -# Define input nodes (seed nodes) to start sampling -input_nodes = [1, 2, 3, 4, 5] - -# Initialize the NeighborLoader with optional batch_size -loader = NeighborLoader(gnn_graph; num_neighbors = [0], input_nodes=input_nodes, num_layers = 1, batch_size = 3, num_batches = 3) -# nn = [0], layers =1, n edges should equal 0, no nodes = mini batch - -# Loop through the number of batches for training, using the iterator -batch_counter = 0 -for mini_batch_gnn in loader - batch_counter += 1 - println("Batch $batch_counter: Nodes in mini-batch graph: $(nv(mini_batch_gnn)), $mini_batch_gnn") + if isempty(subgraph_node_list) + return GNNGraph(), state + batch_size + end - # Here, you would pass mini_batch_gnn to the GNN model for training - # For example: model(mini_batch_gnn) + mini_batch_gnn = induced_subgraph(loader.graph, subgraph_node_list) # Create a subgraph of the nodes - # Stop if num_batches is reached - if batch_counter >= loader.num_batches - break - end + # Continue iteration for the next batch + return mini_batch_gnn, state + batch_size end - -### TODO: only batch size, remove batch num -### TODO: compare 
pytorch geometric, compare edge cases: batch size = 1, num nodes = 1 -### TODO: think about what tests \ No newline at end of file From 2035c5e53d2601960c7a8b4b193e96ab2bd97a22 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sat, 28 Sep 2024 13:06:05 +0200 Subject: [PATCH 12/27] fix: add samplers.jl after rebase --- GraphNeuralNetworks/src/samplers.jl | 116 +++++++++++++++++++++++++++ GraphNeuralNetworks/test/samplers.jl | 6 +- 2 files changed, 119 insertions(+), 3 deletions(-) diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index e69de29bb..44fb3c825 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -0,0 +1,116 @@ +using GraphNeuralNetworks +using Graphs + +# Define a NeighborLoader structure for sampling neighbors +struct NeighborLoader + graph::GNNGraph # The input GNNGraph (graph + features from GraphNeuralNetworks.jl) + num_neighbors::Vector{Int} # Number of neighbors to sample per node, for each layer + input_nodes::Vector{Int} # Set of input nodes (starting nodes for sampling) + num_layers::Int # Number of layers for neighborhood expansion + batch_size::Union{Int, Nothing} # Optional batch size, defaults to the length of input_nodes if not given + neighbors_cache::Dict{Int, Vector{Int}} # Cache neighbors to avoid recomputation +end + +# Constructor for NeighborLoader with optional batch size +function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing) + return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? length(input_nodes) : batch_size, Dict{Int, Vector{Int}}()) +end + +# Function to get cached neighbors or compute them +function get_neighbors(loader::NeighborLoader, node::Int) + if haskey(loader.neighbors_cache, node) + return loader.neighbors_cache[node] + else + neighbors = Graphs.neighbors(loader.graph, node, dir = :in) # Get neighbors from graph + loader.neighbors_cache[node] = neighbors + return neighbors + end +end + +# Function to sample neighbors for a given node at a specific layer +function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) + neighbors = get_neighbors(loader, node) + if isempty(neighbors) + return Int[] + else + num_samples = min(loader.num_neighbors[layer], length(neighbors)) # Limit to required samples for this layer + return rand(neighbors, num_samples) # Randomly sample neighbors + end +end + +function induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) + if isempty(nodes) + return GNNGraph() # Return empty graph if no nodes are provided + end + + node_map = Dict(node => i for (i, node) in enumerate(nodes)) + + # Collect edges to add + source = Int[] + target = Int[] + backup_gnn = GNNGraph() + for node in nodes + neighbors = Graphs.neighbors(graph, node, dir = :in) + if isempty(neighbors) + backup_gnn = add_nodes(backup_gnn, 1) + end + for neighbor in neighbors + if neighbor in keys(node_map) + push!(source, node_map[node]) + push!(target, node_map[neighbor]) + end + end + end + + # Extract features for the new nodes + new_features = graph.x[:, nodes] + + if isempty(source) && isempty(target) + backup_gnn.ndata.x = new_features + return backup_gnn # Return empty graph if no nodes are provided + end + + return GNNGraph(source, target, ndata = new_features) # Return the new GNNGraph with subgraph and features +end + +# Iterator protocol for NeighborLoader with lazy batch loading +function 
Base.iterate(loader::NeighborLoader, state=1) + if state > length(loader.input_nodes) + return nothing # End of iteration if batches are exhausted (state larger than amount of input nodes or current batch no >= batch number) + end + + # Determine the size of the current batch + batch_size = min(loader.batch_size, length(loader.input_nodes) - state + 1) # Conditional in case there is not enough nodes to fill the last batch + batch_nodes = loader.input_nodes[state:state + batch_size - 1] # Each mini-batch uses different set of input nodes + + # Set for tracking the subgraph nodes + subgraph_nodes = Set(batch_nodes) + + for node in batch_nodes + # Initialize current layer of nodes (starting with the node itself) + sampled_neighbors = Set([node]) + + # For each GNN layer, sample the neighborhood + for layer in 1:loader.num_layers + new_neighbors = Set{Int}() + for n in sampled_neighbors + neighbors = sample_neighbors(loader, n, layer) # Sample neighbors of the node for this layer + new_neighbors = union(new_neighbors, neighbors) # Avoid duplicates in the neighbor set + end + sampled_neighbors = new_neighbors + subgraph_nodes = union(subgraph_nodes, sampled_neighbors) # Expand the subgraph with the new neighbors + end + end + + # Collect subgraph nodes and their features + subgraph_node_list = collect(subgraph_nodes) + + if isempty(subgraph_node_list) + return GNNGraph(), state + batch_size + end + + mini_batch_gnn = induced_subgraph(loader.graph, subgraph_node_list) # Create a subgraph of the nodes + + # Continue iteration for the next batch + return mini_batch_gnn, state + batch_size +end \ No newline at end of file diff --git a/GraphNeuralNetworks/test/samplers.jl b/GraphNeuralNetworks/test/samplers.jl index f78348610..5091d14de 100644 --- a/GraphNeuralNetworks/test/samplers.jl +++ b/GraphNeuralNetworks/test/samplers.jl @@ -6,7 +6,7 @@ function create_test_graph() source = [1, 2, 3, 4] # Define source nodes of edges target = [2, 3, 4, 5] # Define target nodes of edges node_features = rand(Float32, 5, 5) # Create random node features (5 features for 5 nodes) - + return GNNGraph(source, target, ndata = node_features) # Create a GNNGraph with edges and features end @@ -27,9 +27,9 @@ end num_sampled_nodes = mini_batch_gnn.num_nodes println("Number of nodes in mini-batch: ", num_sampled_nodes) - + @test num_sampled_nodes == 2 - + # Test if there are edges in the subgraph @test mini_batch_gnn.num_edges > 0 end From ebebce9b247d9513e987d2d54498ff8d9a10b812 Mon Sep 17 00:00:00 2001 From: Agata Skorupka <45850123+askorupka@users.noreply.github.com> Date: Sun, 29 Sep 2024 20:41:26 +0200 Subject: [PATCH 13/27] chore: add docstrings --- GraphNeuralNetworks/src/samplers.jl | 97 ++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 3 deletions(-) diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index 44fb3c825..4157b52fc 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -1,7 +1,25 @@ using GraphNeuralNetworks using Graphs -# Define a NeighborLoader structure for sampling neighbors +""" + struct NeighborLoader + +A data structure for sampling neighbors from a graph for training Graph Neural Networks (GNNs). +It supports multi-layer sampling of neighbors for a batch of input nodes, useful for mini-batch training. + +# Fields: +- `graph::GNNGraph`: The input graph containing nodes and edges, along with node features (from GraphNeuralNetworks.jl). 
+- `num_neighbors::Vector{Int}`: A vector specifying the number of neighbors to sample per node at each GNN layer. +- `input_nodes::Vector{Int}`: A vector containing the starting nodes for neighbor sampling. +- `num_layers::Int`: The number of layers for neighborhood expansion (how far to sample neighbors). +- `batch_size::Union{Int, Nothing}`: The size of the batch. If not specified, it defaults to the number of `input_nodes`. +- `neighbors_cache::Dict{Int, Vector{Int}}`: A cache to store sampled neighbors for each node, preventing redundant sampling. + +# Usage: +```julia +loader = NeighborLoader(graph; num_neighbors=[10, 5], input_nodes=[1, 2, 3], num_layers=2) +``` +""" struct NeighborLoader graph::GNNGraph # The input GNNGraph (graph + features from GraphNeuralNetworks.jl) num_neighbors::Vector{Int} # Number of neighbors to sample per node, for each layer @@ -11,11 +29,40 @@ struct NeighborLoader neighbors_cache::Dict{Int, Vector{Int}} # Cache neighbors to avoid recomputation end -# Constructor for NeighborLoader with optional batch size +### `NeighborLoader` constructor +""" + NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing) + +Creates a `NeighborLoader` to sample neighbors from the provided `graph` for the training. + The loader supports batching and multi-layer neighbor sampling. + +# Arguments: +- `graph::GNNGraph`: The input graph with node features. +- `num_neighbors::Vector{Int}`: Number of neighbors to sample per node, per layer. +- `input_nodes::Vector{Int}`: Set of starting nodes for sampling. +- `num_layers::Int`: Number of layers to expand neighborhoods for sampling. +- `batch_size::Union{Int, Nothing}`: Optional batch size. If `nothing`, it defaults to the length of `input_nodes`. + +# Returns: +A `NeighborLoader` object. +""" function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing) return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? length(input_nodes) : batch_size, Dict{Int, Vector{Int}}()) end +""" + get_neighbors(loader::NeighborLoader, node::Int) -> Vector{Int} + +Returns the neighbors of a given `node` in the graph from the `NeighborLoader`. + It first checks if the neighbors are cached; if not, it retrieves the neighbors from the graph and caches them for future use. + +# Arguments: +- `loader::NeighborLoader`: The `NeighborLoader` instance. +- `node::Int`: The node whose neighbors you want to sample. + +# Returns: +A vector of neighbor node indices. +""" # Function to get cached neighbors or compute them function get_neighbors(loader::NeighborLoader, node::Int) if haskey(loader.neighbors_cache, node) @@ -27,6 +74,20 @@ function get_neighbors(loader::NeighborLoader, node::Int) end end +""" + sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) -> Vector{Int} + +Samples a specified number of neighbors for the given `node` at a particular `layer` of the GNN. + The number of neighbors sampled is defined in `loader.num_neighbors`. + +# Arguments: +- `loader::NeighborLoader`: The `NeighborLoader` instance. +- `node::Int`: The node to sample neighbors for. +- `layer::Int`: The current GNN layer (used to determine how many neighbors to sample). + +# Returns: +A vector of sampled neighbor node indices. 
+""" # Function to sample neighbors for a given node at a specific layer function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) neighbors = get_neighbors(loader, node) @@ -38,6 +99,20 @@ function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) end end +""" + induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) -> GNNGraph + +Generates a subgraph from the original graph using the provided `nodes`. + The function includes the nodes' neighbors and creates edges between nodes that are connected in the original graph. + If a node has no neighbors, an isolated node will be added to the subgraph. + +# Arguments: +- `graph::GNNGraph`: The original graph containing nodes, edges, and node features. +- `nodes::Vector{Int}`: A vector of node indices to include in the subgraph. + +# Returns: +A new `GNNGraph` containing the subgraph with the specified nodes and their features. +""" function induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) if isempty(nodes) return GNNGraph() # Return empty graph if no nodes are provided @@ -73,6 +148,22 @@ function induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) return GNNGraph(source, target, ndata = new_features) # Return the new GNNGraph with subgraph and features end +""" + Base.iterate(loader::NeighborLoader, state::Int=1) -> Tuple{GNNGraph, Int} + +Implements the iterator protocol for `NeighborLoader`, allowing mini-batch processing for neighbor sampling in GNNs. + Each call to `iterate` returns a mini-batch subgraph with sampled neighbors for a batch of input nodes, + expanding their neighborhoods for a specified number of layers. + +# Arguments: +- `loader::NeighborLoader`: The `NeighborLoader` instance to sample neighbors from. +- `state::Int`: The current position in the input nodes for batching. Defaults to 1. + +# Returns: +A tuple `(mini_batch_gnn, next_state)` where: +- `mini_batch_gnn::GNNGraph`: The subgraph induced by the sampled nodes and their neighbors for the current mini-batch. +- `next_state::Int`: The next state (index) for iterating through the input nodes. If the input nodes are exhausted, returns `nothing`. 
+""" # Iterator protocol for NeighborLoader with lazy batch loading function Base.iterate(loader::NeighborLoader, state=1) if state > length(loader.input_nodes) @@ -113,4 +204,4 @@ function Base.iterate(loader::NeighborLoader, state=1) # Continue iteration for the next batch return mini_batch_gnn, state + batch_size -end \ No newline at end of file +end From abf31cd53deaed023c75a6164e2933fe1a127ed0 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sun, 29 Sep 2024 20:42:59 +0200 Subject: [PATCH 14/27] chore: Graphs to deps --- GraphNeuralNetworks/Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GraphNeuralNetworks/Project.toml b/GraphNeuralNetworks/Project.toml index 89979ff69..405a5018c 100644 --- a/GraphNeuralNetworks/Project.toml +++ b/GraphNeuralNetworks/Project.toml @@ -9,6 +9,7 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" GNNGraphs = "aed8fd31-079b-4b5a-b342-a13352159b8c" GNNlib = "a6a84749-d869-43f8-aacc-be26a1996e48" +Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" @@ -46,7 +47,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" -Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" From bcdfa5e82dee0b7ffcb4622fb16bf6ad18c15f68 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sun, 29 Sep 2024 20:43:45 +0200 Subject: [PATCH 15/27] chore: move using Graphs to main file --- GraphNeuralNetworks/src/GraphNeuralNetworks.jl | 1 + GraphNeuralNetworks/src/samplers.jl | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/GraphNeuralNetworks/src/GraphNeuralNetworks.jl b/GraphNeuralNetworks/src/GraphNeuralNetworks.jl index ec5d5603b..8066cc1fb 100644 --- a/GraphNeuralNetworks/src/GraphNeuralNetworks.jl +++ b/GraphNeuralNetworks/src/GraphNeuralNetworks.jl @@ -10,6 +10,7 @@ using NNlib: scatter, gather using ChainRulesCore using Reexport using MLUtils: zeros_like +using Graphs using GNNGraphs: COO_T, ADJMAT_T, SPARSE_T, check_num_nodes, check_num_edges, diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index 4157b52fc..647edd688 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -1,6 +1,3 @@ -using GraphNeuralNetworks -using Graphs - """ struct NeighborLoader From 970d2973388833ff5b84aacdd91cbb926684b4b6 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sun, 29 Sep 2024 20:45:49 +0200 Subject: [PATCH 16/27] chore: readd Graphs to extras --- GraphNeuralNetworks/Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/GraphNeuralNetworks/Project.toml b/GraphNeuralNetworks/Project.toml index 405a5018c..392d72a1f 100644 --- a/GraphNeuralNetworks/Project.toml +++ b/GraphNeuralNetworks/Project.toml @@ -47,6 +47,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" 
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" From b4c1ad71346a474ce119ca5f2b2efdbcb5ba6ab8 Mon Sep 17 00:00:00 2001 From: Agata Skorupka <45850123+askorupka@users.noreply.github.com> Date: Sun, 29 Sep 2024 20:58:43 +0200 Subject: [PATCH 17/27] chore: delete src/samplers.jl created by mistake --- src/samplers.jl | 116 ------------------------------------------------ 1 file changed, 116 deletions(-) delete mode 100644 src/samplers.jl diff --git a/src/samplers.jl b/src/samplers.jl deleted file mode 100644 index 4502bc865..000000000 --- a/src/samplers.jl +++ /dev/null @@ -1,116 +0,0 @@ -using GraphNeuralNetworks -using Graphs - -# Define a NeighborLoader structure for sampling neighbors -struct NeighborLoader - graph::GNNGraph # The input GNNGraph (graph + features from GraphNeuralNetworks.jl) - num_neighbors::Vector{Int} # Number of neighbors to sample per node, for each layer - input_nodes::Vector{Int} # Set of input nodes (starting nodes for sampling) - num_layers::Int # Number of layers for neighborhood expansion - batch_size::Union{Int, Nothing} # Optional batch size, defaults to the length of input_nodes if not given - neighbors_cache::Dict{Int, Vector{Int}} # Cache neighbors to avoid recomputation -end - -# Constructor for NeighborLoader with optional batch size -function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing) - return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? length(input_nodes) : batch_size, Dict{Int, Vector{Int}}()) -end - -# Function to get cached neighbors or compute them -function get_neighbors(loader::NeighborLoader, node::Int) - if haskey(loader.neighbors_cache, node) - return loader.neighbors_cache[node] - else - neighbors = Graphs.neighbors(loader.graph, node, dir = :in) # Get neighbors from graph - loader.neighbors_cache[node] = neighbors - return neighbors - end -end - -# Function to sample neighbors for a given node at a specific layer -function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) - neighbors = get_neighbors(loader, node) - if isempty(neighbors) - return Int[] - else - num_samples = min(loader.num_neighbors[layer], length(neighbors)) # Limit to required samples for this layer - return rand(neighbors, num_samples) # Randomly sample neighbors - end -end - -function induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) - if isempty(nodes) - return GNNGraph() # Return empty graph if no nodes are provided - end - - node_map = Dict(node => i for (i, node) in enumerate(nodes)) - - # Collect edges to add - source = Int[] - target = Int[] - backup_gnn = GNNGraph() - for node in nodes - neighbors = Graphs.neighbors(graph, node, dir = :in) - if isempty(neighbors) - backup_gnn = add_nodes(backup_gnn, 1) - end - for neighbor in neighbors - if neighbor in keys(node_map) - push!(source, node_map[node]) - push!(target, node_map[neighbor]) - end - end - end - - # Extract features for the new nodes - new_features = graph.x[:, nodes] - - if isempty(source) && isempty(target) - backup_gnn.ndata.x = new_features - return backup_gnn # Return empty graph if no nodes are provided - end - - return GNNGraph(source, target, ndata = new_features) # Return the new GNNGraph with subgraph and features -end - -# Iterator protocol for NeighborLoader with lazy batch loading -function Base.iterate(loader::NeighborLoader, state=1) - if state > length(loader.input_nodes) - 
return nothing # End of iteration if batches are exhausted (state larger than amount of input nodes or current batch no >= batch number) - end - - # Determine the size of the current batch - batch_size = min(loader.batch_size, length(loader.input_nodes) - state + 1) # Conditional in case there is not enough nodes to fill the last batch - batch_nodes = loader.input_nodes[state:state + batch_size - 1] # Each mini-batch uses different set of input nodes - - # Set for tracking the subgraph nodes - subgraph_nodes = Set(batch_nodes) - - for node in batch_nodes - # Initialize current layer of nodes (starting with the node itself) - sampled_neighbors = Set([node]) - - # For each GNN layer, sample the neighborhood - for layer in 1:loader.num_layers - new_neighbors = Set{Int}() - for n in sampled_neighbors - neighbors = sample_neighbors(loader, n, layer) # Sample neighbors of the node for this layer - new_neighbors = union(new_neighbors, neighbors) # Avoid duplicates in the neighbor set - end - sampled_neighbors = new_neighbors - subgraph_nodes = union(subgraph_nodes, sampled_neighbors) # Expand the subgraph with the new neighbors - end - end - - # Collect subgraph nodes and their features - subgraph_node_list = collect(subgraph_nodes) - - if isempty(subgraph_node_list) - return GNNGraph(), state + batch_size - end - - mini_batch_gnn = induced_subgraph(loader.graph, subgraph_node_list) # Create a subgraph of the nodes - - # Continue iteration for the next batch - return mini_batch_gnn, state + batch_size -end From 5e7544ce716c699c2115c27b80af7c2c46967a86 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sat, 12 Oct 2024 13:57:33 +0200 Subject: [PATCH 18/27] fix: add sampling.jl to docs --- GraphNeuralNetworks/docs/src/api/gnngraph.md | 6 ++ GraphNeuralNetworks/src/samplers.jl | 61 ++------------------ 2 files changed, 12 insertions(+), 55 deletions(-) diff --git a/GraphNeuralNetworks/docs/src/api/gnngraph.md b/GraphNeuralNetworks/docs/src/api/gnngraph.md index f708c3840..d251e02dc 100644 --- a/GraphNeuralNetworks/docs/src/api/gnngraph.md +++ b/GraphNeuralNetworks/docs/src/api/gnngraph.md @@ -91,4 +91,10 @@ Private = false ```@docs Graphs.induced_subgraph(::GNNGraph, ::Vector{Int}) +``` + +```@autodocs +Modules = [GraphNeuralNetworks] +Pages = ["samplers"] +Private = false ``` \ No newline at end of file diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index 647edd688..2d65dc217 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -4,7 +4,7 @@ A data structure for sampling neighbors from a graph for training Graph Neural Networks (GNNs). It supports multi-layer sampling of neighbors for a batch of input nodes, useful for mini-batch training. -# Fields: +# Fields - `graph::GNNGraph`: The input graph containing nodes and edges, along with node features (from GraphNeuralNetworks.jl). - `num_neighbors::Vector{Int}`: A vector specifying the number of neighbors to sample per node at each GNN layer. - `input_nodes::Vector{Int}`: A vector containing the starting nodes for neighbor sampling. @@ -12,7 +12,7 @@ It supports multi-layer sampling of neighbors for a batch of input nodes, useful - `batch_size::Union{Int, Nothing}`: The size of the batch. If not specified, it defaults to the number of `input_nodes`. - `neighbors_cache::Dict{Int, Vector{Int}}`: A cache to store sampled neighbors for each node, preventing redundant sampling. 
-# Usage: +# Usage ```julia loader = NeighborLoader(graph; num_neighbors=[10, 5], input_nodes=[1, 2, 3], num_layers=2) ``` @@ -48,7 +48,7 @@ function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes end """ - get_neighbors(loader::NeighborLoader, node::Int) -> Vector{Int} + get_neighbors(loader::NeighborLoader, node::Int) Returns the neighbors of a given `node` in the graph from the `NeighborLoader`. It first checks if the neighbors are cached; if not, it retrieves the neighbors from the graph and caches them for future use. @@ -72,7 +72,7 @@ function get_neighbors(loader::NeighborLoader, node::Int) end """ - sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) -> Vector{Int} + sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) Samples a specified number of neighbors for the given `node` at a particular `layer` of the GNN. The number of neighbors sampled is defined in `loader.num_neighbors`. @@ -97,56 +97,7 @@ function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) end """ - induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) -> GNNGraph - -Generates a subgraph from the original graph using the provided `nodes`. - The function includes the nodes' neighbors and creates edges between nodes that are connected in the original graph. - If a node has no neighbors, an isolated node will be added to the subgraph. - -# Arguments: -- `graph::GNNGraph`: The original graph containing nodes, edges, and node features. -- `nodes::Vector{Int}`: A vector of node indices to include in the subgraph. - -# Returns: -A new `GNNGraph` containing the subgraph with the specified nodes and their features. -""" -function induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) - if isempty(nodes) - return GNNGraph() # Return empty graph if no nodes are provided - end - - node_map = Dict(node => i for (i, node) in enumerate(nodes)) - - # Collect edges to add - source = Int[] - target = Int[] - backup_gnn = GNNGraph() - for node in nodes - neighbors = Graphs.neighbors(graph, node, dir = :in) - if isempty(neighbors) - backup_gnn = add_nodes(backup_gnn, 1) - end - for neighbor in neighbors - if neighbor in keys(node_map) - push!(source, node_map[node]) - push!(target, node_map[neighbor]) - end - end - end - - # Extract features for the new nodes - new_features = graph.x[:, nodes] - - if isempty(source) && isempty(target) - backup_gnn.ndata.x = new_features - return backup_gnn # Return empty graph if no nodes are provided - end - - return GNNGraph(source, target, ndata = new_features) # Return the new GNNGraph with subgraph and features -end - -""" - Base.iterate(loader::NeighborLoader, state::Int=1) -> Tuple{GNNGraph, Int} + Base.iterate(loader::NeighborLoader, state::Int=1) Implements the iterator protocol for `NeighborLoader`, allowing mini-batch processing for neighbor sampling in GNNs. 
Each call to `iterate` returns a mini-batch subgraph with sampled neighbors for a batch of input nodes, @@ -197,7 +148,7 @@ function Base.iterate(loader::NeighborLoader, state=1) return GNNGraph(), state + batch_size end - mini_batch_gnn = induced_subgraph(loader.graph, subgraph_node_list) # Create a subgraph of the nodes + mini_batch_gnn = Graphs.induced_subgraph(loader.graph, subgraph_node_list) # Create a subgraph of the nodes # Continue iteration for the next batch return mini_batch_gnn, state + batch_size From c9d412b41c195a3588f2e9045615707a27a10732 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sat, 12 Oct 2024 14:22:47 +0200 Subject: [PATCH 19/27] fix: add sampling.jl to docs --- GraphNeuralNetworks/docs/src/api/gnngraph.md | 2 +- GraphNeuralNetworks/test/samplers.jl | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/GraphNeuralNetworks/docs/src/api/gnngraph.md b/GraphNeuralNetworks/docs/src/api/gnngraph.md index d251e02dc..0ce60392b 100644 --- a/GraphNeuralNetworks/docs/src/api/gnngraph.md +++ b/GraphNeuralNetworks/docs/src/api/gnngraph.md @@ -95,6 +95,6 @@ Graphs.induced_subgraph(::GNNGraph, ::Vector{Int}) ```@autodocs Modules = [GraphNeuralNetworks] -Pages = ["samplers"] +Pages = ["samplers.jl"] Private = false ``` \ No newline at end of file diff --git a/GraphNeuralNetworks/test/samplers.jl b/GraphNeuralNetworks/test/samplers.jl index 5091d14de..546291717 100644 --- a/GraphNeuralNetworks/test/samplers.jl +++ b/GraphNeuralNetworks/test/samplers.jl @@ -1,6 +1,3 @@ -using Test -using GraphNeuralNetworks - # Helper function to create a simple graph with node features using GNNGraph function create_test_graph() source = [1, 2, 3, 4] # Define source nodes of edges From 2d7bd0b338c8ef62cb8a827ce7882283c4dd480f Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sat, 12 Oct 2024 19:52:59 +0200 Subject: [PATCH 20/27] fix: add sampling.jl to docs --- GraphNeuralNetworks/docs/make.jl | 1 + GraphNeuralNetworks/docs/src/api/gnngraph.md | 6 ------ GraphNeuralNetworks/docs/src/api/samplers.md | 14 ++++++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) create mode 100644 GraphNeuralNetworks/docs/src/api/samplers.md diff --git a/GraphNeuralNetworks/docs/make.jl b/GraphNeuralNetworks/docs/make.jl index 869aa94f1..8cb762166 100644 --- a/GraphNeuralNetworks/docs/make.jl +++ b/GraphNeuralNetworks/docs/make.jl @@ -46,6 +46,7 @@ makedocs(; "Message Passing" => "api/messagepassing.md", "Heterogeneous Graphs" => "api/heterograph.md", "Temporal Graphs" => "api/temporalgraph.md", + "Samplers" => "api/samplers.md", "Utils" => "api/utils.md", ], "Developer Notes" => "dev.md", diff --git a/GraphNeuralNetworks/docs/src/api/gnngraph.md b/GraphNeuralNetworks/docs/src/api/gnngraph.md index 0ce60392b..f708c3840 100644 --- a/GraphNeuralNetworks/docs/src/api/gnngraph.md +++ b/GraphNeuralNetworks/docs/src/api/gnngraph.md @@ -91,10 +91,4 @@ Private = false ```@docs Graphs.induced_subgraph(::GNNGraph, ::Vector{Int}) -``` - -```@autodocs -Modules = [GraphNeuralNetworks] -Pages = ["samplers.jl"] -Private = false ``` \ No newline at end of file diff --git a/GraphNeuralNetworks/docs/src/api/samplers.md b/GraphNeuralNetworks/docs/src/api/samplers.md new file mode 100644 index 000000000..f4285562c --- /dev/null +++ b/GraphNeuralNetworks/docs/src/api/samplers.md @@ -0,0 +1,14 @@ +```@meta +CurrentModule = GraphNeuralNetworks +``` + +# Samplers + + +## Docs + +```@autodocs +Modules = [GraphNeuralNetworks] +Pages = ["samplers.jl"] +Private = false +``` From 
65aa564cf50598d65cb1628cd475ca9d547bd9b9 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sat, 12 Oct 2024 21:30:33 +0200 Subject: [PATCH 21/27] fix: deduplicate function --- GraphNeuralNetworks/src/samplers.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index 2d65dc217..71383e539 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -72,7 +72,7 @@ function get_neighbors(loader::NeighborLoader, node::Int) end """ - sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) + sample_nbrs(loader::NeighborLoader, node::Int, layer::Int) Samples a specified number of neighbors for the given `node` at a particular `layer` of the GNN. The number of neighbors sampled is defined in `loader.num_neighbors`. @@ -86,7 +86,7 @@ Samples a specified number of neighbors for the given `node` at a particular `la A vector of sampled neighbor node indices. """ # Function to sample neighbors for a given node at a specific layer -function sample_neighbors(loader::NeighborLoader, node::Int, layer::Int) +function sample_nbrs(loader::NeighborLoader, node::Int, layer::Int) neighbors = get_neighbors(loader, node) if isempty(neighbors) return Int[] @@ -133,7 +133,7 @@ function Base.iterate(loader::NeighborLoader, state=1) for layer in 1:loader.num_layers new_neighbors = Set{Int}() for n in sampled_neighbors - neighbors = sample_neighbors(loader, n, layer) # Sample neighbors of the node for this layer + neighbors = sample_nbrs(loader, n, layer) # Sample neighbors of the node for this layer new_neighbors = union(new_neighbors, neighbors) # Avoid duplicates in the neighbor set end sampled_neighbors = new_neighbors From 61c5e39da4a51385873757b38d5e0cac9e1a1ab7 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Thu, 24 Oct 2024 22:15:07 +0200 Subject: [PATCH 22/27] fix: fix broken tests --- GNNGraphs/src/sampling.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/GNNGraphs/src/sampling.jl b/GNNGraphs/src/sampling.jl index 7e723182a..724ba2b1e 100644 --- a/GNNGraphs/src/sampling.jl +++ b/GNNGraphs/src/sampling.jl @@ -177,6 +177,8 @@ function Graphs.induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) node_map = Dict(node => i for (i, node) in enumerate(nodes)) + edge_list = [collect(t) for t in zip(edge_index(graph)[1],edge_index(graph)[2])] + # Collect edges to add source = Int[] target = Int[] @@ -188,7 +190,11 @@ function Graphs.induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) push!(target, node_map[node]) push!(source, node_map[neighbor]) - eindex = findfirst(x -> x == [neighbor, node], edge_index(graph)) + println(edge_index(graph)) + println([neighbor, node]) + println(findfirst(x -> x == [neighbor, node], edge_list)) + + eindex = findfirst(x -> x == [neighbor, node], edge_list) push!(eindices, eindex) end end From aec5574ea7c2c214fd92bc4a1de14fea75157b0c Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Thu, 24 Oct 2024 22:23:17 +0200 Subject: [PATCH 23/27] chore: remove printlns --- GNNGraphs/src/sampling.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/GNNGraphs/src/sampling.jl b/GNNGraphs/src/sampling.jl index 724ba2b1e..e78a2f299 100644 --- a/GNNGraphs/src/sampling.jl +++ b/GNNGraphs/src/sampling.jl @@ -190,10 +190,6 @@ function Graphs.induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) push!(target, node_map[node]) push!(source, node_map[neighbor]) - println(edge_index(graph)) - println([neighbor, node]) - 
println(findfirst(x -> x == [neighbor, node], edge_list)) - eindex = findfirst(x -> x == [neighbor, node], edge_list) push!(eindices, eindex) end From e6750867052516cce84313816c6e3ce2270c68d2 Mon Sep 17 00:00:00 2001 From: Agata Skorupka <45850123+askorupka@users.noreply.github.com> Date: Sun, 27 Oct 2024 19:25:08 +0100 Subject: [PATCH 24/27] Update GraphNeuralNetworks/src/samplers.jl Co-authored-by: Carlo Lucibello --- GraphNeuralNetworks/src/samplers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index 71383e539..871c71566 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -10,7 +10,6 @@ It supports multi-layer sampling of neighbors for a batch of input nodes, useful - `input_nodes::Vector{Int}`: A vector containing the starting nodes for neighbor sampling. - `num_layers::Int`: The number of layers for neighborhood expansion (how far to sample neighbors). - `batch_size::Union{Int, Nothing}`: The size of the batch. If not specified, it defaults to the number of `input_nodes`. -- `neighbors_cache::Dict{Int, Vector{Int}}`: A cache to store sampled neighbors for each node, preventing redundant sampling. # Usage ```julia From 62f5d8723056cfe391ba2dc48d15ad8419fe7dba Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sun, 27 Oct 2024 19:28:41 +0100 Subject: [PATCH 25/27] fix: remove docstrings where not needed --- GNNGraphs/src/sampling.jl | 1 - GraphNeuralNetworks/src/samplers.jl | 46 ----------------------------- 2 files changed, 47 deletions(-) diff --git a/GNNGraphs/src/sampling.jl b/GNNGraphs/src/sampling.jl index e78a2f299..6e38730f0 100644 --- a/GNNGraphs/src/sampling.jl +++ b/GNNGraphs/src/sampling.jl @@ -189,7 +189,6 @@ function Graphs.induced_subgraph(graph::GNNGraph, nodes::Vector{Int}) if neighbor in keys(node_map) push!(target, node_map[node]) push!(source, node_map[neighbor]) - eindex = findfirst(x -> x == [neighbor, node], edge_list) push!(eindices, eindex) end diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index 871c71566..fa165334e 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -25,40 +25,10 @@ struct NeighborLoader neighbors_cache::Dict{Int, Vector{Int}} # Cache neighbors to avoid recomputation end -### `NeighborLoader` constructor -""" - NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing) - -Creates a `NeighborLoader` to sample neighbors from the provided `graph` for the training. - The loader supports batching and multi-layer neighbor sampling. - -# Arguments: -- `graph::GNNGraph`: The input graph with node features. -- `num_neighbors::Vector{Int}`: Number of neighbors to sample per node, per layer. -- `input_nodes::Vector{Int}`: Set of starting nodes for sampling. -- `num_layers::Int`: Number of layers to expand neighborhoods for sampling. -- `batch_size::Union{Int, Nothing}`: Optional batch size. If `nothing`, it defaults to the length of `input_nodes`. - -# Returns: -A `NeighborLoader` object. -""" function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int}, input_nodes::Vector{Int}, num_layers::Int, batch_size::Union{Int, Nothing}=nothing) return NeighborLoader(graph, num_neighbors, input_nodes, num_layers, batch_size === nothing ? 
length(input_nodes) : batch_size, Dict{Int, Vector{Int}}()) end -""" - get_neighbors(loader::NeighborLoader, node::Int) - -Returns the neighbors of a given `node` in the graph from the `NeighborLoader`. - It first checks if the neighbors are cached; if not, it retrieves the neighbors from the graph and caches them for future use. - -# Arguments: -- `loader::NeighborLoader`: The `NeighborLoader` instance. -- `node::Int`: The node whose neighbors you want to sample. - -# Returns: -A vector of neighbor node indices. -""" # Function to get cached neighbors or compute them function get_neighbors(loader::NeighborLoader, node::Int) if haskey(loader.neighbors_cache, node) @@ -95,22 +65,6 @@ function sample_nbrs(loader::NeighborLoader, node::Int, layer::Int) end end -""" - Base.iterate(loader::NeighborLoader, state::Int=1) - -Implements the iterator protocol for `NeighborLoader`, allowing mini-batch processing for neighbor sampling in GNNs. - Each call to `iterate` returns a mini-batch subgraph with sampled neighbors for a batch of input nodes, - expanding their neighborhoods for a specified number of layers. - -# Arguments: -- `loader::NeighborLoader`: The `NeighborLoader` instance to sample neighbors from. -- `state::Int`: The current position in the input nodes for batching. Defaults to 1. - -# Returns: -A tuple `(mini_batch_gnn, next_state)` where: -- `mini_batch_gnn::GNNGraph`: The subgraph induced by the sampled nodes and their neighbors for the current mini-batch. -- `next_state::Int`: The next state (index) for iterating through the input nodes. If the input nodes are exhausted, returns `nothing`. -""" # Iterator protocol for NeighborLoader with lazy batch loading function Base.iterate(loader::NeighborLoader, state=1) if state > length(loader.input_nodes) From 3ed22bf68b04c93b19561566afe1fa0a06d12057 Mon Sep 17 00:00:00 2001 From: Agata Skorupka Date: Sun, 27 Oct 2024 19:31:17 +0100 Subject: [PATCH 26/27] chore: add ref to the paper --- GraphNeuralNetworks/src/samplers.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index fa165334e..7de01dbc4 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -2,7 +2,9 @@ struct NeighborLoader A data structure for sampling neighbors from a graph for training Graph Neural Networks (GNNs). -It supports multi-layer sampling of neighbors for a batch of input nodes, useful for mini-batch training. +It supports multi-layer sampling of neighbors for a batch of input nodes, useful for mini-batch training +originally introduced in "Inductive Representation Learning on Large Graphs" paper. +[see https://arxiv.org/abs/1706.02216] # Fields - `graph::GNNGraph`: The input graph containing nodes and edges, along with node features (from GraphNeuralNetworks.jl). 
From e4dc97700948b594226189ee3ba8b3f0ed7db5c5 Mon Sep 17 00:00:00 2001 From: Agata Skorupka <45850123+askorupka@users.noreply.github.com> Date: Tue, 29 Oct 2024 13:08:38 +0100 Subject: [PATCH 27/27] Update GraphNeuralNetworks/src/samplers.jl Co-authored-by: Carlo Lucibello --- GraphNeuralNetworks/src/samplers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GraphNeuralNetworks/src/samplers.jl b/GraphNeuralNetworks/src/samplers.jl index 7de01dbc4..e7dd4b5ea 100644 --- a/GraphNeuralNetworks/src/samplers.jl +++ b/GraphNeuralNetworks/src/samplers.jl @@ -7,7 +7,7 @@ originally introduced in "Inductive Representation Learning on Large Graphs" pap [see https://arxiv.org/abs/1706.02216] # Fields -- `graph::GNNGraph`: The input graph containing nodes and edges, along with node features (from GraphNeuralNetworks.jl). +- `graph::GNNGraph`: The input graph. - `num_neighbors::Vector{Int}`: A vector specifying the number of neighbors to sample per node at each GNN layer. - `input_nodes::Vector{Int}`: A vector containing the starting nodes for neighbor sampling. - `num_layers::Int`: The number of layers for neighborhood expansion (how far to sample neighbors).
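
To tie the pieces of this series together, here is a minimal usage sketch of `NeighborLoader` as it stands after the final patch: the keyword constructor (`num_neighbors` per layer, `input_nodes`, `num_layers`, optional `batch_size`) and the iterator protocol that yields one induced `GNNGraph` mini-batch per step. The toy graph, feature sizes, and sampling parameters below are illustrative assumptions rather than values taken from the patches, and the sketch assumes `NeighborLoader` is reachable from `GraphNeuralNetworks` (qualify the name if it is not exported).

```julia
using GraphNeuralNetworks

# A small directed ring graph with 10 nodes and 3 features per node (made-up data).
s = collect(1:10)
t = vcat(collect(2:10), 1)
g = GNNGraph(s, t, ndata = rand(Float32, 3, 10))

# Sample up to 2 in-neighbors per node at layer 1 and up to 1 at layer 2,
# expanding from seed nodes 1:6 in mini-batches of 2 seeds.
loader = NeighborLoader(g;
                        num_neighbors = [2, 1],
                        input_nodes   = collect(1:6),
                        num_layers    = 2,
                        batch_size    = 2)

# Each iteration returns a `GNNGraph` induced on the seed nodes plus their sampled
# multi-hop neighborhood; node features travel with the subgraph, so a model could
# be applied directly to `mini_batch` and `mini_batch.ndata.x`.
for (i, mini_batch) in enumerate(loader)
    println("batch $i: ", mini_batch.num_nodes, " nodes, ", mini_batch.num_edges, " edges")
    # e.g. ŷ = model(mini_batch, mini_batch.ndata.x)
end
```

The per-layer fanout in `num_neighbors` mirrors the layer-wise sampling schedule of the GraphSAGE paper cited in patch 26: the first entry bounds the direct neighbors sampled around each seed node, the second bounds the neighbors sampled around those, and so on.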
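
The mini-batches above are produced by `Graphs.induced_subgraph(::GNNGraph, ::Vector{Int})`, the GNNGraphs helper that the later patches in this series adjust (building the edge lookup list from `edge_index` and removing the debug prints). A hedged sketch of calling it directly, again on made-up data:

```julia
using GraphNeuralNetworks, Graphs

# Four nodes, four directed edges, two features per node (illustrative values).
s = [1, 1, 2, 3]
t = [2, 3, 4, 4]
g = GNNGraph(s, t, ndata = rand(Float32, 2, 4))

# Keep nodes 1 to 3: node features follow the selected nodes, and an edge survives
# only if both of its endpoints are inside the node set (here 1→2 and 1→3).
sg = Graphs.induced_subgraph(g, [1, 2, 3])
println(sg.num_nodes, " nodes, ", sg.num_edges, " edges")
```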