GCP-DSClient (Google Cloud Platform Data Science Client) supports interactive data science by coordinating data science libraries (pandas, IPython) with Google Cloud infrastructure.
This client supports the following Google Cloud Platform services:
$ pip install -U git+https://github.com/orfeon/gcp-python-dsclient
$ pip3 install -U git+https://github.com/orfeon/gcp-python-dsclient
import dsclient
# When run on a GCE instance that has GCP access permissions,
# you only need to set the project name.
client = dsclient.Client("your project name")
# In other cases (e.g. running on a local PC),
# you need to set the project name and the path to an access key file.
client = dsclient.Client("your project name", "./keyfile.json")
Google BigQuery (BigQuery API docs)
# Query and read data as pandas.DataFrame
query_string = """
SELECT date, year
FROM aaa
WHERE year = 2016
"""
df = client.query(query_string) # Use lquery() for large data.
# Upload (append) a pandas.DataFrame to an existing table on BigQuery.
client.load(df, "your_dataset.your_table")
# Overwrite the existing table.
client.load(df, "your_dataset.your_table", append=False)
# Insert the query result into a table. (Overwrites the table if it exists.)
client.query(query_string, table_name="your_dataset.your_table_2")
Google Cloud Storage (Storage API docs)
official Google Cloud Storage documentation
import pandas as pd
# Write local pandas.DataFrame to Cloud Storage.
df1 = pd.DataFrame(...somedata...)
client.write_csv(df1, "gs://your_bucket/your_file1.csv")
# Read pandas.DataFrame from csv file on Cloud Storage.
df2 = client.read_csv("gs://your_bucket/your_file2.csv")
# Write blob data (e.g. an ML model) to Cloud Storage.
reg = LinearRegressor()
reg.fit(df1[["attr1","attr2",...]], df1["target"])
client.write_blob(reg, "gs://your_bucket/your_file.model")
# Read blob data from Cloud Storage.
reg = client.read_blob("gs://your_bucket/your_file.model")
prd = reg.predict(df2[["attr1","attr2",...]])
Google Cloud Datastore (Cloud Datastore API docs)
# Query and read data as a pandas.DataFrame using GQL.
df = client.gql("SELECT * FROM SomeKind WHERE date = '20170101'")
# Create a snapshot of the current instance on GCE.
client.create_current_snapshot("snapshot_name")
# Deploy an ipcluster from the snapshot.
client.deploy_ipcluster(profile="mycluster", snapshot="snapshot_name",
itype="standard", core=4, num=4)
# Execute some tasks on the ipcluster.
import ipyparallel
rc = ipyparallel.Client(profile='mycluster')
lview = rc.load_balanced_view()
results = lview.map_async(some_func, task_list)
# Add ipengines to an existing ipcluster.
client.add_ipengine(profile="mycluster", snapshot="snapshot_name",
itype="small", num=4)
# Delete the ipcluster.
client.delete_ipcluster(profile="mycluster")
# Stop the current instance when all tasks are finished.
client.stop_current_instance()
Apache 2.0 - See LICENSE for more information.