获取表及数据库
from google.cloud import bigquery
client = bigquery.Client()
dataset_ref = client.dataset("hacker_news", project="bigquery-public-data")
dataset = client.get_dataset(dataset_ref)
tables = list(client.list_tables(dataset)
for table in tables:
print(table.table_id)
table_ref = dataset_ref.table("full")
table = client.get_table(table_ref)
table.schema
client.list_rows(table, max_results=5).to_dataframe()
client.list_rows(table, selected_fields=table.schema[:1], max_results=5).to_dataframe()
检索数据
query = """
SELECT city, country
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE country = 'US'
"""
query = """
SELECT *
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE country = 'US'
"""
query = """
SELECT score, title
FROM `bigquery-public-data.hacker_news.full`
WHERE type = "job"
"""
dry_run_config = bigquery.QueryJobConfig(dry_run=True)
dry_run_query_job = client.query(query, job_config=dry_run_config)
print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))
ONE_MB = 1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)
safe_query_job = client.query(query, job_config=safe_config)
safe_query_job.to_dataframe()
Group By, Having & Count 分组 计数
query_popular = """
SELECT parent, COUNT(id)
FROM `bigquery-public-data.hacker_news.comments`
GROUP BY parent
HAVING COUNT(id) > 10
"""
Orderrrr By 排序
query = """
SELECT COUNT(consecutive_number) AS num_accidents,
EXTRACT(DAYOFWEEK FROM timestamp_of_crash) AS day_of_week
FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
GROUP BY day_of_week
ORDER BY num_accidents DESC
"""
As & With
query_with_CTE = """
WITH time AS
(
SELECT DATE(block_timestamp) AS trans_date
FROM `bigquery-public-data.crypto_bitcoin.transactions`
)
SELECT COUNT(1) AS transactions,
trans_date
FROM time
GROUP BY trans_date
ORDER BY trans_date
"""
speeds_query = """
WITH RelevantRides AS
(
SELECT trip_seconds,trip_miles,
EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE trip_seconds > 0 and trip_miles > 0
and trip_start_timestamp > '2017-01-01' AND trip_start_timestamp < '2017-07-01'
)
SELECT hour_of_day,COUNT(1) as num_trips,
3600 * SUM(trip_miles) / SUM(trip_seconds) as avg_mph
FROM RelevantRides
GROUP BY hour_of_day
ORDER BY num_trips DESC
"""
Joining Data
bigquery_experts_query = """
SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
ON q.id = a.parent_Id
WHERE q.tags LIKE '%bigquery%'
GROUP BY a.owner_user_id
"""
JOINs and UNIONs
join_query = """
WITH c AS
(
SELECT parent, COUNT(*) as num_comments
FROM `bigquery-public-data.hacker_news.comments`
GROUP BY parent
)
SELECT s.id as story_id, s.by, s.title, c.num_comments
FROM `bigquery-public-data.hacker_news.stories` AS s
LEFT JOIN c
ON s.id = c.parent
WHERE EXTRACT(DATE FROM s.time_ts) = '2012-01-01'
ORDER BY c.num_comments DESC
"""
union_query = """
SELECT c.by
FROM `bigquery-public-data.hacker_news.comments` AS c
WHERE EXTRACT(DATE FROM c.time_ts) = '2014-01-01'
UNION DISTINCT
SELECT s.by
FROM `bigquery-public-data.hacker_news.stories` AS s
WHERE EXTRACT(DATE FROM s.time_ts) = '2014-01-01'
"""
Analytic Functions
WITH trips_by_day AS
(
SELECT DATE(start_date) AS trip_date,
COUNT(*) as num_trips
FROM `bigquery-public-data.san_francisco.bikeshare_trips`
WHERE EXTRACT(YEAR FROM start_date) = 2015
GROUP BY trip_date
)
SELECT *,
SUM(num_trips)
OVER (
ORDER BY trip_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS cumulative_trips
FROM trips_by_day
- MIN() (or MAX() ) - Returns the minimum (or maximum) of input values
- AVG() (or SUM() ) - Returns the average (or sum) of input values
- COUNT() - Returns the number of rows in the input
- FIRST_VALUE() (or LAST_VALUE() ) - Returns the first (or last) value in the input
- LEAD() (and LAG() ) - Returns the value on a subsequent (or preceding) row
- ROW_NUMBER() - Returns the order in which rows appear in the input (starting with
1)
- RANK() - All rows with the same value in the ordering column receive the same rank value, where the next row receives a rank value which increments by the number of rows with the previous rank value.
start_end_query = """
SELECT bike_number,
TIME(start_date) AS trip_time,
FIRST_VALUE(start_station_id)
OVER (
PARTITION BY bike_number
ORDER BY start_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS first_station_id,
LAST_VALUE(end_station_id)
OVER (
PARTITION BY bike_number
ORDER BY start_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS last_station_id,
start_station_id,
end_station_id
FROM `bigquery-public-data.san_francisco.bikeshare_trips`
WHERE DATE(start_date) = '2015-10-25'
"""
Nested and Repeated Data
query = """
SELECT hits.page.pagePath as path,
COUNT(hits.page.pagePath) as counts
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`,
UNNEST(hits) as hits
WHERE hits.type="PAGE" and hits.hitNumber=1
GROUP BY path
ORDER BY counts DESC
"""
Writing Efficient Queries
- Only select the columns you want.
- Read less data.
- Avoid N:N JOINs.