SQL and BigQuery 基本用法

942 阅读2分钟

获取表及数据库

# 引入库
from google.cloud import bigquery
# 生成一个 Client
client = bigquery.Client()
dataset_ref = client.dataset("hacker_news", project="bigquery-public-data")
# 获取数据库数据
dataset = client.get_dataset(dataset_ref)
# 获取 tables
tables = list(client.list_tables(dataset)
for table in tables:
    print(table.table_id)
 # 获取 table 引用
 table_ref = dataset_ref.table("full")
 # 获取 table
 table = client.get_table(table_ref)
 # 展示 table schema
 table.schema
 # 获取table 的头5行转成 DataFrame
 client.list_rows(table, max_results=5).to_dataframe()
 # 也可以指定列
 client.list_rows(table, selected_fields=table.schema[:1], max_results=5).to_dataframe()
 # 

检索数据

query = """
        SELECT city, country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """
query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """
       
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        """

# 创建 QueryJobConfig 对象以估计查询的大小,而无需运行它
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))

# 仅运行1MB以下的查询
ONE_MB = 1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)

safe_query_job = client.query(query, job_config=safe_config)

safe_query_job.to_dataframe()

Group By, Having & Count 分组 计数

# Query to select comments that received more than 10 replies
query_popular = """
                SELECT parent, COUNT(id)
                FROM `bigquery-public-data.hacker_news.comments`
                GROUP BY parent
                HAVING COUNT(id) > 10
                """

Orderrrr By 排序

# 按降序排列的一周内每天事故发生的次数 
query = """
        SELECT COUNT(consecutive_number) AS num_accidents, 
               EXTRACT(DAYOFWEEK FROM timestamp_of_crash) AS day_of_week
        FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2015`
        GROUP BY day_of_week
        ORDER BY num_accidents DESC
        """

As & With

query_with_CTE = """ 
                 WITH time AS 
                 (
                     SELECT DATE(block_timestamp) AS trans_date
                     FROM `bigquery-public-data.crypto_bitcoin.transactions`
                 )
                 SELECT COUNT(1) AS transactions,
                        trans_date
                 FROM time
                 GROUP BY trans_date
                 ORDER BY trans_date
                 """
speeds_query = """
               WITH RelevantRides AS
               (
                   SELECT trip_seconds,trip_miles,
                   EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day
                   FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                   WHERE trip_seconds > 0 and trip_miles > 0 
                   and trip_start_timestamp > '2017-01-01' AND trip_start_timestamp < '2017-07-01'
               )
               SELECT hour_of_day,COUNT(1) as num_trips,
               3600 * SUM(trip_miles) / SUM(trip_seconds) as avg_mph
               FROM RelevantRides
               GROUP BY hour_of_day
               ORDER BY num_trips DESC
               """

Joining Data

bigquery_experts_query = """
                         SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
                         FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
                         INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                             ON q.id = a.parent_Id
                         WHERE q.tags LIKE '%bigquery%'
                         GROUP BY a.owner_user_id
                         """

JOINs and UNIONs

join_query = """
             WITH c AS
             (
             SELECT parent, COUNT(*) as num_comments
             FROM `bigquery-public-data.hacker_news.comments` 
             GROUP BY parent
             )
             SELECT s.id as story_id, s.by, s.title, c.num_comments
             FROM `bigquery-public-data.hacker_news.stories` AS s
             LEFT JOIN c
             ON s.id = c.parent
             WHERE EXTRACT(DATE FROM s.time_ts) = '2012-01-01'
             ORDER BY c.num_comments DESC
             """
union_query = """
              SELECT c.by
              FROM `bigquery-public-data.hacker_news.comments` AS c
              WHERE EXTRACT(DATE FROM c.time_ts) = '2014-01-01'
              UNION DISTINCT
              SELECT s.by
              FROM `bigquery-public-data.hacker_news.stories` AS s
              WHERE EXTRACT(DATE FROM s.time_ts) = '2014-01-01'
              """

Analytic Functions

WITH trips_by_day AS
                  (
                  SELECT DATE(start_date) AS trip_date,
                      COUNT(*) as num_trips
                  FROM `bigquery-public-data.san_francisco.bikeshare_trips`
                  WHERE EXTRACT(YEAR FROM start_date) = 2015
                  GROUP BY trip_date
                  )
                  SELECT *,
                      SUM(num_trips) 
                          OVER (
                               ORDER BY trip_date
                               ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
                               ) AS cumulative_trips
                      FROM trips_by_day
  • MIN()  (or MAX() ) - Returns the minimum (or maximum) of input values
  • AVG()  (or SUM() ) - Returns the average (or sum) of input values
  • COUNT()  - Returns the number of rows in the input
  • FIRST_VALUE()  (or LAST_VALUE() ) - Returns the first (or last) value in the input
  • LEAD()  (and LAG() ) - Returns the value on a subsequent (or preceding) row
  • ROW_NUMBER()  - Returns the order in which rows appear in the input (starting with 1)
  • RANK()  - All rows with the same value in the ordering column receive the same rank value, where the next row receives a rank value which increments by the number of rows with the previous rank value.
start_end_query = """
                  SELECT bike_number,
                      TIME(start_date) AS trip_time,
                      FIRST_VALUE(start_station_id)
                          OVER (
                               PARTITION BY bike_number
                               ORDER BY start_date
                               ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                               ) AS first_station_id,
                      LAST_VALUE(end_station_id)
                          OVER (
                               PARTITION BY bike_number
                               ORDER BY start_date
                               ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                               ) AS last_station_id,
                      start_station_id,
                      end_station_id
                  FROM `bigquery-public-data.san_francisco.bikeshare_trips`
                  WHERE DATE(start_date) = '2015-10-25' 
                  """

Nested and Repeated Data

query = """
        SELECT hits.page.pagePath as path,
            COUNT(hits.page.pagePath) as counts
        FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`, 
            UNNEST(hits) as hits
        WHERE hits.type="PAGE" and hits.hitNumber=1
        GROUP BY path
        ORDER BY counts DESC
        """

Writing Efficient Queries

  1. Only select the columns you want.
  2. Read less data.
  3. Avoid N:N JOINs.