将IMDB导入PostgreSQL数据库
step1
下载IMDB数据集,建议下载2013版本,数据量较小,方便测试。
step2
建表
CREATE TABLE aka_name (
id integer NOT NULL PRIMARY KEY,
person_id integer NOT NULL,
name character varying,
imdb_index character varying(3),
name_pcode_cf character varying(11),
name_pcode_nf character varying(11),
surname_pcode character varying(11),
md5sum character varying(65)
);
CREATE TABLE aka_title (
id integer NOT NULL PRIMARY KEY,
movie_id integer NOT NULL,
title character varying,
imdb_index character varying(4),
kind_id integer NOT NULL,
production_year integer,
phonetic_code character varying(5),
episode_of_id integer,
season_nr integer,
episode_nr integer,
note character varying(72),
md5sum character varying(32)
);
CREATE TABLE cast_info (
id integer NOT NULL PRIMARY KEY,
person_id integer NOT NULL,
movie_id integer NOT NULL,
person_role_id integer,
note character varying,
nr_order integer,
role_id integer NOT NULL
);
CREATE TABLE char_name (
id integer NOT NULL PRIMARY KEY,
name character varying NOT NULL,
imdb_index character varying(2),
imdb_id integer,
name_pcode_nf character varying(5),
surname_pcode character varying(5),
md5sum character varying(32)
);
CREATE TABLE comp_cast_type (
id integer NOT NULL PRIMARY KEY,
kind character varying(32) NOT NULL
);
CREATE TABLE company_name (
id integer NOT NULL PRIMARY KEY,
name character varying NOT NULL,
country_code character varying(6),
imdb_id integer,
name_pcode_nf character varying(5),
name_pcode_sf character varying(5),
md5sum character varying(32)
);
CREATE TABLE company_type (
id integer NOT NULL PRIMARY KEY,
kind character varying(32)
);
CREATE TABLE complete_cast (
id integer NOT NULL PRIMARY KEY,
movie_id integer,
subject_id integer NOT NULL,
status_id integer NOT NULL
);
CREATE TABLE info_type (
id integer NOT NULL PRIMARY KEY,
info character varying(32) NOT NULL
);
CREATE TABLE keyword (
id integer NOT NULL PRIMARY KEY,
keyword character varying NOT NULL,
phonetic_code character varying(5)
);
CREATE TABLE kind_type (
id integer NOT NULL PRIMARY KEY,
kind character varying(15)
);
CREATE TABLE link_type (
id integer NOT NULL PRIMARY KEY,
link character varying(32) NOT NULL
);
CREATE TABLE movie_companies (
id integer NOT NULL PRIMARY KEY,
movie_id integer NOT NULL,
company_id integer NOT NULL,
company_type_id integer NOT NULL,
note character varying
);
CREATE TABLE movie_info_idx (
id integer NOT NULL PRIMARY KEY,
movie_id integer NOT NULL,
info_type_id integer NOT NULL,
info character varying NOT NULL,
note character varying(1)
);
CREATE TABLE movie_keyword (
id integer NOT NULL PRIMARY KEY,
movie_id integer NOT NULL,
keyword_id integer NOT NULL
);
CREATE TABLE movie_link (
id integer NOT NULL PRIMARY KEY,
movie_id integer NOT NULL,
linked_movie_id integer NOT NULL,
link_type_id integer NOT NULL
);
CREATE TABLE name (
id integer NOT NULL PRIMARY KEY,
name character varying NOT NULL,
imdb_index character varying(9),
imdb_id integer,
gender character varying(1),
name_pcode_cf character varying(5),
name_pcode_nf character varying(5),
surname_pcode character varying(5),
md5sum character varying(32)
);
CREATE TABLE role_type (
id integer NOT NULL PRIMARY KEY,
role character varying(32) NOT NULL
);
CREATE TABLE title (
id integer NOT NULL PRIMARY KEY,
title character varying NOT NULL,
imdb_index character varying(5),
kind_id integer NOT NULL,
production_year integer,
imdb_id integer,
phonetic_code character varying(5),
episode_of_id integer,
season_nr integer,
episode_nr integer,
series_years character varying(49),
md5sum character varying(32)
);
CREATE TABLE movie_info (
id integer NOT NULL PRIMARY KEY,
movie_id integer NOT NULL,
info_type_id integer NOT NULL,
info character varying NOT NULL,
note character varying
);
CREATE TABLE person_info (
id integer NOT NULL PRIMARY KEY,
person_id integer NOT NULL,
info_type_id integer NOT NULL,
info character varying NOT NULL,
note character varying
);
step3
导入数据
copy aka_name from 'D:/imdb/aka_name.csv' WITH(format csv,DELIMITER ',',NULL '',quote '"',escape '\');
step4
添加约束关系
-- aka_name
create index aka_name_idx_md5 on aka_name(md5sum);
create index aka_name_idx_name on aka_name(name);
create index aka_name_idx_pcode on aka_name(surname_pcode);
-- aka_title
create index aka_title_idx_epof on aka_title(episode_of_id);
create index aka_title_idx_kindid on aka_title(kind_id);
create index aka_title_idx_md5 on aka_title(md5sum);
create index aka_title_idx_movieid on aka_title(movie_id);
create index aka_title_idx_pcode on aka_title(phonetic_code);
create index aka_title_idx_title on aka_title(title);
create index aka_title_idx_year on aka_title(production_year);
-- cast_info
create index cast_info_idx_cid on cast_info (person_role_id);
create index cast_info_idx_mid on cast_info (movie_id);
create index cast_info_idx_pid on cast_info (person_id);
create index cast_info_idx_rid on cast_info (role_id);
-- char_name
create index char_name_idx_imdb_id on char_name(imdb_id);
create index char_name_idx_md5 on char_name(md5sum);
create index char_name_idx_name on char_name(name);
create index char_name_idx_pcode on char_name(surname_pcode);
create index char_name_idx_pcodenf on char_name(name_pcode_nf);
-- comp_cast_type
create index comp_cast_type_kind on comp_cast_type (kind);
-- company_name
create index company_name_idx_ccode on company_name (country_code);
create index company_name_idx_imdb_id on company_name (imdb_id);
create index company_name_idx_md5 on company_name (md5sum);
create index company_name_idx_name on company_name (name);
create index company_name_idx_pcodenf on company_name (name_pcode_nf);
create index company_name_idx_pcodesf on company_name (name_pcode_sf);
-- company_type
create index company_type_kind on company_type (kind);
-- complete_cast
create index complete_cast_idx_mid on complete_cast (movie_id);
create index complete_cast_idx_sid on complete_cast (subject_id);
-- info_type
create index complete_cast_idx_sid on complete_cast (subject_id);
-- keyword
create index keyword_idx_keyword on keyword (keyword);
create index keyword_idx_pcode on keyword (phonetic_code);
-- kind_type
create index kind_type_kind on kind_type (kind);
-- link_type
create index link_type_link on link_type (link);
-- movie_companies
create index movie_companies_idx_cid on movie_companies (company_id);
create index movie_companies_idx_ctypeid on movie_companies (company_type_id);
create index movie_companies_idx_mid on movie_companies (movie_id);
-- movie_info
create index movie_info_idx_infotypeid on movie_info (info_type_id);
create index movie_info_idx_mid on movie_info (movie_id);
-- movie_keyword
create index movie_keyword_idx_keywordid on movie_keyword (keyword_id);
create index movie_keyword_idx_mid on movie_keyword (movie_id);
-- movie_link
create index movie_link_idx_lmid on movie_link (linked_movie_id);
create index movie_link_idx_ltypeid on movie_link (link_type_id);
create index movie_link_idx_mid on movie_link (movie_id);
-- name
create index name_idx_gender on name (gender);
create index name_idx_imdb_id on name (imdb_id);
create index name_idx_md5 on name (md5sum);
create index name_idx_name on name (name);
create index name_idx_pcode on name (surname_pcode);
create index name_idx_pcodecf on name (name_pcode_cf);
create index name_idx_pcodenf on name (name_pcode_nf);
-- person_id
create index person_info_idx_itypeid on person_info (info_type_id);
create index person_info_idx_pid on person_info (person_id);
-- role_type
create index role_type_role on role_type (role);
-- title
create index title_idx_episode_nr on title (episode_nr);
create index title_idx_epof on title (episode_of_id);
create index title_idx_imdb_id on title (imdb_id);
create index title_idx_kindid on title (kind_id);
create index title_idx_md5 on title (md5sum);
create index title_idx_pcode on title (phonetic_code);
create index title_idx_season_nr on title (season_nr);
create index title_idx_title on title (title);
create index title_idx_year on title (production_year);