In Hive we can use the LOAD command to bulk load data into our tables, Load operations are currently pure copy/move operations that move datafiles into locations corresponding to Hive tables and they do not allow any transformation while loading data into tables. Base Syntax :
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
hive create table numbers (num int);
OK
Time taken: 8.602 seconds
[root@sandbox tmp]# head /tmp/file.csv
9790
9789
9788
9787
9786
9785
9784
9783
9782
9781
hive LOAD DATA LOCAL INPATH '/tmp/file.csv' INTO TABLE numbers;
Loading data to table testdb.numbers
Table testdb.numbers stats: [numFiles=1, totalSize=47844]
OK
Time taken: 2.751 seconds
hive select count(*) from numbers;
Query ID = root_20160822061422_0a7f42d2-2499-4bbf-897d-769a564ecd9e
Total jobs = 1
Launching Job 1 out of 1
Tez session was closed. Reopening...
Session re-established.
Status: Running (Executing on YARN cluster with App id application_1471842544192_0003)
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 02/02 [==========================] 100% ELAPSED TIME: 8.16 s
--------------------------------------------------------------------------------
OK
9791
Time taken: 22.577 seconds, Fetched: 1 row(s)
hive LOAD DATA LOCAL INPATH '/tmp/file.csv' OVERWRITE INTO TABLE numbers;
Loading data to table testdb.numbers
Moved: 'hdfs://sandbox.hortonworks.com:8020/apps/hive/warehouse/testdb.db/numbers/file.csv' to trash at: hdfs://sandbox.hortonworks.com:8020/user/root/.Trash/Current
Table testdb.numbers stats: [numFiles=1, totalSize=47844]
OK
Time taken: 7.024 seconds
-- Count the records
hive select count(*) from numbers;
Query ID = root_20160822062816_4669a599-f0b0-4144-ae40-b2cb8d3f8093
Total jobs = 1
Launching Job 1 out of 1
Status: Running (Executing on YARN cluster with App id application_1471842544192_0005)
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 02/02 [==========================] 100% ELAPSED TIME: 9.23 s
--------------------------------------------------------------------------------
OK
9791
Time taken: 14.23 seconds, Fetched: 1 row(s)
hive LOAD DATA LOCAL INPATH '/tmp/file.csv' INTO TABLE numbers;
Loading data to table testdb.numbers
Table testdb.numbers stats: [numFiles=2, totalSize=95688]
OK
Time taken: 4.184 seconds
hive select count(*) from numbers;
Query ID = root_20160822063128_3c7bce1b-7329-40e6-8ba3-f504b7f217ab
Total jobs = 1
Launching Job 1 out of 1
Status: Running (Executing on YARN cluster with App id application_1471842544192_0005)
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 02/02 [==========================] 100% ELAPSED TIME: 8.76 s
--------------------------------------------------------------------------------
OK
19582
Time taken: 10.512 seconds, Fetched: 1 row(s)
[root@sandbox tmp]# ls -la file*
-rw-r--r-- 1 root root 47844 2016-08-22 06:46 file1.csv
-rw-r--r-- 1 root root 47844 2016-08-22 06:46 file2.csv
-rw-r--r-- 1 root root 47844 2016-08-22 06:06 file.csv
hive LOAD DATA LOCAL INPATH '/tmp/file*' INTO TABLE numbers;
Loading data to table testdb.numbers
Table testdb.numbers stats: [numFiles=3, totalSize=143532]
OK
Time taken: 2.466 seconds
hive CREATE TABLE testdb.part_numbers (num INT) PARTITIONED BY (num_part INT);
OK
Time taken: 1.172 seconds
hive LOAD DATA LOCAL INPATH '/tmp/file.csv' OVERWRITE INTO TABLE part_numbers;
FAILED: SemanticException [Error 10062]: Need to specify partition columns because the destination table is partitioned
hive create table numbers_staging (num int);
OK
Time taken: 1.327 seconds
hive LOAD DATA LOCAL INPATH '/tmp/file.csv' OVERWRITE INTO TABLE testdb.numbers_staging;
Loading data to table testdb.numbers_staging
Table testdb.numbers_staging stats: [numFiles=1, totalSize=22]
OK
Time taken: 9.988 seconds
hive select * from testdb.numbers_staging;
OK
1
1
2
2
2
2
33
33
33
NULL
Time taken: 2.153 seconds, Fetched: 10 row(s)
hive insert into testdb.part_numbers
PARTITION (num_part=1)
select num from testdb.numbers_staging where num=1;
Query ID = root_20160822072128_4b46b90c-10ad-4cbe-991a-80723dec102d
Total jobs = 1
Launching Job 1 out of 1
Tez session was closed. Reopening...
Session re-established.
Status: Running (Executing on YARN cluster with App id application_1471842544192_0008)
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 01/01 [==========================] 100% ELAPSED TIME: 6.72 s
--------------------------------------------------------------------------------
Loading data to table testdb.part_numbers partition (num_part=1)
Partition testdb.part_numbers{num_part=1} stats: [numFiles=1, numRows=2, totalSize=4, rawDataSize=2]
OK
Time taken: 23.151 seconds
-- verify table content
hive select * from testdb.part_numbers;
OK
1 1
1 1
Time taken: 0.813 seconds, Fetched: 2 row(s)
[root@sandbox tmp]# cat file.csv
33
33
33
hive LOAD DATA LOCAL INPATH '/tmp/file.csv' OVERWRITE INTO TABLE testdb.part_numbers PARTITION (num_part=3);
Loading data to table testdb.part_numbers partition (num_part=3)
Partition testdb.part_numbers{num_part=3} stats: [numFiles=1, numRows=0, totalSize=10, rawDataSize=0]
OK
Time taken: 5.388 seconds
hive select * from testdb.part_numbers;
OK
1 1
1 1
33 3
33 3
33 3
NULL 3
Time taken: 0.955 seconds, Fetched: 6 row(s)