Hive

Below is a short tutorial that demonstrates Hive. It uses the Google NGrams dataset, which is available in HDFS at /var/ngrams.

# Open the interactive Hive console (run this from your shell prompt)
hive

# Create an external table over the tab-separated Google NGrams files in /var/ngrams.
# Replace the placeholder your-uniqname with your own uniqname (letters, digits and
# underscores only) here and in every statement that follows.
# Note: string literals must use plain ASCII single quotes ('); typographic "curly"
# quotes copied from a web page are not parsed as string delimiters by Hive.
CREATE EXTERNAL TABLE ngrams_your-uniqname(ngram STRING, year INT, count BIGINT, volumes BIGINT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/var/ngrams';

# Show the table's schema (its column names and their types)
DESCRIBE ngrams_your-uniqname;

# Count every row in the table (expected result: 1430731493)
SELECT COUNT(*)
FROM ngrams_your-uniqname;

# For each year, count the ngrams recorded with volumes = 1,
# i.e. those that appeared in only a single volume that year
SELECT
    year,
    COUNT(ngram)
FROM ngrams_your-uniqname
WHERE volumes = 1
GROUP BY year;

# Optional cleanup: remove your ngrams table.
# The table was created as EXTERNAL, so this drops the table definition only;
# the underlying files in /var/ngrams are left in place.
DROP TABLE ngrams_your-uniqname;

# Leave the interactive Hive console (EXIT; works as well)
QUIT;

Next Post