#title Pig
[[TableOfContents]]

==== pig는 언제 쓰는게 좋을까? ====
개인적인 의견..
 * ETL 할 때
 * 데이터 흐름 언어이므로 사용자들이 데이터 분석 할 때 --> 어려울까? ㅇㅇ 어렵겠다.
 * 음.. 암턴.. 좋은거 같음..

==== 샘플 데이터 얻어서 HDFS에 넣기 ====
HADOOP 완벽가이드의 샘플 데이터 다운로드 후 HDFS에 올린다.
{{{
# Download the example bundle for the Hadoop Definitive Guide book.
wget http://hanb.co.kr/exam/1746/htdg-examples-0.1.1.tar.gz
# Extract the gzip-compressed archive: x = extract, z = gunzip, f = archive file.
# (tar without an operation flag exits with an error and extracts nothing.)
tar xzf htdg-examples-0.1.1.tar.gz
# Rename the extracted directory to a shorter name and enter it.
mv htdg-examples-0.1.1 samples
cd samples
# Upload the local "input" directory to HDFS as "input".
hadoop fs -put input input
}}}

==== pig 스크립트 실행과 다중 쿼리 실행과 store ====
vi store_test.pig
{{{
-- store_test.pig
-- Splits the sample records by year and stores each subset in its own directory.

-- Remove any output left over from a previous run.
rmf output/1949
rmf output/1950

-- A is loaded once and feeds both filters below.
A = LOAD 'input/ncdc/micro-tab/sample.txt'
    AS (year:chararray, temperature:int, quality:int);

-- Records for 1949.
recs_1949 = FILTER A BY year == '1949';
STORE recs_1949 INTO 'output/1949';

-- Records for 1950.
recs_1950 = FILTER A BY year == '1950';
STORE recs_1950 INTO 'output/1950';

-- Print the stored results.
cat output/1949/part-m-00000
cat output/1950/part-m-00000
}}}
배치(스크립트) 모드로 실행하면 다중 쿼리 최적화가 적용되어, 두 개의 FILTER/STORE를 처리할 때 A를 두 번 읽지 않고 한 번만 읽는다.

pig 스크립트를 실행한다.
{{{
pig store_test.pig
}}}

또는 그런트(Grunt) 셸에서 exec나 run 유틸리티를 사용한다. exec는 스크립트 전체를 배치 모드로 실행하고, run은 스크립트의 문장(;으로 끝남)을 셸에 직접 입력한 것처럼 하나씩 실행하는 대화형 모드다. 둘의 차이는 다중 쿼리 최적화 여부다. exec는 A를 한 번만 읽고, run은 A를 두 번 읽는다.

{{{
-- exec: runs the whole script in batch mode
grunt> exec store_test.pig

또는

-- run: executes the script statement by statement, as if typed into the shell
grunt> run store_test.pig
}}}

==== 스키마를 가지지 않는 릴레이션의 위치 참조 ====
{{{
-- $0 is a positional reference to the first field of relation A,
-- so it works even when A was loaded without a schema.
first_field = FOREACH A GENERATE $0;
DUMP first_field;
}}}

==== sample data input ====
{{{
# Fetch and unpack the sample data archive.
wget http://databaser.net/moniwiki/pds/Hive_ec_98_88_ec_a0_9c_ed_8c_8c_ec_9d_bc/data.zip
unzip data.zip
# Create the target HDFS directory, then upload each CSV file into it.
hadoop fs -mkdir scott
for csv in dept.csv emp.csv salgrade.csv; do
    hadoop fs -put "$csv" scott
done
# Start the Pig shell; the Pig Latin statements that follow are entered there.
pig

-- Query 1: total salary per department, left-joined with the department table.
emp = load 'scott/emp.csv' using PigStorage(',')
      as (empno, ename, job, mgr, hiredate, sal, comm, deptno:int);
by_dept = group emp by deptno;
-- "group" holds the grouping key (deptno); SUM runs over the sal values of each group.
total = foreach by_dept generate group, SUM(emp.sal) as total_sal;
-- Disabled alternative kept from the original notes (note the field is spelled
-- "depno" there, which does not match the declared "deptno"):
--   total = foreach by_dept generate emp.depno, SUM(emp.sal) as total_sal;

dept = load 'scott/dept.csv' using PigStorage(',')
       as (dname, loc, deptno:int);
-- NOTE(review): confirm that the column order in dept.csv really is
-- (dname, loc, deptno); the join key and the $3 projection below depend on it.
joined = join total by group left, dept by deptno;
-- Positions in joined: $0 group key, $1 total_sal, then the dept fields $2, $3, $4
-- in declared order, so $3 is the field declared as loc.
view = foreach joined generate $0, $3, $1;
dump view;

-- Query 2: the three highest salaries among employees with sal >= 2000.
emp = load 'scott/emp.csv' using PigStorage(',')
      as (empno, ename, job, mgr, hiredate, sal, comm, deptno:int);
emp = foreach emp generate ename, sal;
high_paid = filter emp by sal >= 2000;
ranked = order high_paid by sal desc;
top3 = limit ranked 3;
dump top3;
}}}
==== 참고자료 ====
 * https://cwiki.apache.org/confluence/display/PIG/PigTutorial
 * http://julingks.tistory.com/143 --> 한글 매뉴얼 ver 0.7
 * [http://pig.apache.org/docs/r0.7.0/piglatin_ref2.html Pig Latin Reference Manual 2]
 * [http://chimera.labs.oreilly.com/books/1234000001811/index.html Programming Pig] [http://it-ebooks.info/go.php?id=1485-1382409881-efd995b8a53fe7b78df3e20973bbd739 다운로드]
 * [http://www.i-programmer.info/news/84/3628.html DataFu for Pig and Hadoop] 
 * https://github.com/linkedin/datafu