Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Migration of unmigrated content due to installation of a new plugin
Wiki Markup
To make it easier for people to understand the "Query Phrase Popularity" example in the tutorial, I added comment lines to script1-local.pig to show samples from the intermediate relations.  Should this be checked in?

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

-- Query Phrase Popularity (local mode)

-- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day.

-- Register the tutorial JAR file so that the included UDFs (org.apache.pig.tutorial.*) can be called in the script.
-- NOTE(review): the path is hard-coded to /tmp/tutorial.jar; the JAR must be present at that location (or this line adjusted) before running.
REGISTER /tmp/tutorial.jar;

-- Use the PigStorage function to load the tab-delimited excite log file into the "raw" relation.
-- Input: (user, time, query)
raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query);
-- Sample records from raw:
--BED75271605EBD0C	970916001954	Yahoo chat
--BED75271605EBD0C	970916003523	Yahoo chat
--2B73EFE0F9FC9E0B      970916195507    http://educationalproducts.com
--CD37F95FC0886E1D      970916084059    www:http:/www.tti.com

-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
-- Sample records from clean1 (the URL queries shown in the raw sample are no longer present):
--BED75271605EBD0C	970916001954	Yahoo chat
--BED75271605EBD0C	970916003523	Yahoo chat


-- Call the ToLower UDF to change the query field to lowercase; user and time pass through unchanged.
clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
-- Sample records from clean2:
--BED75271605EBD0C	970916001954	yahoo chat
--BED75271605EBD0C	970916003523	yahoo chat

-- Because the log file only contains queries for a single day, we are only interested in the hour.
-- The excite query log timestamp format is YYMMDDHHMMSS.
-- Call the ExtractHour UDF to extract the hour (HH) from the time field; the time field is replaced by hour.
houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
-- Sample records from houred (user, hour, query):
--BED75271605EBD0C	00	yahoo chat
--BED75271605EBD0C	00	yahoo chat
--BED75271605EBD0C	00	yahoo chat
--BED75271605EBD0C	00	yahoo chat
--BED75271605EBD0C	00	yahoo chat
--BED75271605EBD0C	00	yahoo chat

-- Call the NGramGenerator UDF to compose the n-grams of the query.
-- flatten() turns the UDF's result into one output record per n-gram, so a single query yields several records.
ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
-- Sample records from ngramed1 (user, hour, ngram):
--BED75271605EBD0C        00      chat
--BED75271605EBD0C        00      yahoo
--BED75271605EBD0C        00      yahoo chat
--BED75271605EBD0C        00      chat
--BED75271605EBD0C        00      yahoo
--BED75271605EBD0C        00      yahoo chat

-- Use the DISTINCT command to get the unique n-grams for all records
-- (duplicate (user, hour, ngram) records collapse into one).
ngramed2 = DISTINCT ngramed1;
-- Sample records from ngramed2:
--BED75271605EBD0C        00      chat
--BED75271605EBD0C        00      yahoo
--BED75271605EBD0C        00      yahoo chat

-- Use the GROUP command to group records by n-gram and hour.
-- Each output record has the group key tuple (ngram, hour) as $0 and the bag of matching ngramed2 records as $1.
hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
-- Sample records from hour_frequency1:
--(chat,00)	{(BED75271605EBD0C,00,chat)}
--(yahoo chat,00)	{(BED75271605EBD0C,00,yahoo chat)}
--(yahoo,00)	{(BED75271605EBD0C,00,yahoo)}

-- Use the COUNT function to get the count (occurrences) of each n-gram.
-- flatten($0) expands the (ngram, hour) group key into two fields; COUNT($1) counts the records in the grouped bag.
hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
-- Sample records from hour_frequency2 (ngram, hour, count):
--yahoo	00	1
--chat	00	1
--yahoo chat	00	1

-- Use the GROUP command to group records by n-gram only.
-- Each group now corresponds to a distinct n-gram and has the count for each hour.
-- group::ngram refers to the ngram field that came from flattening the group key in hour_frequency2.
uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
-- Sample records from uniq_frequency1 (ngram, bag of (ngram, hour, count)):
--yahoo	{(yahoo,04,1),(yahoo,00,1),(yahoo,01,1),(yahoo,02,1),(yahoo,03,1),(yahoo,09,1),(yahoo,10,1),(yahoo,19,1),(yahoo,20,1)}
--chat	{(chat,00,1),(chat,01,1),(chat,02,1),(chat,03,1),(chat,04,2),(chat,05,1),(chat,06,1),(chat,07,1),(chat,08,1),(chat,09,1),(chat,13,1),(chat,17,3),(chat,19,2),(chat,20,1)}
--yahoo chat	{(yahoo chat,00,1),(yahoo chat,01,1),(yahoo chat,02,1),(yahoo chat,03,1),(yahoo chat,04,1),(yahoo chat,09,1),(yahoo chat,19,1),(yahoo chat,20,1)}

-- For each group, identify the hour in which this n-gram is used with a particularly high frequency.
-- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram.
-- $0 is the n-gram (group key) and $1 is the bag of per-hour counts passed to the UDF.
uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
-- Sample records from uniq_frequency2 (ngram, hour, score, count, mean):
--chat	19	1.2126781251816656	2	1.2857142857142854
--chat	04	1.2126781251816656	2	1.2857142857142854
--chat	17	2.9104275004359965	3	1.2857142857142854
--new     07      2.4494897427831788      2       1.1428571428571426
--the     08      1.5895540678349904      4       1.9375
--the     09      0.0481683050859088      2       1.9375
-- Use the FOREACH-GENERATE command to assign names to the fields.
-- Note that the first two fields are swapped so that hour comes before ngram.
uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
-- Sample records from uniq_frequency3 (hour, ngram, score, count, mean):
--10      the     0.0481683050859088      2       1.9375
--19      chat    1.2126781251816656      2       1.2857142857142854
--04      chat    1.2126781251816656      2       1.2857142857142854
--17      chat    2.9104275004359965      3       1.2857142857142854
--14      city    2.2360679774997902      2       1.1666666666666665


-- Use the FILTER command to remove all records with a score less than or equal to 2.0
-- (only records with score > 2.0 are kept).
filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
-- Sample records from filtered_uniq_frequency:
--08      s       2.545584412271571       3       1.3636363636363635
--19      in      2.1572774865200244      3       1.4285714285714284
--11      in      2.1572774865200244      3       1.4285714285714284
--10      to      2.6457513110645903      2       1.125
--19      car     2.23606797749979        3       1.3333333333333333
--...

-- Use the ORDER command to sort the remaining records by hour, then by score within each hour.
ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
-- Sample records from ordered_uniq_frequency:
--07	new	2.4494897427831788	2	1.1428571428571426
--08	pictures	2.04939015319192	3	1.4999999999999998
--08	computer	2.4494897427831788	2	1.1428571428571426
--08	s	2.545584412271571	3	1.3636363636363635
--10	free	2.2657896674010605	4	1.923076923076923

-- Use the PigStorage function to store the results in 'script1-local-results.out'.
-- PigStorage() is called without an argument here, so its default field delimiter is used.
-- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
STORE ordered_uniq_frequency INTO 'script1-local-results.out' USING PigStorage();