######################################################################
# Options
######################################################################

# Value passed to the streaming job's -numReduceTasks flag.
NUM_REDUCE_TASKS=8

# Set to "yes" to echo each result file/DFS path after it is produced.
PRINT=no

# Deliberately left as a backquoted shell command (not $(shell ...)):
# it is only evaluated by the shell inside the recipes that use it, so
# targets that do not need a ticket never run last-ticket.sh.
# Override on the command line with e.g. `make TICKET=...`.
TICKET=`./last-ticket.sh`

######################################################################
# Configuration
######################################################################

HADOOP_VERSION=0.15.3
HADOOP_HOME=${HOME}/proj/hadoop/hadoop-${HADOOP_VERSION}
TMP=/var/local/cs147a-spr08/hadoop-rshaull
PYCOLORIZE=pycolorize

######################################################################
# Convenience variables
######################################################################

# Absolute path of the directory make is running in; used to hand the
# streaming job an absolute path to mr.py.
SRC := $(CURDIR)
HADOOP=${HADOOP_HOME}/bin/hadoop
STREAMING_JAR=${HADOOP_HOME}/contrib/hadoop-${HADOOP_VERSION}-streaming.jar

######################################################################
# check        : run all tests (default target)
# test_python  : test the python code on command line
# test_hadoop  : run the code in your Hadoop environment
# start / stop : start / stop the Hadoop DFS and map-reduce daemons
# reset        : stop Hadoop, wipe logs and TMP, reformat the namenode
# docs         : generate the files under doc/code
# clean        : clean up after both tests
######################################################################

check: test_python test_hadoop

stop:
	${HADOOP_HOME}/bin/stop-mapred.sh
	${HADOOP_HOME}/bin/stop-dfs.sh

start:
	${HADOOP_HOME}/bin/start-dfs.sh
	${HADOOP_HOME}/bin/start-mapred.sh

# Every step is best-effort ("-" prefix): any of the directories may
# already be gone, and that must not abort the rest of the reset.
reset: stop
	-rm -r ${HADOOP_HOME}/logs/*
	-rm -r ${TMP}
	-cat ${HADOOP_HOME}/conf/slaves | xargs -I{} ssh {} rm -r ${TMP}
	-${HADOOP} namenode -format

docs: $(patsubst %.py,doc/code/%.py.html,$(wildcard *.py)) doc/code/Makefile doc/code/last-ticket.sh.txt

clean: clean_python clean_dfs

######################################################################
# Targets required for testing in hadoop
#
# Assumes that Hadoop is running and that you have configured
# HADOOP_HOME and HADOOP_VERSION above
#
# All dfs/* targets are phony: they name DFS paths, not local files.
######################################################################

test_hadoop: clean_dfs dfs/inputs dfs/kiosk dfs/fee_history dfs/checkout dfs/oose

dfs/kiosk: mr.py kiosk-reducer-1.py kiosk-mapper-2.py kiosk-reducer-2.py
	-${HADOOP} dfs -rmr tmp/kiosk
	-${HADOOP} dfs -rmr outputs/kiosk
	$(call streaming,/bin/cat,,$(call mr,kiosk-reducer-1),inputs/intake inputs/discharge,tmp/kiosk)
	$(call streaming,$(call mr,kiosk-mapper-2),,$(call mr,kiosk-reducer-2),tmp/kiosk inputs/parking,outputs/kiosk)
	@$(call dfs_cat,outputs/kiosk/*)

dfs/fee_history: mr.py fee-history-reducer-1.py fee-history-reducer-2.py
	-${HADOOP} dfs -rmr tmp/fee_history
	-${HADOOP} dfs -rmr outputs/fee_history
	$(call streaming,/bin/cat,,$(call mr,fee-history-reducer-1),inputs/intake inputs/discharge,tmp/fee_history)
	$(call streaming,/bin/cat,,$(call mr,fee-history-reducer-2),tmp/fee_history,outputs/fee_history)
	@$(call dfs_cat,outputs/fee_history/*)

# Reads the DFS path outputs/kiosk, which is produced by dfs/kiosk
# (not by the local file target outputs/kiosk).
dfs/checkout: mr.py checkout-mapper.py dfs/kiosk
	-${HADOOP} dfs -rmr outputs/checkout
	$(call streaming,$(call mr,checkout-mapper ${TICKET}),,NONE,outputs/kiosk,outputs/checkout)
	@$(call dfs_cat,outputs/checkout/*)

# Reads the DFS path outputs/fee_history, which is produced by
# dfs/fee_history.
dfs/oose: mr.py oose-mapper.py oose-reducer.py dfs/fee_history
	-${HADOOP} dfs -rmr outputs/oose
	$(call streaming,$(call mr,oose-mapper),$(call mr,oose-reducer),$(call mr,oose-reducer),outputs/fee_history,outputs/oose)
	@$(call dfs_cat,outputs/oose/*)

# Okay if this fails: the inputs are then already in dfs.
dfs/inputs:
	-${HADOOP} dfs -put inputs inputs

clean_dfs:
	-${HADOOP} dfs -rmr inputs
	-${HADOOP} dfs -rmr tmp
	-${HADOOP} dfs -rmr outputs

######################################################################
# Targets required for command-line python tests
#
# You can run all command-line tests with target test_python
#
# The outputs directory is an order-only prerequisite (after "|"): it
# must exist, but its timestamp must not force result files to rebuild.
######################################################################

test_python: clean_python \
             outputs/kiosk outputs/fee_history outputs/checkout outputs/oose

outputs:
	mkdir -p $@

outputs/kiosk: inputs/intake inputs/discharge inputs/parking \
               mr.py kiosk-reducer-1.py kiosk-mapper-2.py kiosk-reducer-2.py \
               | outputs
	cat inputs/intake inputs/discharge | \
	  sort | \
	  ./mr.py kiosk-reducer-1 | \
	  cat - inputs/parking | \
	  ./mr.py kiosk-mapper-2 | \
	  sort | \
	  ./mr.py kiosk-reducer-2 >$@
	@$(call cat,$@)

outputs/fee_history: inputs/intake inputs/discharge \
                     mr.py fee-history-reducer-1.py fee-history-reducer-2.py \
                     | outputs
	cat inputs/intake inputs/discharge | \
	  sort | \
	  ./mr.py fee-history-reducer-1 | \
	  sort | \
	  ./mr.py fee-history-reducer-2 >$@
	@$(call cat,$@)

# outputs/kiosk is the input of this step, so it is a real prerequisite.
outputs/checkout: outputs/kiosk mr.py checkout-mapper.py | outputs
	cat outputs/kiosk | \
	  ./mr.py checkout-mapper ${TICKET} \
	  >$@
	@$(call cat,$@)

outputs/oose: outputs/fee_history mr.py oose-mapper.py oose-reducer.py | outputs
	cat outputs/fee_history | \
	  ./mr.py oose-mapper | \
	  sort | \
	  ./mr.py oose-reducer \
	  >$@
	@$(call cat,$@)

clean_python:
	-rm -r outputs

######################################################################
# Useful functions for running tests and outputting results
######################################################################

#
# Execute a HadoopStreaming job
#
# Syntax: ${call streaming,mapper,combiner,reducer,inputs,output}
#
# Any argument may be left empty, in which case the corresponding
# command-line option is omitted (see "option" below).
#
streaming = ${HADOOP} jar ${STREAMING_JAR} \
	${call option,-mapper,${1}} \
	${call option,-combiner,${2}} \
	${call option,-reducer,${3}} \
	${call option,-input,${4}} \
	${call option,-output,${5}} \
	-numReduceTasks ${NUM_REDUCE_TASKS}

#
# Set a map reduce option iff that option has a value
#
# Syntax: ${call option,-name,module}
#
# For example, if you want to use the mapper defined in the module
# file my-mapper.py as the mapper, you would use this function call
# as an argument to the invocation of the streaming job:
#
#   ${call option,-mapper,my-mapper}
#
option = ${if ${2},${1} ${2}}

#
# Run a module with the mapreducer script mr
#
# Expands to a single double-quoted shell word: the absolute path of
# mr.py followed by the module name (and any extra arguments).
#
mr = "${SRC}/mr.py ${1}"

#
# Print action and result of action
#
# Printing is only done if PRINT is set to "yes"; otherwise "print"
# expands to nothing and the calling recipe line is a no-op.
#
cat = $(call print,cat $(1))
dfs_cat = $(call print,${HADOOP} dfs -cat $(1))

ifeq ($(strip ${PRINT}),yes)
print = echo "==============================================================================="; \
	echo "$(1)"; \
	echo "-------------------------------------------------------------------------------"; \
	$(1); \
	echo "==============================================================================="
else
print =
endif

######################################################################
# Housekeeping
######################################################################

doc/code:
	mkdir -p $@

doc/code/%.py.html: %.py | doc/code
	${PYCOLORIZE} $< >$@

doc/code/Makefile: Makefile | doc/code
	cp $< $@

doc/code/last-ticket.sh.txt: last-ticket.sh | doc/code
	cp $< $@

# Remove a result file whose recipe failed, so a half-written file is
# never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# The test targets rely on their prerequisites running in the listed
# order (clean first, then each stage), so never run this file with
# parallel jobs.
.NOTPARALLEL:

# Listed explicitly: a glob such as dfs/* only matches existing local
# files and therefore would not mark the dfs targets as phony.
.PHONY: check start stop reset docs clean \
        test_python test_hadoop clean_python clean_dfs \
        dfs/inputs dfs/kiosk dfs/fee_history dfs/checkout dfs/oose