# 1. 大数据 mapper 书写范式(HDFS)
import json
import sys


def read_input(input_stream):
    """Lazily yield the lines of *input_stream* without their trailing newlines.

    Args:
        input_stream: Any iterable of text lines (e.g. ``sys.stdin``).

    Yields:
        Each line with all trailing ``'\\n'`` characters stripped.
    """
    for line in input_stream:
        yield line.rstrip('\n')


def load_json_data(json_line):
    """Parse one JSON record and extract its id and combined text.

    Args:
        json_line: A string expected to hold a single JSON object.

    Returns:
        A ``(unique_id, combined_content)`` tuple, where ``unique_id`` is the
        record's ``'id'`` value (``None`` if absent) and ``combined_content``
        is ``title`` and ``text`` joined by a single space. Returns
        ``(None, None)`` when the line is not valid JSON or is not a JSON
        object.
    """
    # Keep the try body minimal: only the parse itself can raise here.
    try:
        data = json.loads(json_line)
    except json.JSONDecodeError:
        return None, None

    # Valid JSON that is not an object (list, number, ...) has no .get().
    if not isinstance(data, dict):
        return None, None

    unique_id = data.get('id')
    # Missing or null fields become '' and non-string values are stringified,
    # so the join cannot raise TypeError.
    combined_content = ' '.join(
        '' if value is None else str(value)
        for value in (data.get('title'), data.get('text'))
    )
    return unique_id, combined_content


def mapper(input_stream, output_stream=None, *, record_filter=None):
    """Copy valid, not-yet-seen JSON records from input to output.

    Each input line is parsed with ``load_json_data``. A line is written back
    unchanged (plus a trailing newline) when it parses as a JSON object, its id
    has not already been emitted, and ``record_filter`` (if given) accepts it.

    Args:
        input_stream: Iterable of text lines, one JSON record per line.
        output_stream: Object with a ``write(str)`` method. Defaults to
            ``sys.stdout``, looked up at call time so redirection is honoured.
        record_filter: Optional callable ``(unique_id, content) -> bool``;
            records for which it returns a falsy value are dropped. ``None``
            keeps every record.

    Returns:
        None. Output is produced through ``output_stream.write``.
    """
    if output_stream is None:
        output_stream = sys.stdout

    processed_ids = set()
    for json_line in read_input(input_stream):
        unique_id, content = load_json_data(json_line)

        # load_json_data signals an unparseable line with content=None.
        if content is None:
            continue
        # Records without an id cannot be deduplicated, so they pass through.
        # NOTE(review): ids are assumed hashable (str/int) - confirm.
        if unique_id is not None and unique_id in processed_ids:
            continue
        if record_filter is not None and not record_filter(unique_id, content):
            continue

        output_stream.write(json_line + "\n")
        if unique_id is not None:
            processed_ids.add(unique_id)
def getKeywords():
    """Stub with no behaviour yet: takes no arguments and returns ``None``.

    NOTE(review): presumably meant to hold keyword-extraction logic, judging
    only by the name - confirm before relying on it.
    """
    return None
# Run the mapper over standard input only when executed as a script.
# The module name of the entry script is "__main__"; comparing against "main"
# is never true, so the mapper would never run.
if __name__ == "__main__":
    mapper(sys.stdin)