<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:media="http://search.yahoo.com/mrss/"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>大数据技术 - 四号程序员</title>
	<atom:link href="https://www.coder4.com/archives/category/big-data/feed" rel="self" type="application/rss+xml" />
	<link>https://www.coder4.com</link>
	<description>Keep It Simple and Stupid</description>
	<lastBuildDate>Wed, 27 Aug 2025 12:57:52 +0000</lastBuildDate>
	<language>zh-CN</language>
	<sy:updatePeriod>
	hourly	</sy:updatePeriod>
	<sy:updateFrequency>
	1	</sy:updateFrequency>
	<generator>https://wordpress.org/?v=6.8.3</generator>
	<item>
		<title>Roaring Bitmaps 的基本原理</title>
		<link>https://www.coder4.com/archives/8012</link>
					<comments>https://www.coder4.com/archives/8012#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Wed, 06 Dec 2023 04:32:24 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[Roaring Bitmaps]]></category>
		<guid isPermaLink="false">https://www.coder4.com/?p=8012</guid>

					<description><![CDATA[https://cloud.tencent.com/developer/article/1136054 很多的to_bitmap都是用这个数据结构实现的 [......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/8012/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>[转]深度对比 Delta、Iceberg 和 Hudi 三大开源数据湖方案</title>
		<link>https://www.coder4.com/archives/7819</link>
					<comments>https://www.coder4.com/archives/7819#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Fri, 24 Feb 2023 04:58:40 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[数据湖]]></category>
		<guid isPermaLink="false">https://www.coder4.com/?p=7819</guid>

					<description><![CDATA[https://www.infoq.cn/article/fjebconxd2sz9wloykfo[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/7819/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>Flink双流join原理</title>
		<link>https://www.coder4.com/archives/7814</link>
					<comments>https://www.coder4.com/archives/7814#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Wed, 22 Feb 2023 10:26:18 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[flink]]></category>
		<category><![CDATA[join]]></category>
		<guid isPermaLink="false">https://www.coder4.com/?p=7814</guid>

					<description><![CDATA[https://developer.huawei.com/consumer/cn/forum/topic/0202775562683000448[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/7814/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>Presto获取当天、昨天的格式化日期字符串</title>
		<link>https://www.coder4.com/archives/7795</link>
					<comments>https://www.coder4.com/archives/7795#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Sat, 05 Nov 2022 12:22:27 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<guid isPermaLink="false">https://www.coder4.com/?p=7795</guid>

					<description><![CDATA[format_datetime(current_date, 'YYYY-MM-dd'), format_datetime(DATE_ADD('day', -1, current_date), 'YYYY-MM-dd') format_datetime(DATE_ADD('day', -2, current_date), 'YYYY-MM-dd') &#160;[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/7795/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>定制Hadoop的MapReduce任务的FileOutputFormat</title>
		<link>https://www.coder4.com/archives/7121</link>
					<comments>https://www.coder4.com/archives/7121#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Thu, 12 Nov 2020 09:33:09 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[FileOutputFormat]]></category>
		<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[Map]]></category>
		<category><![CDATA[Reduce]]></category>
		<category><![CDATA[定制]]></category>
		<guid isPermaLink="false">https://www.coder4.com/?p=7121</guid>

					<description><![CDATA[需求：Reduce输出特殊的格式结果 例如：如Reducer的结果，压到Guava的BloomFilter中 import com.google.common.hash.BloomFilter; import com.google.common.hash.Funnels; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.h[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/7121/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>Hadoop如何快速完成数值排序的工作</title>
		<link>https://www.coder4.com/archives/4608</link>
					<comments>https://www.coder4.com/archives/4608#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Wed, 08 Apr 2015 10:28:17 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<guid isPermaLink="false">http://www.coder4.com/?p=4608</guid>

					<description><![CDATA[转载自：http://stackoverflow.com/questions/13331722/how-to-sort-numerically-in-hadoops-shuffle-sort-phase Assuming you are using Hadoop Streaming, you need to use the KeyFieldBasedComparator class. -D mapred.output.key.comparator.class=org.apach[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/4608/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>Writing Hive Custom Aggregate Functions (UDAF)</title>
		<link>https://www.coder4.com/archives/4379</link>
					<comments>https://www.coder4.com/archives/4379#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Tue, 20 Jan 2015 04:00:31 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[Hive]]></category>
		<category><![CDATA[udaf]]></category>
		<guid isPermaLink="false">http://www.coder4.com/?p=4379</guid>

					<description><![CDATA[转载自：《Writing Hive Custom Aggregate Functions (UDAF): Part II》 Now that we got eclipse configured (see Part I) for UDAF development, its time to write our first UDAF. Searching for custom UDAF, most people might have already came across the followi[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/4379/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>Hive自定义UDF/UDAF/UDTF中，如何获得List的ObjectInspector</title>
		<link>https://www.coder4.com/archives/4331</link>
					<comments>https://www.coder4.com/archives/4331#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Tue, 25 Nov 2014 06:14:17 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[Hive]]></category>
		<category><![CDATA[list]]></category>
		<category><![CDATA[ObjectInspector]]></category>
		<guid isPermaLink="false">http://www.coder4.com/?p=4331</guid>

					<description><![CDATA[在Hive中，在使用GenericUDF实现自定义UDF/UDAF/UDTF时，经常要指定输出类型，其中要获得一个ObjectInspector。 对于基础类型： PrimitiveObjectInspectorFactory.javaStringObjectInspector 对于List等复合类型，要2步： ObjectInspectorFactory .getStandardListObjectInspector(PrimitiveObjectInspectorFa[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/4331/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
		<item>
		<title>如何拓展Hadoop的InputFormat为其他分隔符</title>
		<link>https://www.coder4.com/archives/4313</link>
					<comments>https://www.coder4.com/archives/4313#comments</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Sat, 08 Nov 2014 07:58:39 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[InputFormat]]></category>
		<category><![CDATA[分隔符]]></category>
		<guid isPermaLink="false">http://www.coder4.com/?p=4313</guid>

					<description><![CDATA[在Hadoop中，常用的TextInputFormat是以换行符作为Record分隔符的。 在实际应用中，我们经常会出现一条Record中包含多行的情况，例如： &#60;doc&#62; .... &#60;/doc&#62; 此时，需要拓展TextInputFormat以完成这个功能。 先来看一下原始实现： public class TextInputFormat extends FileInputFormat&#60;LongWritable, Text&#62; {[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/4313/feed</wfw:commentRss>
			<slash:comments>1</slash:comments>
		
		
			</item>
		<item>
		<title>Hive中找出Table和Partition的真实路径。</title>
		<link>https://www.coder4.com/archives/4272</link>
					<comments>https://www.coder4.com/archives/4272#respond</comments>
		
		<dc:creator><![CDATA[coder4]]></dc:creator>
		<pubDate>Tue, 05 Aug 2014 09:14:11 +0000</pubDate>
				<category><![CDATA[大数据技术]]></category>
		<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[HDFS]]></category>
		<category><![CDATA[Hive]]></category>
		<category><![CDATA[partition]]></category>
		<category><![CDATA[table]]></category>
		<category><![CDATA[路径]]></category>
		<guid isPermaLink="false">http://www.coder4.com/?p=4272</guid>

					<description><![CDATA[在Hive中，如果使用了External Table或者Partition，那么路径是不在自己的hive warehouse下的。 -- 获取table的真实hdfs路径 desc formatted my_table; -- 获取partition的真实hdfs路径 desc formatted my_table (pt='20140804'); &#160;[......] 继续阅读]]></description>
		
					<wfw:commentRss>https://www.coder4.com/archives/4272/feed</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
			</item>
	</channel>
</rss>
