diff --git a/gpl/toolkit/pl.py b/gpl/toolkit/pl.py index a4e7d71..c24ed89 100644 --- a/gpl/toolkit/pl.py +++ b/gpl/toolkit/pl.py @@ -1,6 +1,8 @@ from sentence_transformers import CrossEncoder from .dataset import HardNegativeDataset from torch.utils.data import DataLoader +from sentence_transformers import SentenceTransformer +from transformers import AutoTokenizer import tqdm import os import logging @@ -15,16 +17,41 @@ def hard_negative_collate_fn(batch): class PseudoLabeler(object): - def __init__(self, generated_path, gen_queries, corpus, total_steps, batch_size, cross_encoder, sep=' '): + def __init__(self, generated_path, gen_queries, corpus, total_steps, batch_size, cross_encoder, max_seq_length): assert 'hard-negatives.jsonl' in os.listdir(generated_path) fpath_hard_negatives = os.path.join(generated_path, 'hard-negatives.jsonl') self.cross_encoder = CrossEncoder(cross_encoder) - hard_negative_dataset = HardNegativeDataset(fpath_hard_negatives, gen_queries, corpus, sep) + hard_negative_dataset = HardNegativeDataset(fpath_hard_negatives, gen_queries, corpus) self.hard_negative_dataloader = DataLoader(hard_negative_dataset, shuffle=True, batch_size=batch_size, drop_last=True) self.hard_negative_dataloader.collate_fn = hard_negative_collate_fn self.output_path = os.path.join(generated_path, 'gpl-training-data.tsv') self.total_steps = total_steps + + #### retokenization + self.retokenizer = AutoTokenizer.from_pretrained(cross_encoder) + self.max_seq_length = max_seq_length + def retokenize(self, texts): + ## We did this retokenization for two reasons: + ### (1) Setting the max_seq_length; + ### (2) We cannot simply use CrossEncoder(cross_encoder, max_length=max_seq_length), + ##### since the max_seq_length will then be reflected on the concatenated sequence, + ##### rather than the two sequences independently + texts = list(map(lambda text: text.strip(), texts)) + features = self.retokenizer( + texts, + padding=True, + truncation='longest_first', + return_tensors="pt", + max_length=self.max_seq_length + ) + decoded = self.retokenizer.batch_decode( + features['input_ids'], + skip_special_tokens=True, + clean_up_tokenization_spaces=True + ) + return decoded + def run(self): # header: 'query_id', 'positive_id', 'negative_id', 'pseudo_label_margin' data = [] @@ -39,11 +66,13 @@ def run(self): batch = next(hard_negative_iterator) (query_id, pos_id, neg_id), (query, pos, neg) = batch + query, pos, neg = [self.retokenize(texts) for texts in [query, pos, neg]] scores = self.cross_encoder.predict( list(zip(query, pos)) + list(zip(query, neg)), show_progress_bar=False ) labels = scores[:len(query)] - scores[len(query):] + labels = labels.tolist() # Using `tolist` will keep more precision digits!!! batch_gpl = map(lambda quad: '\t'.join((*quad[:3], str(quad[3]))) + '\n', zip(query_id, pos_id, neg_id, labels)) data.extend(batch_gpl) diff --git a/gpl/train.py b/gpl/train.py index e9d2e1a..7c14db4 100644 --- a/gpl/train.py +++ b/gpl/train.py @@ -91,7 +91,7 @@ def train( logger.info('Using existing GPL-training data') else: logger.info('No GPL-training data found. Now generating it via pseudo labeling') - pseudo_labeler = PseudoLabeler(path_to_generated_data, gen_queries, corpus, gpl_steps, batch_size_gpl, cross_encoder, sep) + pseudo_labeler = PseudoLabeler(path_to_generated_data, gen_queries, corpus, gpl_steps, batch_size_gpl, cross_encoder, max_seq_length) pseudo_labeler.run() diff --git a/sample-data/generated/fiqa/corpus.jsonl b/sample-data/generated/fiqa/corpus.jsonl index 254ccf6..dc986f6 100644 --- a/sample-data/generated/fiqa/corpus.jsonl +++ b/sample-data/generated/fiqa/corpus.jsonl @@ -1,10 +1,10 @@ -{"text": "Market capitalization is one way to represent the value of the company. So if a company has 10 million shares, which are each worth $100, then the company's market capitalization is 1 billion. Large cap companies tend to be larger and more stable. Small cap companies are smaller, which indicates higher volatility. So if you want more aggressive investments then you may want to invest in small cap companies while if you lean on the side of caution then big cap companies may be your friend.", "title": "", "_id": "460230"} -{"text": "\"I think that all else being equal, if more people have solar panels on our roof, we keep our military in FEWER countries, instead of planning on RAMPING up the number (see: Nigeria). Furthermore, the initial question wasn't whether installing solar panels is going to stop the action. **The conversation started by being about whether solar is actually cheaper when you factor in the amount used to subsidize both sources of energy. Not whether switching to solar would stop subsidies to oil.** What you did is called \"\"moving the goalposts\"\". The word 2016 doesn't show up in the article or in the study it cites. That was a typo. I understand it was about 2013, I was trying to saying that it was about the same year as yours. That was my bad. I read the whole thing originally, and I even read the study. I accept responsibility for the typo though. ANYWAYS, neither set of data is manipulated, my study just includes more things, like foreign tax credits, cleanup subsidies for both coal and oil, tar sands exemption from cleanup funds, Power Africa (a five year, 7 billion dollar program that was only ANNOUNCED in 2013). That overseas stuff that goes on is a HUGE part of the equation. And okay, so they aren't equivalent. Then your point is moot. Yes, both sides have off the book subsidies, but one side's off the book subsidies are so unfathomably bigger that it's laughable to even put them in the same category. Not to mention the costs of war in the middle east stretch beyond money (think lives), and recycling programs actually come with benefits (think smaller landfills).\"", "title": "", "_id": "257122"} -{"text": "\">Correct me if I'm wrong, You're wrong. >but weren't taxes much higher from the 50's through the 70's? Only on the high end of the scale (the TOP tax rate was higher). But for lower and middle income people, taxes were tremendously LOWER in those decades, and became significantly higher, to wit: * [FICA increased from the original 1% in circa 1937-49, to over 15% in 1990](http://www.ssa.gov/oact/progdata/taxRates.html) -- and that is taken right out your GROSS paycheck, no deductions, etc. * Things like State Sales Taxes (which are known to be heavily regressive) were either non-existent or minimal (at say 1% and often more of a \"\"luxury\"\" tax) prior to the 1960's; and they have been steadily increasing since then. * Many other taxes (property, excise, tobacco & other \"\"sin\"\" taxes, etc) have increased (some many-fold) -- and virtually ALL of them are regressive in nature (being made more regressive with every increase). But probably the most egregious thing... is that the public has been successfully indoctrinated into believing the EXACT opposite of what really happened (and *both* political parties are to blame).\"", "title": "", "_id": "120306"} -{"text": "\"What do you think the problem is? I notice you edited your post after I read it. >The \"\"value\"\" of HUMAN labor is determined by supply and demand, right? I hire people right now because they are the cheapest supercomputers with arms available for me right now. No, you hire people because they have skills to do tasks with their labor. We will *always* think of new ways to be useful, new things to do, that people will want done, will value being done, and will purchase labor to get done. When we stopped using horses & buggies, all the people in those industries didn't just throw up their hands and walk away from work. *They figured out new ways to create value with their labor*, and the world went on.\"", "title": "", "_id": "35856"} -{"text": "Flowers are a great way to say thank you or to tell people how much you care about them. If you need a flower arrangement for any occasion, All Flowers and Gifts provides Calgary florists that can help you with everything that you need for your next flower arrangement.", "title": "", "_id": "214079"} -{"text": "I've got a small position at Lending Club and it's doing really well. It's been easy to use, good (not excellent) interface, easy to track the loans you've made & does a good job with notifications. Not much there for doing anything advanced with the portfolio math--I've used a separate spreadsheet to track things *as well as* use their reporting. The one catch with them I find is that when you put an order in you have to wait longer than you might expect for the process to complete--that is, even after the loan is *funded*, it still has to be *reviewed*, so the lead time before your money begins earning interest might be 2 weeks + 1 week of review time.", "title": "", "_id": "316535"} -{"text": "\"I've considered simply moving my funds to an Australian bank to \"\"lock-in\"\" the current rate, but I worry that this will put me at risk of a substantial loss (due to exchange rates, transfer fees, etc) when I move my funds back into the US in 6 months. Why move funds back? If you want to lock in current exchange rates, figure out how much money you are likely to spend in Australia for the next six months. Move just enough funds to cover that to an Australian bank. Leave the remainder in the United States (US), as your future expenses will be in US dollars. So long as you don't find some major, unanticipated purchase, this covers you. You have enough money for the next six months with no exchange rate worries. At the end of the six months, if you fall slightly short, cover with your credit card as you are doing now. You'll take a loss, but on a small amount of money. If you have a slight excess and you were right about the exchange rate, you'll make a little profit at the end. If you were wrong, you'll take a small loss. The key here is that you should be able to budget for your six months. You can lock in current exchange rates just for that amount of money. Moving all your funds to Australia is a gamble. You can certainly do that if you want, but rather than gambling, it may be better to take the sure thing. You know you need six months expenses, so just move that. You will definitely be spending six months money in Australia, so you are immune to exchange rate fluctuations for that period. The remainder of your money can stay in the US, as that's where you plan to spend it. However, recent political events back in the States have me (and, I'm sure, every currency speculator and foreign investor) worried that this advantage will not last for much longer. If currency speculators expect exchange rates to fall, then they'd have already bid down the rates. I.e. they'd keep speculating until the rates did fall. So the speculators expect the current rates are correct, otherwise they'd move them. Donald Trump's state goal is to increase exports relative to imports. If he's successful, this could cause the US dollar to fall to make exports cheaper and imports more expensive. However, if his policies fail, then the opposite is likely to happen. Most of his announced trade policies are more likely to increase the value of the dollar than to decrease it. In particular, that is the likely result of increased tariffs. If you are worried about Trump failing, then you should worry about a strong dollar. That's more in line with actual speculation since the election. I don't know that I'd make a strong bet in either direction. Hedging makes more sense to me, as it simply locks in the current situation, which you apparently find favorable. Not hedging at all might produce some profit if the dollar goes up. Gambling all your funds might produce some profit if the dollar goes down. The middle path of hedging just what you're spending is the safest if least likely to produce profit. My recommendation is to hedge the six months expenses and enjoy your time abroad. Why worry about political events that you can't control? Enjoy your working (studying) vacation.\"", "title": "", "_id": "511432"} -{"text": "You can lookup SWIFT codes here. Based on the search I conducted on March 30, 2015, PayPay US's SWIFT is PPALUS66, and PayPal Europe's SWIFT is PPLXLULL. Since they have two listed, it would be safe to contact PayPal directly and ask which SWIFT they would like to be used.", "title": "", "_id": "78297"} -{"text": "\">Mike Pence Sums up Trumpcare Perfectly: \"\"No Money, No Healthcare\"\" He never fucking said that. Here's his actual tweet: >Before summer\u2019s out, we'll repeal/replace Obamacare w/ system based on personal responsibility, free-market competition & state-based reform If you want to draw \"\"No Money, No Healthcare\"\" as a conclusion, fine, but you can't assign it as a quotation to Mike Pence if he never said it. I have no love for Mike Pence, but I do respect the laws of punctuation and basic journalistic integrity. If someone is really that bad, which he likely is, then you should be able to damn him for what he actually says or does. You shouldn't have to resort to simply making shit up.\"", "title": "", "_id": "182744"} -{"text": "\"Take a look at FolioFN - they let you buy small numbers of shares and fractional shares too. There is an annual fee on the order of US$100/year. You can trade with no fees at two \"\"windows\"\" per day, or at any time for a $15 fee. You are better off leaving the stock in broker's name, especially if you live overseas. Otherwise you will receive your dividends in the form of cheques that might be expensive to try to cash. There is also usually a fee charged by the broker to obtain share certificates instead of shares in your account.\"", "title": "", "_id": "281423"} +{"text": "If the APR is an effective rate. If the APR is a nominal rate compounded monthly, first convert it to an effective rate.", "title": "", "_id": "454072"} +{"text": "I'm in America on student visa and I can have 12 months of paid internship. I'd rather not waste the 12 months on a sophomore internship, you know? Of course, I'd still love to be paid for what I do and think that might make me try harder.", "title": "", "_id": "150252"} +{"text": "Cant tell if sarcasm but between two employees where one has a cellphone and one dosnt (or only intermittently) who are you going to rely on. That being said it's cheap as fuck to afford a basic ass cellphone.", "title": "", "_id": "375658"} +{"text": "I agree that religion shouldn't be protected. Let's get rid of this protection for Christians crap and let those little snowflakes realize that they don't get to push their make believe on others. Disability, the unifying factor is *the disability.* There's some non-able-bodied component. Sexual orientation is biological.", "title": "", "_id": "93353"} +{"text": "Some do internships, but most don't. I would also try to get really involved with your school's entrepreneurship center if it has one. People on reddit often stress that it's wrong to participate in unpaid internships, however in this sector I strongly disagree. If you are able to secure an unpaid internship in VC, I highly suggest pursuing it.", "title": "", "_id": "411906"} +{"text": "The public was sold the idea that losing manufacturing didn't matter. They were told not only do you get cheaper products. But you will get new cleaner service, and tech jobs that pay the same or better! People bought into it but 30 years later Ross Perot was proven right.", "title": "", "_id": "269846"} +{"text": "I had an amazing customer service experience with Best Buy yesterday. I walked into my local BB with a list of computer components for a PC I am building (I was purchasing from New Egg and Best Buy). Wandered around and an employee saw me (he ended up being the manager), and he helped me put together an order, not only that but he checked New Egg to make sure I was getting competitive prices, not only that but he made sure I was getting free shipping. On top of that he went through my New Egg list and price matched anything they had. Now I only need a couple more things from New Egg. Thought I'd share.", "title": "", "_id": "523755"} +{"text": "Group RESPs are a bit like a true mutual insurance company. You all pay into the fund, and then, depending on the number of kids that are in school that particular year, you get paid a certain amount. Advantages could be that if you end up with one or two years of only your kid in school and nobody else's in that age bracket, you get more money. Disadvantage for the same-reverse reason also could be true. Another advantage of regular programs, unlike pooled, is that if you do not use all the money, then some/all of the remaining funds may be transferable to an RRSP. Personally I would not invest in one, unless it was more like a specific investment-club that I knew everybody.", "title": "", "_id": "122794"} +{"text": "Mutual funds buy (and sell) shares in companies in accordance with the policies set forth in their prospectus, not according to the individual needs of an investor, that is, when you invest money in (or withdraw money from) a mutual fund, the manager buys or sells whatever shares that, in the manager's judgement, will be the most appropriate ones (consistent with the investment policies). Thus, a large-cap mutual fund manager will not buy the latest hot small-cap stock that will likely be hugely profitable; he/she must choose only between various large capitalization companies. Some exchange-traded funds are fixed baskets of stocks. Suppose you will not invest in a company X as a matter of principle. Unless a mutual fund prospectus says that it will not invest in X, you may well end up having an investment in X at some time because the fund manager bought shares in X. With such an ETF, you know what is in the basket, and if the basket does not include stock in X now, it will not own stock in X at a later date. Some exchange-traded funds are constructed based on some index and track the index as a matter of policy. Thus, you will not be investing in X unless X becomes part of the index because Standard or Poor or Russell or somebody changed their minds, and the ETF buys X in order to track the index. Finally, some ETFs are exactly like general mutual funds except that you can buy or sell ETF shares at any time at the price at the instant that your order is executed whereas with mutual funds, the price of the mutual fund shares that you have bought or sold is the NAV of the mutual fund shares for that day, which is established based on the closing prices at the end of the trading day of the stocks, bonds etc that the fund owns. So, you might end up owning stock in X at any time based on what the fund manager thinks about X.", "title": "", "_id": "479420"} +{"text": "You're going to find a lot of conflicting or vague answers on the internet because there are a lot of plan design elements that are set by the plan sponsor (employer). There are laws that mandate certain elements and dictate certain requirements of plan sponsors, many of these laws are related to record keeping and fiduciary duty. There is a lot of latitude for plan sponsors to allow or restrict employee actions even if there is no law against that activity. There are different rules mandated for employee pre-tax contributions, employee post-tax contributions, and employer contributions. You have more flexibility with regard to the employer contributions and any post tax contributions you may have made; your plan may allow an in-service distribution of those two items before you reach age 59.5. While your HR department (like most -all- HR departments) is not staffed with ERISA attorneys and CPAs it is your HR department and applicable plan documents that will lay out what an employee is permitted to do under the plan.", "title": "", "_id": "585889"} diff --git a/sample-data/generated/fiqa/gpl-training-data.tsv b/sample-data/generated/fiqa/gpl-training-data.tsv index aac87bc..e44a2c4 100644 --- a/sample-data/generated/fiqa/gpl-training-data.tsv +++ b/sample-data/generated/fiqa/gpl-training-data.tsv @@ -1,20 +1,20 @@ -genQ5 257122 460230 18.465141 -genQ19 511432 257122 15.421183 -genQ26 182744 214079 19.373644 -genQ3 460230 511432 18.806072 -genQ15 214079 460230 12.678501 -genQ10 35856 257122 17.834743 -genQ28 281423 120306 17.63448 -genQ29 281423 257122 17.699568 -genQ21 511432 35856 14.623292 -genQ7 120306 257122 13.6840515 -genQ24 78297 281423 19.405273 -genQ23 78297 120306 17.731052 -genQ1 460230 316535 20.700878 -genQ4 257122 182744 7.277392 -genQ16 316535 120306 10.933559 -genQ6 257122 316535 1.4065981 -genQ11 35856 257122 11.795601 -genQ20 511432 316535 14.831335 -genQ2 460230 78297 5.676446 -genQ30 281423 78297 17.271576 +genQ28 585889 269846 9.46237564086914 +genQ7 375658 122794 15.471757888793945 +genQ12 93353 454072 19.189910888671875 +genQ3 454072 375658 18.5987491607666 +genQ29 585889 269846 11.639785766601562 +genQ2 454072 523755 21.046403884887695 +genQ11 93353 269846 17.41644287109375 +genQ22 122794 411906 17.80188751220703 +genQ24 122794 93353 16.20846939086914 +genQ17 269846 479420 14.049872398376465 +genQ20 523755 269846 9.419671058654785 +genQ25 479420 269846 15.41370677947998 +genQ19 523755 269846 2.478337287902832 +genQ16 269846 585889 16.2314453125 +genQ5 150252 523755 15.29780387878418 +genQ21 523755 375658 11.556497573852539 +genQ27 479420 523755 11.202600479125977 +genQ18 269846 122794 17.46649742126465 +genQ4 150252 122794 18.039897918701172 +genQ6 150252 523755 15.562084197998047 diff --git a/sample-data/generated/fiqa/hard-negatives.jsonl b/sample-data/generated/fiqa/hard-negatives.jsonl index 8703a33..dcc2b82 100644 --- a/sample-data/generated/fiqa/hard-negatives.jsonl +++ b/sample-data/generated/fiqa/hard-negatives.jsonl @@ -1,30 +1,30 @@ -{"qid": "genQ1", "pos": ["460230"], "neg": {"msmarco-distilbert-base-v3": ["281423", "257122", "316535", "35856", "511432", "182744", "120306", "78297", "214079"], "msmarco-MiniLM-L-6-v3": ["281423", "316535", "257122", "78297", "511432", "182744", "35856", "120306", "214079"]}} -{"qid": "genQ2", "pos": ["460230"], "neg": {"msmarco-distilbert-base-v3": ["35856", "214079", "182744", "257122", "316535", "281423", "511432", "78297", "120306"], "msmarco-MiniLM-L-6-v3": ["316535", "281423", "78297", "214079", "35856", "257122", "511432", "120306", "182744"]}} -{"qid": "genQ3", "pos": ["460230"], "neg": {"msmarco-distilbert-base-v3": ["257122", "182744", "316535", "281423", "35856", "511432", "78297", "120306", "214079"], "msmarco-MiniLM-L-6-v3": ["182744", "257122", "511432", "35856", "281423", "316535", "78297", "120306", "214079"]}} -{"qid": "genQ4", "pos": ["257122"], "neg": {"msmarco-distilbert-base-v3": ["316535", "281423", "511432", "35856", "460230", "214079", "182744", "120306", "78297"], "msmarco-MiniLM-L-6-v3": ["35856", "214079", "511432", "182744", "120306", "316535", "460230", "78297", "281423"]}} -{"qid": "genQ5", "pos": ["257122"], "neg": {"msmarco-distilbert-base-v3": ["182744", "511432", "281423", "316535", "460230", "35856", "214079", "120306", "78297"], "msmarco-MiniLM-L-6-v3": ["120306", "214079", "182744", "316535", "35856", "511432", "460230", "281423", "78297"]}} -{"qid": "genQ6", "pos": ["257122"], "neg": {"msmarco-distilbert-base-v3": ["35856", "511432", "120306", "182744", "316535", "460230", "281423", "78297", "214079"], "msmarco-MiniLM-L-6-v3": ["120306", "35856", "460230", "511432", "182744", "316535", "78297", "214079", "281423"]}} -{"qid": "genQ7", "pos": ["120306"], "neg": {"msmarco-distilbert-base-v3": ["257122", "78297", "182744", "511432", "281423", "316535", "460230", "35856", "214079"], "msmarco-MiniLM-L-6-v3": ["257122", "182744", "35856", "78297", "511432", "316535", "214079", "281423", "460230"]}} -{"qid": "genQ8", "pos": ["120306"], "neg": {"msmarco-distilbert-base-v3": ["257122", "511432", "182744", "460230", "316535", "281423", "35856", "78297", "214079"], "msmarco-MiniLM-L-6-v3": ["511432", "316535", "257122", "78297", "281423", "460230", "35856", "182744", "214079"]}} -{"qid": "genQ9", "pos": ["120306"], "neg": {"msmarco-distilbert-base-v3": ["182744", "257122", "511432", "281423", "460230", "316535", "35856", "214079", "78297"], "msmarco-MiniLM-L-6-v3": ["257122", "511432", "35856", "182744", "316535", "214079", "78297", "281423", "460230"]}} -{"qid": "genQ10", "pos": ["35856"], "neg": {"msmarco-distilbert-base-v3": ["460230", "281423", "214079", "316535", "257122", "511432", "182744", "78297", "120306"], "msmarco-MiniLM-L-6-v3": ["257122", "316535", "460230", "78297", "214079", "281423", "511432", "182744", "120306"]}} -{"qid": "genQ11", "pos": ["35856"], "neg": {"msmarco-distilbert-base-v3": ["214079", "182744", "460230", "257122", "316535", "511432", "281423", "78297", "120306"], "msmarco-MiniLM-L-6-v3": ["257122", "182744", "511432", "316535", "460230", "120306", "214079", "281423", "78297"]}} -{"qid": "genQ12", "pos": ["35856"], "neg": {"msmarco-distilbert-base-v3": ["214079", "460230", "182744", "257122", "511432", "316535", "281423", "78297", "120306"], "msmarco-MiniLM-L-6-v3": ["257122", "182744", "511432", "316535", "120306", "460230", "214079", "281423", "78297"]}} -{"qid": "genQ13", "pos": ["214079"], "neg": {"msmarco-distilbert-base-v3": ["257122", "511432", "35856", "182744", "460230", "120306", "316535", "281423", "78297"], "msmarco-MiniLM-L-6-v3": ["257122", "511432", "316535", "35856", "182744", "281423", "120306", "460230", "78297"]}} -{"qid": "genQ14", "pos": ["214079"], "neg": {"msmarco-distilbert-base-v3": ["35856", "316535", "460230", "78297", "182744", "511432", "257122", "281423", "120306"], "msmarco-MiniLM-L-6-v3": ["460230", "281423", "182744", "257122", "511432", "120306", "316535", "35856", "78297"]}} -{"qid": "genQ15", "pos": ["214079"], "neg": {"msmarco-distilbert-base-v3": ["35856", "460230", "78297", "316535", "182744", "281423", "511432", "257122", "120306"], "msmarco-MiniLM-L-6-v3": ["257122", "78297", "460230", "120306", "316535", "182744", "511432", "35856", "281423"]}} -{"qid": "genQ16", "pos": ["316535"], "neg": {"msmarco-distilbert-base-v3": ["511432", "281423", "257122", "460230", "214079", "182744", "78297", "35856", "120306"], "msmarco-MiniLM-L-6-v3": ["511432", "78297", "120306", "281423", "460230", "182744", "257122", "214079", "35856"]}} -{"qid": "genQ17", "pos": ["316535"], "neg": {"msmarco-distilbert-base-v3": ["511432", "281423", "257122", "35856", "182744", "460230", "214079", "78297", "120306"], "msmarco-MiniLM-L-6-v3": ["511432", "281423", "78297", "214079", "120306", "460230", "182744", "257122", "35856"]}} -{"qid": "genQ18", "pos": ["316535"], "neg": {"msmarco-distilbert-base-v3": ["281423", "511432", "460230", "257122", "214079", "182744", "35856", "78297", "120306"], "msmarco-MiniLM-L-6-v3": ["511432", "281423", "460230", "257122", "120306", "78297", "35856", "214079", "182744"]}} -{"qid": "genQ19", "pos": ["511432"], "neg": {"msmarco-distilbert-base-v3": ["316535", "281423", "257122", "35856", "182744", "460230", "120306", "78297", "214079"], "msmarco-MiniLM-L-6-v3": ["281423", "257122", "35856", "316535", "182744", "78297", "120306", "460230", "214079"]}} -{"qid": "genQ20", "pos": ["511432"], "neg": {"msmarco-distilbert-base-v3": ["316535", "257122", "460230", "35856", "281423", "120306", "182744", "214079", "78297"], "msmarco-MiniLM-L-6-v3": ["316535", "120306", "78297", "281423", "214079", "257122", "182744", "35856", "460230"]}} -{"qid": "genQ21", "pos": ["511432"], "neg": {"msmarco-distilbert-base-v3": ["281423", "257122", "78297", "316535", "120306", "460230", "35856", "182744", "214079"], "msmarco-MiniLM-L-6-v3": ["281423", "316535", "182744", "257122", "120306", "35856", "78297", "460230", "214079"]}} -{"qid": "genQ22", "pos": ["78297"], "neg": {"msmarco-distilbert-base-v3": ["182744", "35856", "316535", "281423", "120306", "511432", "257122", "460230", "214079"], "msmarco-MiniLM-L-6-v3": ["511432", "120306", "316535", "281423", "182744", "214079", "35856", "460230", "257122"]}} -{"qid": "genQ23", "pos": ["78297"], "neg": {"msmarco-distilbert-base-v3": ["281423", "182744", "511432", "35856", "257122", "316535", "120306", "460230", "214079"], "msmarco-MiniLM-L-6-v3": ["316535", "511432", "120306", "214079", "281423", "182744", "35856", "460230", "257122"]}} -{"qid": "genQ24", "pos": ["78297"], "neg": {"msmarco-distilbert-base-v3": ["182744", "281423", "120306", "316535", "35856", "257122", "511432", "460230", "214079"], "msmarco-MiniLM-L-6-v3": ["511432", "316535", "120306", "281423", "182744", "214079", "35856", "460230", "257122"]}} -{"qid": "genQ25", "pos": ["182744"], "neg": {"msmarco-distilbert-base-v3": ["257122", "511432", "316535", "35856", "78297", "120306", "281423", "214079", "460230"], "msmarco-MiniLM-L-6-v3": ["281423", "257122", "120306", "78297", "511432", "35856", "316535", "460230", "214079"]}} -{"qid": "genQ26", "pos": ["182744"], "neg": {"msmarco-distilbert-base-v3": ["257122", "35856", "511432", "78297", "316535", "120306", "214079", "281423", "460230"], "msmarco-MiniLM-L-6-v3": ["281423", "257122", "120306", "78297", "511432", "35856", "460230", "316535", "214079"]}} -{"qid": "genQ27", "pos": ["182744"], "neg": {"msmarco-distilbert-base-v3": ["257122", "316535", "35856", "511432", "281423", "214079", "78297", "120306", "460230"], "msmarco-MiniLM-L-6-v3": ["35856", "511432", "281423", "214079", "120306", "316535", "257122", "78297", "460230"]}} -{"qid": "genQ28", "pos": ["281423"], "neg": {"msmarco-distilbert-base-v3": ["460230", "511432", "316535", "257122", "35856", "182744", "120306", "78297", "214079"], "msmarco-MiniLM-L-6-v3": ["316535", "511432", "460230", "120306", "78297", "182744", "257122", "214079", "35856"]}} -{"qid": "genQ29", "pos": ["281423"], "neg": {"msmarco-distilbert-base-v3": ["257122", "511432", "460230", "316535", "182744", "35856", "120306", "78297", "214079"], "msmarco-MiniLM-L-6-v3": ["316535", "511432", "120306", "78297", "460230", "182744", "214079", "257122", "35856"]}} -{"qid": "genQ30", "pos": ["281423"], "neg": {"msmarco-distilbert-base-v3": ["257122", "511432", "460230", "316535", "78297", "120306", "182744", "35856", "214079"], "msmarco-MiniLM-L-6-v3": ["511432", "316535", "78297", "460230", "120306", "182744", "214079", "257122", "35856"]}} +{"qid": "genQ1", "pos": ["454072"], "neg": {"msmarco-distilbert-base-v3": ["122794", "585889", "150252", "523755", "93353", "269846", "479420", "375658", "411906"], "msmarco-MiniLM-L-6-v3": ["122794", "150252", "585889", "411906", "269846", "523755", "479420", "93353", "375658"]}} +{"qid": "genQ2", "pos": ["454072"], "neg": {"msmarco-distilbert-base-v3": ["122794", "585889", "269846", "150252", "93353", "523755", "375658", "479420", "411906"], "msmarco-MiniLM-L-6-v3": ["122794", "150252", "269846", "411906", "585889", "523755", "93353", "479420", "375658"]}} +{"qid": "genQ3", "pos": ["454072"], "neg": {"msmarco-distilbert-base-v3": ["122794", "585889", "523755", "479420", "269846", "150252", "93353", "375658", "411906"], "msmarco-MiniLM-L-6-v3": ["122794", "150252", "585889", "411906", "269846", "523755", "479420", "93353", "375658"]}} +{"qid": "genQ4", "pos": ["150252"], "neg": {"msmarco-distilbert-base-v3": ["411906", "454072", "585889", "375658", "269846", "523755", "122794", "93353", "479420"], "msmarco-MiniLM-L-6-v3": ["411906", "585889", "454072", "122794", "375658", "479420", "269846", "93353", "523755"]}} +{"qid": "genQ5", "pos": ["150252"], "neg": {"msmarco-distilbert-base-v3": ["411906", "122794", "269846", "479420", "585889", "454072", "523755", "375658", "93353"], "msmarco-MiniLM-L-6-v3": ["411906", "122794", "523755", "479420", "375658", "585889", "93353", "269846", "454072"]}} +{"qid": "genQ6", "pos": ["150252"], "neg": {"msmarco-distilbert-base-v3": ["411906", "454072", "523755", "375658", "122794", "269846", "585889", "479420", "93353"], "msmarco-MiniLM-L-6-v3": ["411906", "585889", "454072", "269846", "523755", "479420", "122794", "375658", "93353"]}} +{"qid": "genQ7", "pos": ["375658"], "neg": {"msmarco-distilbert-base-v3": ["93353", "454072", "479420", "523755", "411906", "150252", "269846", "122794", "585889"], "msmarco-MiniLM-L-6-v3": ["93353", "454072", "523755", "411906", "150252", "122794", "585889", "269846", "479420"]}} +{"qid": "genQ8", "pos": ["375658"], "neg": {"msmarco-distilbert-base-v3": ["269846", "411906", "454072", "523755", "93353", "122794", "150252", "479420", "585889"], "msmarco-MiniLM-L-6-v3": ["454072", "93353", "523755", "411906", "269846", "150252", "122794", "585889", "479420"]}} +{"qid": "genQ9", "pos": ["375658"], "neg": {"msmarco-distilbert-base-v3": ["269846", "411906", "454072", "150252", "122794", "523755", "479420", "93353", "585889"], "msmarco-MiniLM-L-6-v3": ["454072", "269846", "122794", "523755", "93353", "411906", "150252", "585889", "479420"]}} +{"qid": "genQ10", "pos": ["93353"], "neg": {"msmarco-distilbert-base-v3": ["585889", "479420", "122794", "269846", "523755", "454072", "150252", "375658", "411906"], "msmarco-MiniLM-L-6-v3": ["585889", "375658", "269846", "454072", "479420", "122794", "523755", "150252", "411906"]}} +{"qid": "genQ11", "pos": ["93353"], "neg": {"msmarco-distilbert-base-v3": ["479420", "585889", "269846", "122794", "523755", "411906", "150252", "375658", "454072"], "msmarco-MiniLM-L-6-v3": ["585889", "375658", "269846", "479420", "122794", "523755", "454072", "411906", "150252"]}} +{"qid": "genQ12", "pos": ["93353"], "neg": {"msmarco-distilbert-base-v3": ["585889", "479420", "523755", "122794", "269846", "150252", "454072", "411906", "375658"], "msmarco-MiniLM-L-6-v3": ["585889", "269846", "479420", "375658", "122794", "523755", "150252", "454072", "411906"]}} +{"qid": "genQ13", "pos": ["411906"], "neg": {"msmarco-distilbert-base-v3": ["150252", "269846", "375658", "479420", "454072", "122794", "585889", "523755", "93353"], "msmarco-MiniLM-L-6-v3": ["150252", "122794", "479420", "585889", "269846", "454072", "375658", "93353", "523755"]}} +{"qid": "genQ14", "pos": ["411906"], "neg": {"msmarco-distilbert-base-v3": ["150252", "479420", "454072", "523755", "122794", "375658", "269846", "585889", "93353"], "msmarco-MiniLM-L-6-v3": ["150252", "479420", "122794", "523755", "269846", "454072", "585889", "375658", "93353"]}} +{"qid": "genQ15", "pos": ["411906"], "neg": {"msmarco-distilbert-base-v3": ["150252", "479420", "585889", "523755", "269846", "454072", "375658", "122794", "93353"], "msmarco-MiniLM-L-6-v3": ["150252", "585889", "122794", "523755", "479420", "269846", "454072", "93353", "375658"]}} +{"qid": "genQ16", "pos": ["269846"], "neg": {"msmarco-distilbert-base-v3": ["375658", "411906", "122794", "150252", "523755", "585889", "479420", "454072", "93353"], "msmarco-MiniLM-L-6-v3": ["479420", "523755", "411906", "585889", "122794", "375658", "454072", "150252", "93353"]}} +{"qid": "genQ17", "pos": ["269846"], "neg": {"msmarco-distilbert-base-v3": ["122794", "479420", "411906", "375658", "585889", "523755", "454072", "150252", "93353"], "msmarco-MiniLM-L-6-v3": ["411906", "479420", "585889", "122794", "454072", "523755", "150252", "93353", "375658"]}} +{"qid": "genQ18", "pos": ["269846"], "neg": {"msmarco-distilbert-base-v3": ["585889", "411906", "122794", "375658", "523755", "479420", "93353", "150252", "454072"], "msmarco-MiniLM-L-6-v3": ["523755", "122794", "479420", "375658", "585889", "93353", "454072", "411906", "150252"]}} +{"qid": "genQ19", "pos": ["523755"], "neg": {"msmarco-distilbert-base-v3": ["269846", "454072", "479420", "122794", "93353", "411906", "150252", "375658", "585889"], "msmarco-MiniLM-L-6-v3": ["479420", "269846", "122794", "454072", "411906", "375658", "585889", "150252", "93353"]}} +{"qid": "genQ20", "pos": ["523755"], "neg": {"msmarco-distilbert-base-v3": ["269846", "479420", "122794", "375658", "454072", "411906", "93353", "150252", "585889"], "msmarco-MiniLM-L-6-v3": ["269846", "479420", "122794", "585889", "150252", "454072", "411906", "93353", "375658"]}} +{"qid": "genQ21", "pos": ["523755"], "neg": {"msmarco-distilbert-base-v3": ["479420", "269846", "122794", "375658", "411906", "454072", "150252", "93353", "585889"], "msmarco-MiniLM-L-6-v3": ["479420", "269846", "122794", "585889", "150252", "411906", "375658", "454072", "93353"]}} +{"qid": "genQ22", "pos": ["122794"], "neg": {"msmarco-distilbert-base-v3": ["585889", "479420", "269846", "454072", "93353", "375658", "523755", "411906", "150252"], "msmarco-MiniLM-L-6-v3": ["454072", "479420", "93353", "375658", "585889", "269846", "523755", "411906", "150252"]}} +{"qid": "genQ23", "pos": ["122794"], "neg": {"msmarco-distilbert-base-v3": ["479420", "93353", "585889", "454072", "375658", "269846", "150252", "523755", "411906"], "msmarco-MiniLM-L-6-v3": ["479420", "454072", "411906", "585889", "269846", "523755", "93353", "375658", "150252"]}} +{"qid": "genQ24", "pos": ["122794"], "neg": {"msmarco-distilbert-base-v3": ["585889", "454072", "93353", "150252", "269846", "479420", "523755", "375658", "411906"], "msmarco-MiniLM-L-6-v3": ["411906", "454072", "479420", "93353", "375658", "585889", "269846", "150252", "523755"]}} +{"qid": "genQ25", "pos": ["479420"], "neg": {"msmarco-distilbert-base-v3": ["122794", "454072", "411906", "523755", "585889", "269846", "93353", "375658", "150252"], "msmarco-MiniLM-L-6-v3": ["122794", "454072", "523755", "93353", "411906", "375658", "150252", "269846", "585889"]}} +{"qid": "genQ26", "pos": ["479420"], "neg": {"msmarco-distilbert-base-v3": ["122794", "585889", "269846", "411906", "150252", "375658", "454072", "523755", "93353"], "msmarco-MiniLM-L-6-v3": ["122794", "454072", "523755", "375658", "269846", "411906", "585889", "150252", "93353"]}} +{"qid": "genQ27", "pos": ["479420"], "neg": {"msmarco-distilbert-base-v3": ["122794", "269846", "375658", "523755", "454072", "411906", "93353", "585889", "150252"], "msmarco-MiniLM-L-6-v3": ["523755", "122794", "454072", "269846", "411906", "93353", "375658", "150252", "585889"]}} +{"qid": "genQ28", "pos": ["585889"], "neg": {"msmarco-distilbert-base-v3": ["454072", "122794", "375658", "93353", "150252", "269846", "479420", "411906", "523755"], "msmarco-MiniLM-L-6-v3": ["93353", "454072", "122794", "375658", "150252", "269846", "479420", "523755", "411906"]}} +{"qid": "genQ29", "pos": ["585889"], "neg": {"msmarco-distilbert-base-v3": ["93353", "122794", "150252", "454072", "375658", "479420", "523755", "269846", "411906"], "msmarco-MiniLM-L-6-v3": ["122794", "93353", "150252", "454072", "479420", "269846", "411906", "523755", "375658"]}} +{"qid": "genQ30", "pos": ["585889"], "neg": {"msmarco-distilbert-base-v3": ["122794", "93353", "454072", "375658", "269846", "523755", "150252", "411906", "479420"], "msmarco-MiniLM-L-6-v3": ["454072", "411906", "150252", "93353", "269846", "122794", "523755", "479420", "375658"]}} diff --git a/sample-data/generated/fiqa/qgen-qrels/train.tsv b/sample-data/generated/fiqa/qgen-qrels/train.tsv index 82040c3..0858aad 100644 --- a/sample-data/generated/fiqa/qgen-qrels/train.tsv +++ b/sample-data/generated/fiqa/qgen-qrels/train.tsv @@ -1,31 +1,31 @@ query-id corpus-id score -genQ1 460230 1 -genQ2 460230 1 -genQ3 460230 1 -genQ4 257122 1 -genQ5 257122 1 -genQ6 257122 1 -genQ7 120306 1 -genQ8 120306 1 -genQ9 120306 1 -genQ10 35856 1 -genQ11 35856 1 -genQ12 35856 1 -genQ13 214079 1 -genQ14 214079 1 -genQ15 214079 1 -genQ16 316535 1 -genQ17 316535 1 -genQ18 316535 1 -genQ19 511432 1 -genQ20 511432 1 -genQ21 511432 1 -genQ22 78297 1 -genQ23 78297 1 -genQ24 78297 1 -genQ25 182744 1 -genQ26 182744 1 -genQ27 182744 1 -genQ28 281423 1 -genQ29 281423 1 -genQ30 281423 1 +genQ1 454072 1 +genQ2 454072 1 +genQ3 454072 1 +genQ4 150252 1 +genQ5 150252 1 +genQ6 150252 1 +genQ7 375658 1 +genQ8 375658 1 +genQ9 375658 1 +genQ10 93353 1 +genQ11 93353 1 +genQ12 93353 1 +genQ13 411906 1 +genQ14 411906 1 +genQ15 411906 1 +genQ16 269846 1 +genQ17 269846 1 +genQ18 269846 1 +genQ19 523755 1 +genQ20 523755 1 +genQ21 523755 1 +genQ22 122794 1 +genQ23 122794 1 +genQ24 122794 1 +genQ25 479420 1 +genQ26 479420 1 +genQ27 479420 1 +genQ28 585889 1 +genQ29 585889 1 +genQ30 585889 1 diff --git a/sample-data/generated/fiqa/qgen-queries.jsonl b/sample-data/generated/fiqa/qgen-queries.jsonl index b18d489..c831a11 100644 --- a/sample-data/generated/fiqa/qgen-queries.jsonl +++ b/sample-data/generated/fiqa/qgen-queries.jsonl @@ -1,30 +1,30 @@ -{"_id": "genQ1", "text": "what is the difference between large cap companies and smaller cap", "metadata": {}} -{"_id": "genQ2", "text": "what makes a company the best", "metadata": {}} -{"_id": "genQ3", "text": "what is the difference between big cap and small cap", "metadata": {}} -{"_id": "genQ4", "text": "is it better to turn solar or coal to renewable energy", "metadata": {}} -{"_id": "genQ5", "text": "do solar panels stop subsidies to oil", "metadata": {}} -{"_id": "genQ6", "text": "what year did power africa start", "metadata": {}} -{"_id": "genQ7", "text": "which taxes were highly regressive", "metadata": {}} -{"_id": "genQ8", "text": "how were income tax rates higher in the 1970s", "metadata": {}} -{"_id": "genQ9", "text": "when were state sales taxes regressive", "metadata": {}} -{"_id": "genQ10", "text": "why do you hire people", "metadata": {}} -{"_id": "genQ11", "text": "what do you think the problem is?", "metadata": {}} -{"_id": "genQ12", "text": "what do you think the problem is", "metadata": {}} -{"_id": "genQ13", "text": "what to say with flowers in calgary", "metadata": {}} -{"_id": "genQ14", "text": "what is flowers?", "metadata": {}} -{"_id": "genQ15", "text": "what is the best flower to say", "metadata": {}} -{"_id": "genQ16", "text": "how long does it take for a loan to be approved", "metadata": {}} -{"_id": "genQ17", "text": "how long do you have to wait before sending money", "metadata": {}} -{"_id": "genQ18", "text": "how long does it take for loan to close if you have a position on loan club", "metadata": {}} -{"_id": "genQ19", "text": "what does it mean if you move money to australia and lock in rate?", "metadata": {}} -{"_id": "genQ20", "text": "if i move to australia how long will it take to be in us", "metadata": {}} -{"_id": "genQ21", "text": "can you lock in exchange rates", "metadata": {}} -{"_id": "genQ22", "text": "what is the swift code for paypal", "metadata": {}} -{"_id": "genQ23", "text": "paypal credit card credit card swift code", "metadata": {}} -{"_id": "genQ24", "text": "what is paypal swift code", "metadata": {}} -{"_id": "genQ25", "text": "who never said no money no healthcare", "metadata": {}} -{"_id": "genQ26", "text": "who said no money no healthcare", "metadata": {}} -{"_id": "genQ27", "text": "what mr pence said about trumpcare", "metadata": {}} -{"_id": "genQ28", "text": "how much does foliofn charge for a stock", "metadata": {}} -{"_id": "genQ29", "text": "how much does foliofn pay a year", "metadata": {}} -{"_id": "genQ30", "text": "how much does foliofn cost", "metadata": {}} +{"_id": "genQ1", "text": "convert apr to effective rate", "metadata": {}} +{"_id": "genQ2", "text": "is the apr an effective rate", "metadata": {}} +{"_id": "genQ3", "text": "how to convert apr rate into effective rate", "metadata": {}} +{"_id": "genQ4", "text": "how many months can you intern for paid", "metadata": {}} +{"_id": "genQ5", "text": "if you are a student can you get internships", "metadata": {}} +{"_id": "genQ6", "text": "how long does it take to get paid intern position in usa", "metadata": {}} +{"_id": "genQ7", "text": "what is a basic ass cellphone", "metadata": {}} +{"_id": "genQ8", "text": "can you get a basic ass phone", "metadata": {}} +{"_id": "genQ9", "text": "are cell phones cheap", "metadata": {}} +{"_id": "genQ10", "text": "why is religion protected", "metadata": {}} +{"_id": "genQ11", "text": "why is religion not protected", "metadata": {}} +{"_id": "genQ12", "text": "does religion really need to be protected", "metadata": {}} +{"_id": "genQ13", "text": "can a vc internship be paid", "metadata": {}} +{"_id": "genQ14", "text": "how to get an internship at vc", "metadata": {}} +{"_id": "genQ15", "text": "do i need an internship to be an entrepreneur", "metadata": {}} +{"_id": "genQ16", "text": "do factories lose their jobs", "metadata": {}} +{"_id": "genQ17", "text": "why was it important to lose manufacturing", "metadata": {}} +{"_id": "genQ18", "text": "why did ross perot argue it did not matter to manufacturers", "metadata": {}} +{"_id": "genQ19", "text": "what is the best online shopping company", "metadata": {}} +{"_id": "genQ20", "text": "what is the best buy?", "metadata": {}} +{"_id": "genQ21", "text": "where is best buy", "metadata": {}} +{"_id": "genQ22", "text": "why does group resp differ from pooled", "metadata": {}} +{"_id": "genQ23", "text": "are group esps transferable", "metadata": {}} +{"_id": "genQ24", "text": "difference between group esp and rrsp", "metadata": {}} +{"_id": "genQ25", "text": "what is etf and mutual fund trading", "metadata": {}} +{"_id": "genQ26", "text": "how does mutual funds work", "metadata": {}} +{"_id": "genQ27", "text": "which etf has no track of stock price", "metadata": {}} +{"_id": "genQ28", "text": "what is pre tax contribution limitation", "metadata": {}} +{"_id": "genQ29", "text": "what is the legal age limit for erisa", "metadata": {}} +{"_id": "genQ30", "text": "erisa hr department responsibilities", "metadata": {}} diff --git a/setup.py b/setup.py index df3ee9f..d5c1067 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="gpl", - version="0.0.8", + version="0.0.9", author="Kexin Wang", author_email="kexin.wang.2049@gmail.com", description="GPL is an unsupervised domain adaptation method for training dense retrievers. It is based on query generation and pseudo labeling with powerful cross-encoders. To train a domain-adapted model, it needs only the unlabeled target corpus and can achieve significant improvement over zero-shot models.",